In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import sys
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from clean_data import clean_text

# Seed shared by every stochastic step in this cell so Restart & Run All reproduces the same model
RANDOM_STATE = 42

# Get the current working directory
current_dir = os.getcwd()

# Project root, assuming the notebook is two levels deep in the folder structure
project_root = os.path.abspath(os.path.join(current_dir, '..', '..'))

# Add the project root to the Python path (guarded so re-running the cell does not append duplicates)
# NOTE(review): `clean_text` is imported above this line, so this append cannot be what makes
# that import resolvable — confirm whether it is still needed.
if project_root not in sys.path:
    sys.path.append(project_root)

# Load the dataset from <project_root>/datasets/trainEN.csv
csv_path = os.path.join(project_root, 'datasets', 'trainEN.csv')
df = pd.read_csv(csv_path)

# Combine title and text. Missing values are replaced with '' first: string concatenation with
# NaN yields NaN, which would discard the other column's content and hand a float to clean_text.
df['combined_text'] = df['title'].fillna('') + " " + df['text'].fillna('')
df['clean_text'] = df['combined_text'].apply(clean_text)

# Features (cleaned text) and target
X = df['clean_text']
y = df['label']

# Split the dataset into training and testing sets (70% / 30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)

# Fit the TF-IDF vocabulary on the training split only, then apply it to the test split
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, min_df=3, max_df=0.7, ngram_range=(1, 2))
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Initialize the SVC model and the dimensionality reducer.
# The reducer is TruncatedSVD (works directly on the sparse TF-IDF matrix); the `pca` name is
# kept because later cells reference `X_train_pca`.
# Both estimators are seeded: TruncatedSVD's default solver is randomized, and SVC's probability
# calibration shuffles the data internally.
clf = SVC(C=19.0011, gamma=1.50001, kernel='rbf', probability=True, random_state=RANDOM_STATE)
pca = TruncatedSVD(n_components=3, random_state=RANDOM_STATE)  # 3 components for 3D visualization

# Fit the reducer on the training data only, then project both splits
X_train_pca = pca.fit_transform(X_train_vectorized)
X_test_pca = pca.transform(X_test_vectorized)

# Fit the model on the reduced training data
clf.fit(X_train_pca, y_train)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MCA01\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MCA01\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:

# Apply t-SNE for visualization with 3 components
perplexity = 50  # Adjust this value as needed
X_train_embedded = TSNE(n_components=3, perplexity=perplexity, random_state=42).fit_transform(X_train_pca)

# `clf` was fitted on the SVD components (X_train_pca), not on t-SNE coordinates, so scoring
# points of the embedded space with it is meaningless. The grid below lives in the t-SNE space,
# therefore a visualization-only SVC is fitted directly on the embedded training points.
# gamma='scale' is used instead of clf's gamma because that value was chosen for the SVD
# features, whose scale differs from the t-SNE output.
clf_embedded = SVC(C=clf.C, kernel=clf.kernel, gamma='scale', probability=True, random_state=42)
clf_embedded.fit(X_train_embedded, y_train)

# Create a 3D plot
fig = go.Figure()

# Add the embedded training points to the plot
fig.add_trace(go.Scatter3d(
    x=X_train_embedded[:, 0],
    y=X_train_embedded[:, 1],
    z=X_train_embedded[:, 2],
    mode='markers',
    marker=dict(
        size=4,
        color=y_train,  # Color by class label
        colorscale=['black', 'white'],  # Black for 0 and White for 1
        opacity=0.8
    )
))

# One evenly spaced axis per embedded dimension, spanning the range of the data.
# Computed once and reused for both the decision volume and the best-fit plane.
grid_points_per_axis = 50
x_axis = np.linspace(X_train_embedded[:, 0].min(), X_train_embedded[:, 0].max(), grid_points_per_axis)
y_axis = np.linspace(X_train_embedded[:, 1].min(), X_train_embedded[:, 1].max(), grid_points_per_axis)
z_axis = np.linspace(X_train_embedded[:, 2].min(), X_train_embedded[:, 2].max(), grid_points_per_axis)

# Generate a grid for the decision boundary
xx, yy, zz = np.meshgrid(x_axis, y_axis, z_axis)

grid = np.c_[xx.ravel(), yy.ravel(), zz.ravel()]
# Probability of the second class in clf_embedded.classes_ at every grid point
decision_function = clf_embedded.predict_proba(grid)[:, 1]
decision_function = decision_function.reshape(xx.shape)

fig.add_trace(go.Volume(
    x=xx.flatten(),
    y=yy.flatten(),
    z=zz.flatten(),
    value=decision_function.flatten(),
    isomin=0.5,  # Threshold for decision boundary
    isomax=0.5,
    opacity=0.1,  # Adjust the opacity for better visualization
    surface_count=1,  # Number of isosurfaces
    colorscale='Greys'
))

# Fit a plane z = C[0]*x + C[1]*y + C[2] to the embedded points by least squares
A = np.c_[X_train_embedded[:, 0], X_train_embedded[:, 1], np.ones(X_train_embedded.shape[0])]
C, _, _, _ = np.linalg.lstsq(A, X_train_embedded[:, 2], rcond=None)

# Create a grid for the plane over the same x/y extent as the data
x_plane, y_plane = np.meshgrid(x_axis, y_axis)
z_plane = C[0] * x_plane + C[1] * y_plane + C[2]

# Add the plane to the plot
fig.add_trace(go.Surface(x=x_plane, y=y_plane, z=z_plane, colorscale='Viridis', opacity=0.5))

# Update layout for better view
fig.update_layout(scene=dict(
    xaxis_title='Component 1',
    yaxis_title='Component 2',
    zaxis_title='Component 3'
), title='3D Hyperplane of SVM on t-SNE Transformed Data with Best Fit Plane')

# Save the figure as a JSON file
with open("plotly_figure_with_plane.json", "w") as f:
    f.write(fig.to_json())


In [3]:
import json
import plotly.graph_objects as go

# Serialize the figure built in the previous cell and write it to a second JSON file
figure_json = fig.to_json()
with open("plotly_figure2.json", "w") as f:
    f.write(figure_json)
