In [None]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.base import clone
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

# Load the dataset
# NOTE(review): a later cell reads 'reviews.csv' (comma-separated) instead of this
# tab-separated file — confirm which one is the intended input.
df = pd.read_csv('reviews.tsv', sep='\t')

# Preprocessing (if needed)
# For simplicity, let's assume the data is clean and doesn't require preprocessing.

# Feature Extraction
# X holds the raw review text, y the label to predict.
X = df['Review']
y = df['Label']

# NOTE(review): the two vectorizers and the Word2Vec model below are all fitted on the
# full corpus, i.e. before the train/test split performed further down, so vocabulary
# and IDF statistics from the test rows are part of the features.

# Bag of Words (BoW)
bow_vectorizer = CountVectorizer()
X_bow = bow_vectorizer.fit_transform(X)

# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(X)

# Word2Vec
# Whitespace tokenisation only; min_count=1 keeps every token in the vocabulary.
sentences = [review.split() for review in X]
word2vec_model = Word2Vec(sentences, min_count=1)

def word_averaging(wv, words):
    """Return the mean embedding of the tokens in ``words``.

    Parameters
    ----------
    wv : trained Word2Vec model or KeyedVectors-like object
        Either a model exposing its vectors as ``wv.wv`` or the vector table
        itself. The table must support ``token in table``, ``table[token]``
        and ``table.vector_size``.
    words : iterable
        Tokens to look up. Items that are already ``np.ndarray`` instances are
        treated as ready-made vectors and averaged as-is.

    Returns
    -------
    np.ndarray
        Element-wise mean of all resolved vectors, or a zero vector of length
        ``vector_size`` when no token could be resolved.
    """
    # A full Word2Vec model keeps its lookup table under `.wv`; KeyedVectors
    # *are* the lookup table. Resolving it once avoids the removed
    # `model.vocab` / `model[word]` accessors.
    keyed_vectors = getattr(wv, 'wv', wv)

    vectors = []
    for word in words:
        if isinstance(word, np.ndarray):
            vectors.append(word)
        elif word in keyed_vectors:
            vectors.append(keyed_vectors[word])

    if not vectors:
        # Nothing to average: return a neutral vector of the right width.
        return np.zeros(keyed_vectors.vector_size)
    return np.mean(vectors, axis=0)

def word_averaging_list(wv, text_list):
    """Average every tokenised text in ``text_list`` and stack the results.

    Each element of ``text_list`` is passed to ``word_averaging`` together with
    ``wv``; the returned vectors are stacked vertically with ``np.vstack``.
    """
    averaged_rows = []
    for tokens in text_list:
        averaged_rows.append(word_averaging(wv, tokens))
    return np.vstack(averaged_rows)

# One averaged Word2Vec vector per review.
X_word2vec = word_averaging_list(word2vec_model, sentences)

# Splitting the dataset
# A single call applies the same row permutation to every feature matrix and to
# the labels, so all train/test partitions are guaranteed to line up with
# y_train / y_test (no reliance on repeating random_state across calls).
(X_train_bow, X_test_bow,
 X_train_tfidf, X_test_tfidf,
 X_train_word2vec, X_test_word2vec,
 y_train, y_test) = train_test_split(
    X_bow, X_tfidf, X_word2vec, y, test_size=0.2, random_state=42
)

# Model Training
# Naive Bayes on the two sparse count-based representations
nb_classifier_bow = MultinomialNB()
nb_classifier_bow.fit(X_train_bow, y_train)

nb_classifier_tfidf = MultinomialNB()
nb_classifier_tfidf.fit(X_train_tfidf, y_train)

# KNN on the dense Word2Vec averages
knn_classifier_word2vec = KNeighborsClassifier(n_neighbors=5)
knn_classifier_word2vec.fit(X_train_word2vec, y_train)

# Model Evaluation
# Naive Bayes
nb_pred_bow = nb_classifier_bow.predict(X_test_bow)
nb_pred_tfidf = nb_classifier_tfidf.predict(X_test_tfidf)

# KNN
knn_pred_word2vec = knn_classifier_word2vec.predict(X_test_word2vec)

print("Naive Bayes (BoW) Accuracy:", accuracy_score(y_test, nb_pred_bow))
print("Naive Bayes (TF-IDF) Accuracy:", accuracy_score(y_test, nb_pred_tfidf))
print("KNN (Word2Vec) Accuracy:", accuracy_score(y_test, knn_pred_word2vec))

print("\nClassification Report for Naive Bayes (BoW):\n", classification_report(y_test, nb_pred_bow))
print("\nClassification Report for Naive Bayes (TF-IDF):\n", classification_report(y_test, nb_pred_tfidf))
print("\nClassification Report for KNN (Word2Vec):\n", classification_report(y_test, knn_pred_word2vec))


In [None]:
from gensim.models import Word2Vec
import numpy as np

# Load the trained Word2Vec model
# NOTE(review): this reads a model file from disk; it is a different object from the
# `word2vec_model` trained in the first cell — confirm which one should be used.
w2v = Word2Vec.load('./trained-word2vec-studentReviews.model')

def sentence_to_vector(sentence, model):
    """Average the word vectors of the whitespace-separated tokens in ``sentence``.

    Tokens that are not present in ``model.wv`` are skipped. When no token is
    found at all, a zero vector of length ``model.vector_size`` is returned.
    """
    known_vectors = []
    for token in sentence.split():
        if token in model.wv:
            known_vectors.append(model.wv[token])

    # No in-vocabulary token: nothing to average.
    if len(known_vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# Create vectors for all sentences
# NOTE(review): `X` is not assigned in this cell; presumably it is the 'Review' column
# set in the first cell — confirm before running on a fresh kernel.
X_vectors = np.array([sentence_to_vector(sentence, w2v) for sentence in X])


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Hold out 20% of the sentence vectors built in the previous cell. The split
# settings (test_size=0.2, random_state=42) mirror the ones used for the other
# feature sets so the results are comparable.
# Relies on `X_vectors` from the previous cell and on `y` / `train_test_split`
# from the first cell.
X_train_vecs, X_test_vecs, Y_train, Y_test = train_test_split(
    X_vectors, y, test_size=0.2, random_state=42
)

# Initialize the Gaussian Naive Bayes model
# (GaussianNB accepts the real-valued, possibly negative, embedding features.)
gnb = GaussianNB()

# Train the model
gnb.fit(X_train_vecs, Y_train)

# Predict on the test set
Y_pred = gnb.predict(X_test_vecs)

# Evaluate the model
accuracy = accuracy_score(Y_test, Y_pred)
print(f'Accuracy: {accuracy:.2f}')


Including graphs in your Jupyter Notebook to show the analysis and comparison between various algorithms is a great way to visualize the performance of different models. You can use libraries like matplotlib and seaborn to create these visualizations. Here is how you can add such graphs:

- **Accuracy comparison:** create a bar plot that compares the accuracy of the different models.
- **Confusion matrices:** plot a confusion matrix for each classifier to show its performance in more detail.
- **ROC curves:** plot ROC curves to compare the true-positive and false-positive rates of the classifiers.

### Example code for visualizations

First, make sure the necessary libraries are installed:

```bash
pip install matplotlib seaborn scikit-learn
```

Then use the following code to produce the visualizations:

python
Copy code
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, auc

# Assuming you have already calculated these accuracy scores
accuracy_scores = {
    'Naive Bayes (BoW)': accuracy_score(Y_test, nb_pred_bow),
    'Naive Bayes (TF-IDF)': accuracy_score(Y_test, nb_pred_tfidf),
    'Naive Bayes (Word2Vec)': accuracy_score(Y_test, nb_pred_w2v),
    'KNN (BoW)': accuracy_score(Y_test, knn_pred_bow),
    'KNN (TF-IDF)': accuracy_score(Y_test, knn_pred_tfidf),
    'KNN (Word2Vec)': accuracy_score(Y_test, knn_pred_w2v),
    # Include other models if you have added them, e.g., SVM, Random Forest
}

# Plotting the accuracy scores
plt.figure(figsize=(10, 6))
sns.barplot(x=list(accuracy_scores.keys()), y=list(accuracy_scores.values()))
plt.xlabel('Model')
plt.ylabel('Accuracy Score')
plt.title('Model Accuracy Comparison')
plt.xticks(rotation=45)
plt.show()

# Plotting confusion matrices
def plot_confusion_matrix(y_true, y_pred, title):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(title)
    plt.show()

# Plot confusion matrices for each model
plot_confusion_matrix(Y_test, nb_pred_bow, 'Confusion Matrix: Naive Bayes (BoW)')
plot_confusion_matrix(Y_test, nb_pred_tfidf, 'Confusion Matrix: Naive Bayes (TF-IDF)')
plot_confusion_matrix(Y_test, nb_pred_w2v, 'Confusion Matrix: Naive Bayes (Word2Vec)')
plot_confusion_matrix(Y_test, knn_pred_bow, 'Confusion Matrix: KNN (BoW)')
plot_confusion_matrix(Y_test, knn_pred_tfidf, 'Confusion Matrix: KNN (TF-IDF)')
plot_confusion_matrix(Y_test, knn_pred_w2v, 'Confusion Matrix: KNN (Word2Vec)')

# Plotting ROC curves
def plot_roc_curve(y_true, y_pred_proba, title):
    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc="lower right")
    plt.show()

# For ROC curves, you need predicted probabilities, not labels
# Assuming the classifiers support the `predict_proba` method
# If not, use `decision_function` method for classifiers like SVM

# Plot ROC curves for each model
plot_roc_curve(Y_test, nb_bow.predict_proba(X_test_bow)[:, 1], 'ROC Curve: Naive Bayes (BoW)')
plot_roc_curve(Y_test, nb_tfidf.predict_proba(X_test_tfidf)[:, 1], 'ROC Curve: Naive Bayes (TF-IDF)')
plot_roc_curve(Y_test, nb_w2v.predict_proba(X_test_w2v_scaled)[:, 1], 'ROC Curve: Naive Bayes (Word2Vec)')
plot_roc_curve(Y_test, knn_bow.predict_proba(X_test_bow_reduced)[:, 1], 'ROC Curve: KNN (BoW)')
plot_roc_curve(Y_test, knn_tfidf.predict_proba(X_test_tfidf_reduced)[:, 1], 'ROC Curve: KNN (TF-IDF)')
plot_roc_curve(Y_test, knn_w2v.predict_proba(X_test_w2v_scaled)[:, 1], 'ROC Curve: KNN (Word2Vec)')

### Explanation

- **Bar plot of accuracy scores:** `sns.barplot` draws one bar per model; the x-axis lists the models and the y-axis shows their accuracy scores.
- **Confusion matrix:** `confusion_matrix` from `sklearn.metrics` computes the matrix and `sns.heatmap` plots it.
- **ROC curves:** `roc_curve` and `auc` from `sklearn.metrics` compute the ROC curve and the area under it (AUC), and one plot is generated per model. Note that `roc_curve` is defined for a binary target: with a multi-class label column, each class has to be evaluated separately (one-vs-rest) against the predicted probability of that class.

By adding these visualizations, you can better understand and compare the performance of your different models. Adjust the code as needed for your specific models and data.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, auc

# Assuming you have already calculated these accuracy scores
# NOTE(review): the prediction arrays `nb_pred_w2v`, `knn_pred_bow`, `knn_pred_tfidf`
# and `knn_pred_w2v` are not assigned in any cell visible in this notebook (the first
# cell produces `nb_pred_bow`, `nb_pred_tfidf` and `knn_pred_word2vec`) — confirm they
# exist before running. `Y_test` must be the label vector those predictions were made
# for; the first cell calls its labels `y_test` — confirm they are the same split.
accuracy_scores = {
    'Naive Bayes (BoW)': accuracy_score(Y_test, nb_pred_bow),
    'Naive Bayes (TF-IDF)': accuracy_score(Y_test, nb_pred_tfidf),
    'Naive Bayes (Word2Vec)': accuracy_score(Y_test, nb_pred_w2v),
    'KNN (BoW)': accuracy_score(Y_test, knn_pred_bow),
    'KNN (TF-IDF)': accuracy_score(Y_test, knn_pred_tfidf),
    'KNN (Word2Vec)': accuracy_score(Y_test, knn_pred_w2v),
    # Include other models if you have added them, e.g., SVM, Random Forest
}

# Plotting the accuracy scores
# One bar per dictionary entry: keys become x-axis categories, values the bar heights.
plt.figure(figsize=(10, 6))
sns.barplot(x=list(accuracy_scores.keys()), y=list(accuracy_scores.values()))
plt.xlabel('Model')
plt.ylabel('Accuracy Score')
plt.title('Model Accuracy Comparison')
plt.xticks(rotation=45)  # rotate the model names on the x-axis by 45 degrees
plt.show()

# Plotting confusion matrices
def plot_confusion_matrix(y_true, y_pred, title, labels=None):
    """Draw an annotated heat map of the confusion matrix for one classifier.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    title : str
        Figure title.
    labels : array-like, optional
        Class labels fixing the row/column order of the matrix. When omitted,
        the sorted set of labels occurring in ``y_true`` or ``y_pred`` is used,
        which is the order ``confusion_matrix`` applies by default.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    if labels is None:
        labels = np.unique(np.concatenate([np.asarray(y_true), np.asarray(y_pred)]))
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    plt.figure(figsize=(8, 6))
    # Tick both axes with the actual class labels; without this the heat map
    # shows positions 0..n-1, which is misleading when labels are e.g. 1..5.
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(title)
    plt.show()

# Plot confusion matrices for each model
# NOTE(review): `nb_pred_w2v`, `knn_pred_bow`, `knn_pred_tfidf` and `knn_pred_w2v` are
# not assigned in any cell visible in this notebook — confirm they exist before running.
plot_confusion_matrix(Y_test, nb_pred_bow, 'Confusion Matrix: Naive Bayes (BoW)')
plot_confusion_matrix(Y_test, nb_pred_tfidf, 'Confusion Matrix: Naive Bayes (TF-IDF)')
plot_confusion_matrix(Y_test, nb_pred_w2v, 'Confusion Matrix: Naive Bayes (Word2Vec)')
plot_confusion_matrix(Y_test, knn_pred_bow, 'Confusion Matrix: KNN (BoW)')
plot_confusion_matrix(Y_test, knn_pred_tfidf, 'Confusion Matrix: KNN (TF-IDF)')
plot_confusion_matrix(Y_test, knn_pred_w2v, 'Confusion Matrix: KNN (Word2Vec)')

# Plotting ROC curves
def plot_roc_curve(y_true, y_pred_proba, title, pos_label=None):
    """Plot the ROC curve and its AUC for one set of scores.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred_proba : array-like
        Score or probability of the positive class for every sample.
    title : str
        Figure title.
    pos_label : optional
        Label treated as the positive class; forwarded to ``roc_curve``.
        Leave as ``None`` for targets coded {0, 1} or {-1, 1}. For any other
        coding (for instance ratings 1..5) pass the class of interest and
        supply that class's scores in ``y_pred_proba`` to obtain a
        one-vs-rest curve.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    fpr, tpr, _ = roc_curve(y_true, y_pred_proba, pos_label=pos_label)
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
    # Diagonal reference line: where the true-positive and false-positive rates are equal.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc="lower right")
    plt.show()

# For ROC curves, you need predicted probabilities, not labels
# Assuming the classifiers support the `predict_proba` method
# If not, use `decision_function` method for classifiers like SVM
# NOTE(review): the estimators (`nb_bow`, `nb_tfidf`, `nb_w2v`, `knn_bow`, `knn_tfidf`,
# `knn_w2v`) and the matrices `X_test_w2v_scaled`, `X_test_bow_reduced` and
# `X_test_tfidf_reduced` are not assigned in any cell visible in this notebook — confirm
# they exist before running.
# `[:, 1]` takes the second column of each `predict_proba` output.

# Plot ROC curves for each model
plot_roc_curve(Y_test, nb_bow.predict_proba(X_test_bow)[:, 1], 'ROC Curve: Naive Bayes (BoW)')
plot_roc_curve(Y_test, nb_tfidf.predict_proba(X_test_tfidf)[:, 1], 'ROC Curve: Naive Bayes (TF-IDF)')
plot_roc_curve(Y_test, nb_w2v.predict_proba(X_test_w2v_scaled)[:, 1], 'ROC Curve: Naive Bayes (Word2Vec)')
plot_roc_curve(Y_test, knn_bow.predict_proba(X_test_bow_reduced)[:, 1], 'ROC Curve: KNN (BoW)')
plot_roc_curve(Y_test, knn_tfidf.predict_proba(X_test_tfidf_reduced)[:, 1], 'ROC Curve: KNN (TF-IDF)')
plot_roc_curve(Y_test, knn_w2v.predict_proba(X_test_w2v_scaled)[:, 1], 'ROC Curve: KNN (Word2Vec)')


In [1]:
# Importing necessary libraries
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_20newsgroups

# Load a sample text-classification corpus (four newsgroup categories);
# swap in your own dataset here.
newsgroup_categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
dataset = fetch_20newsgroups(subset='all', categories=newsgroup_categories)

# Hold out 20% of the documents for testing.
# Note: this rebinds X_train / X_test / y_train / y_test, names that other
# cells of this notebook also assign.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.2,
    random_state=42,
)

# Bag-of-Words features: vocabulary learned on the training documents only
count_vectorizer = CountVectorizer()
X_train_counts = count_vectorizer.fit_transform(X_train)
X_test_counts = count_vectorizer.transform(X_test)

# TF-IDF features: likewise fitted on the training documents only
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# One linear-kernel SVM instance, refitted for each feature representation
svm_classifier = SVC(kernel='linear')


def _fit_predict_score(train_features, test_features):
    """Refit ``svm_classifier`` and return (test predictions, test accuracy)."""
    svm_classifier.fit(train_features, y_train)
    predictions = svm_classifier.predict(test_features)
    return predictions, accuracy_score(y_test, predictions)


# Bag of Words
y_pred_counts, accuracy_counts = _fit_predict_score(X_train_counts, X_test_counts)
print("Accuracy using Bag of Words:", accuracy_counts)

# TF-IDF (after this call `svm_classifier` is left fitted on the TF-IDF features)
y_pred_tfidf, accuracy_tfidf = _fit_predict_score(X_train_tfidf, X_test_tfidf)
print("Accuracy using TF-IDF:", accuracy_tfidf)


Accuracy using Bag of Words: 0.9295212765957447
Accuracy using TF-IDF: 0.9680851063829787


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from gensim.models import Word2Vec
from sklearn.preprocessing import scale
import pandas as pd

# Assuming you have your data and labels
# X = data
# y = labels

# Example data (replace with your actual data)
# NOTE(review): three toy sentences are placeholders only; they are presumably too few
# for the classifiers configured further down in this cell — replace before running.
# This also rebinds `X` and `y`, names that other cells of this notebook assign.
X = ["I love this movie", "I hate this movie", "This movie is okay"]
y = [1, 0, 1]

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Extraction Methods
# Maps a display name to an unfitted vectorizer.
vectorizers = {
    'Bag of Words': CountVectorizer(),
    'TF-IDF': TfidfVectorizer()
}

# Word2Vec (Assuming pre-trained model or training your own model)
# Here a new model is trained on the whitespace-tokenised training sentences only,
# so words that occur only in X_test are not in its vocabulary.
w2v_model = Word2Vec(sentences=[sentence.split() for sentence in X_train], vector_size=100, window=5, min_count=1, workers=4)
def word2vec_features(sentences, model):
    """Build standardised mean-Word2Vec features for a list of sentences.

    Parameters
    ----------
    sentences : iterable of str
        Raw sentences; each is tokenised with ``str.split()``.
    model : trained Word2Vec model
        Must expose ``model.wv`` (membership test and lookup) and
        ``model.vector_size``.

    Returns
    -------
    np.ndarray
        Array of shape ``(len(sentences), model.vector_size)``, passed through
        ``sklearn.preprocessing.scale``. ``scale`` standardises with the
        statistics of the rows given in *this* call, so separate calls for
        train and test data are scaled independently of each other.
    """
    features = []
    for sentence in sentences:
        known_vectors = [model.wv[word] for word in sentence.split() if word in model.wv]
        if known_vectors:
            features.append(np.mean(known_vectors, axis=0))
        else:
            # No in-vocabulary token (common for test sentences, since the model
            # was trained on other text): np.mean([]) would yield NaN and make
            # the stacked array ragged, so use a zero vector instead.
            features.append(np.zeros(model.vector_size))

    if not features:
        # Nothing to standardise; keep the column count consistent.
        return np.empty((0, model.vector_size))
    return scale(np.array(features))

# Classifiers
# Unfitted templates; every evaluation below fits its own clone of these.
classifiers = {
    'Naive Bayes': MultinomialNB(),
    'k-NN': KNeighborsClassifier(),
    'SVM': SVC(probability=True)  # SVC for ROC Curve
}

# Initialize dictionaries to store results
# Keyed by (feature-extraction name, classifier name).
results = {}


def _fit_and_score(classifier, X_train_features, X_test_features):
    """Fit a fresh copy of ``classifier`` and score it on the held-out split.

    Uses the module-level ``y_train`` / ``y_test`` labels.

    Returns a dict with the weighted precision/recall/F1, the accuracy, the
    predictions (``'y_pred'``), the fitted estimator (``'classifier'``) and
    the test features it was evaluated on (``'X_test_vec'``).
    """
    # clone() yields an unfitted copy with the same hyper-parameters. Fitting the
    # shared template instead would overwrite it on the next feature set, leaving
    # every stored 'classifier' fitted on whichever features were used last.
    fitted = clone(classifier).fit(X_train_features, y_train)
    predictions = fitted.predict(X_test_features)
    return {
        'accuracy': accuracy_score(y_test, predictions),
        'precision': precision_score(y_test, predictions, average='weighted'),
        'recall': recall_score(y_test, predictions, average='weighted'),
        'f1': f1_score(y_test, predictions, average='weighted'),
        'y_pred': predictions,
        'classifier': fitted,
        'X_test_vec': X_test_features,
    }


for vec_name, vectorizer in vectorizers.items():
    # Fit the vectorizer on the training text only, then reuse it for the test text.
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    for clf_name, classifier in classifiers.items():
        results[(vec_name, clf_name)] = _fit_and_score(classifier, X_train_vec, X_test_vec)

# Word2Vec
X_train_w2v = word2vec_features(X_train, w2v_model)
X_test_w2v = word2vec_features(X_test, w2v_model)

# Standardised embeddings contain negative values, which MultinomialNB rejects;
# GaussianNB is the Naive Bayes variant for real-valued features. The result is
# still stored under the 'Naive Bayes' key.
w2v_classifiers = {**classifiers, 'Naive Bayes': GaussianNB()}

for clf_name, classifier in w2v_classifiers.items():
    results[('Word2Vec', clf_name)] = _fit_and_score(classifier, X_train_w2v, X_test_w2v)

# The confusion-matrix cell that follows reads `y_pred`; keep it bound to the
# predictions of the last model evaluated, as before.
y_pred = results[('Word2Vec', clf_name)]['y_pred']

# Display results


### confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix for whatever predictions are currently held in `y_pred`
conf_matrix = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

In [3]:
import pandas as pd
# Load the reviews from the comma-separated file.
# NOTE(review): the first cell loads 'reviews.tsv' with a tab separator instead —
# confirm both files hold the same data.
df = pd.read_csv('reviews.csv')
# Preview the first five rows
df.head()


Unnamed: 0,Id,Review,Label
0,0,good and interesting,5
1,1,"This class is very helpful to me. Currently, I...",5
2,2,like!Prof and TAs are helpful and the discussi...,5
3,3,Easy to follow and includes a lot basic and im...,5
4,4,Really nice teacher!I could got the point eazl...,4


In [4]:
# Number of rows for each distinct value in the 'Label' column, most frequent first
label_counts = df["Label"].value_counts()

# Print or visualize the counts
print(label_counts)

Label
5    79173
4    18054
3     5071
1     2469
2     2251
Name: count, dtype: int64


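The counts above show that the dataset is heavily imbalanced: label 5 accounts for 79,173 of the 107,018 reviews (about 74%), while labels 1 and 2 together make up fewer than 5%.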

In [None]:
Certainly! Below is a modified version of a Jupyter Notebook incorporating hyperparameter tuning using Grid Search, data augmentation using SMOTE, and undersampling for handling the imbalanced dataset.

## Step-by-Step Guide

### 1. Install necessary libraries

Make sure the required libraries are installed. If they are not, install them with pip:

```
!pip install imbalanced-learn scikit-learn
!pip install pandas numpy
```

2. Import Libraries
python
Copy code
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
3. Load Your Dataset
Replace your_data.csv with the path to your dataset.

python
Copy code
# Load dataset
df = pd.read_csv('your_data.csv')
X = df['text']
y = df['label']
4. Vectorize Text Data
python
Copy code
# Vectorize text data
vectorizer = TfidfVectorizer(max_features=5000)
X_vect = vectorizer.fit_transform(X)
5. Data Augmentation
python
Copy code
# Define oversampling and undersampling strategies
over = SMOTE(sampling_strategy=0.5)
under = RandomUnderSampler(sampling_strategy=0.8)

# Create a pipeline with SMOTE and RandomUnderSampler
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# Apply pipeline to the data
X_res, y_res = pipeline.fit_resample(X_vect, y)
6. Split Data
python
Copy code
# Split the resampled data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42, stratify=y_res)
7. Hyperparameter Tuning with Grid Search
python
Copy code
# Initialize the model
model = RandomForestClassifier(random_state=42)

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Set up the grid search with cross-validation
cv = StratifiedKFold(n_splits=5)
grid_search = GridSearchCV(model, param_grid, scoring='f1', cv=cv, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Print best parameters
print(f"Best parameters: {grid_search.best_params_}")
8. Evaluate the Model
python
Copy code
# Predict on the test set
y_pred = best_model.predict(X_test)

# Print classification report and

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb

# Assuming `df` is your DataFrame with 'text' and 'label' columns
# Load your dataset
df = pd.read_csv('your_dataset.csv')

# Data Preprocessing
X = df['text']
y = df['label']

# Split the data into training and testing sets
# Stratify so both sides keep the original class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Vectorization using TF-IDF
# Fitted on the training text only; the test text is transformed with the same vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Handling Imbalanced Dataset using SMOTE
# Only the training split is oversampled; the test split keeps its real class balance.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train_vect, y_train)

# Model Training with Grid Search
# Here we use RandomForest as an example, you can replace it with other classifiers like SVM, XGBoost, etc.
model = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Using StratifiedKFold for cross-validation
cv = StratifiedKFold(n_splits=5)

# Grid Search for Hyperparameter Tuning
# Macro-averaged F1: the plain 'f1' scorer is defined for binary targets only,
# whereas 'f1_macro' works for binary and multi-class labels and weights every
# class equally, which suits an imbalanced label column.
# NOTE(review): the folds are drawn from the already-resampled data, so validation
# folds contain synthetic samples — consider moving SMOTE inside an imblearn Pipeline.
grid_search = GridSearchCV(model, param_grid, scoring='f1_macro', cv=cv, n_jobs=-1)
grid_search.fit(X_res, y_res)

# Best Model Evaluation
# Evaluated on the untouched (non-resampled) test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_vect)

# Evaluation Metrics
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Optional: Save the best model for future use
import joblib
joblib.dump(best_model, 'best_model.pkl')


## Outlier detection, PCA and SMOTE

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import LocalOutlierFactor
from imblearn.over_sampling import SMOTE

# Load your dataset
data = pd.read_csv('your_dataset.csv')

# Ensure that the dataset contains the required columns
assert 'ID' in data.columns, "The dataset must contain an 'ID' column."
assert 'Review' in data.columns, "The dataset must contain a 'Review' column."
assert 'Labels' in data.columns, "The dataset must contain a 'Labels' column."
assert 'cleaned_reviews' in data.columns, "The dataset must contain a 'cleaned_reviews' column."

# Calculate Z-scores for Labels (assuming Labels is the sentiment score)
data['z_score'] = (data['Labels'] - data['Labels'].mean()) / data['Labels'].std()

# Identify outliers based on Z-score
outliers_z = data[np.abs(data['z_score']) > 3]
# Renumber the surviving rows 0..n-1 so that row i of `data` is row i of the
# TF-IDF matrix built below (the original index has gaps after filtering).
data = data[np.abs(data['z_score']) <= 3].reset_index(drop=True)

# Visualize Labels without outliers
plt.figure(figsize=(10, 6))
sns.boxplot(x=data['Labels'])
plt.title('Boxplot of Labels (Sentiment Scores) after Removing Z-score Outliers')
plt.show()

print("Removed outliers based on Z-score:")
print(outliers_z[['ID', 'Review', 'Labels', 'z_score']])

# Convert cleaned_reviews data to TF-IDF features
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data['cleaned_reviews'])

# Apply TruncatedSVD for dimensionality reduction
# (seeded so the reduced features, and hence the LOF result, are repeatable)
svd = TruncatedSVD(n_components=10, random_state=42)
svd_features = svd.fit_transform(tfidf_matrix)

# Use Local Outlier Factor (LOF) to detect outliers in the reduced feature space
# fit_predict returns -1 for outliers and 1 for inliers.
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
outlier_labels = lof.fit_predict(svd_features)

# Add the outlier labels to the dataset
data['outlier_lof'] = outlier_labels

# Visualize the SVD-reduced features with outliers
plt.figure(figsize=(10, 6))
plt.scatter(svd_features[:, 0], svd_features[:, 1], c=outlier_labels, cmap='coolwarm', marker='o')
plt.title('SVD-reduced Features with LOF Outliers')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.show()

# Display the outliers detected by LOF
outliers_lof = data[data['outlier_lof'] == -1]
print("Outliers detected by LOF:")
print(outliers_lof[['ID', 'Review', 'Labels', 'outlier_lof']])

# Remove LOF outliers from the dataset
# The same positional mask filters both the DataFrame and the TF-IDF rows, so
# the two cannot drift apart (index labels are never used as row positions).
inlier_mask = outlier_labels != -1
data = data[inlier_mask]
tfidf_inliers = tfidf_matrix[np.flatnonzero(inlier_mask)]

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
tfidf_matrix_resampled, labels_resampled = smote.fit_resample(tfidf_inliers, data['Labels'])

# Optionally, apply TruncatedSVD again to the resampled data for visualization
svd_resampled = TruncatedSVD(n_components=2, random_state=42)
svd_features_resampled = svd_resampled.fit_transform(tfidf_matrix_resampled)

plt.figure(figsize=(10, 6))
plt.scatter(svd_features_resampled[:, 0], svd_features_resampled[:, 1], c=labels_resampled, cmap='viridis', marker='o')
plt.title('SVD-reduced Features after Applying SMOTE')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.show()

# Final cleaned and balanced dataset
# Keep the data in sparse CSR format (no extra import needed: the resampled
# matrix is already a SciPy sparse matrix).
cleaned_data = tfidf_matrix_resampled.tocsr()
labels_resampled = np.array(labels_resampled).reshape(-1, 1)  # Convert labels to a 2D array


print("Cleaned and balanced dataset:")
# Sparse matrices have no .head(); report the shape and show the first rows instead.
print(cleaned_data.shape)
print(cleaned_data[:5])


In [None]:
# Saving the data
import scipy.sparse as sp
import numpy as np

# Assuming `cleaned_data` is your sparse TF-IDF matrix and `labels_resampled` is your labels array
# (both are produced by the previous cell).

# Save the sparse matrix in .npz format (or choose another sparse format)
sp.save_npz('cleaned_data.npz', cleaned_data)

# Save the labels as a NumPy array (.npy)
np.save('labels_resampled.npy', labels_resampled)

# Printed once both files have been written to the current working directory.
print("Cleaned and balanced dataset saved successfully!")
