<a href="https://colab.research.google.com/github/ayush-710/Paradox/blob/main/modelBuildingForSentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Defining,Training and Testing The Model and Generating Classification Reports

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from google.colab import drive
import pickle
from sklearn.metrics import accuracy_score, classification_report
drive.mount('/content/drive',force_remount=True)
df1 = pd.read_csv('/content/drive/MyDrive/MinorProjectDatasets/IMDB_Dataset_Cleaned.csv')

# Drop rows with missing values
df1 = df1.dropna(subset=['cleaned_text'])
df1 = df1.dropna(subset=['sentiment'])
# Split data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(df1['cleaned_text'], df1['sentiment'], test_size=0.30, random_state=2)

# Convert sentiments to 1 and 0
y_train = (y_train.replace({'positive': 1, 'negative': 0})).values
y_test = (y_test.replace({'positive': 1, 'negative': 0})).values

# Initialize TF-IDF vectorizer
tfidf = TfidfVectorizer(strip_accents=None,
                         lowercase=True,
                         preprocessor=None,  # applied preprocessor in Data Cleaning
                         tokenizer=None,
                         use_idf=True,
                         norm='l2',
                         smooth_idf=True)


# Transform text data
x_train_tfidf = tfidf.fit_transform(x_train)
x_test_tfidf = tfidf.transform(x_test)
# Model training and evaluation
classifiers = {
    "Naive_Bayes": MultinomialNB(),
    "Logistic_Regression": LogisticRegression(),
    # "KNN": KNeighborsClassifier(),
    "Linear_SVC": LinearSVC(),
    "Stochastic_Gradient_Classifier": SGDClassifier(),
    # "Random_Forest": RandomForestClassifier()
}

print("With Data Pre-Processing:")
for name, clf in classifiers.items():
    clf.fit(x_train_tfidf, y_train)
    accuracy = metrics.accuracy_score(clf.predict(x_test_tfidf), y_test)
    print(f"{name} accuracy_score = {accuracy * 100:.2f}%")

print("With Data Pre-Processing and TF-IDF:")
for name, clf in classifiers.items():
    clf.fit(x_train_tfidf, y_train)
    y_pred = clf.predict(x_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} accuracy_score = {accuracy * 100:.2f}%")

    # Classification report (ensure consistent target names)
    report = classification_report(y_test, y_pred, target_names=['Positive', 'Negative'])
    print(f"{name} classification report:\n{report}")



Mounted at /content/drive
With Data Pre-Processing:
Naive_Bayes accuracy_score = 86.75%
Logistic_Regression accuracy_score = 89.43%
Linear_SVC accuracy_score = 89.55%
Stochastic_Gradient_Classifier accuracy_score = 89.54%
With Data Pre-Processing and TF-IDF:
Naive_Bayes accuracy_score = 86.75%
Naive_Bayes classification report:
              precision    recall  f1-score   support

    Positive       0.86      0.88      0.87      7499
    Negative       0.87      0.86      0.87      7501

    accuracy                           0.87     15000
   macro avg       0.87      0.87      0.87     15000
weighted avg       0.87      0.87      0.87     15000

Logistic_Regression accuracy_score = 89.43%
Logistic_Regression classification report:
              precision    recall  f1-score   support

    Positive       0.91      0.88      0.89      7499
    Negative       0.88      0.91      0.90      7501

    accuracy                           0.89     15000
   macro avg       0.89      0.89     

# Performing Grid Search for hyperparameter tuning and evaluating the model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import clone  # Import clone function


def grid_search_evaluate(model_name, model_class, param_grid):
    """
    Performs Grid Search for hyperparameter tuning and evaluates the model.

    Args:
        model_name: Name of the model being evaluated.
        model_class: The model class (e.g., MultinomialNB).
        param_grid: Dictionary containing hyperparameter names and their values to explore.

    Returns:
        None (prints results)
    """
    # Perform Grid Search
    grid_search = GridSearchCV(model_class(), param_grid, cv=5, scoring='accuracy')
    grid_search.fit(x_train_tfidf, y_train)
    best_model = grid_search.best_estimator_
    best_accuracy = grid_search.best_score_

    # Evaluate and print results
    y_pred = best_model.predict(x_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} (Grid Search):")
    print(f"Best Accuracy: {best_accuracy:.2f}")
    print(f"Test Accuracy: {accuracy * 100:.2f}%")
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"{model_name} classification report:\n{classification_report(y_test, y_pred, target_names=['Positive', 'Negative'])}")
    print("-" * 80)  # Separator for clarity

# Define hyperparameter grids (adjust values and parameters as needed)
multinomialnb_grid = {'alpha': [0.01, 0.1, 1]}
logistic_regression_grid = {'C': [0.01, 0.1, 1], 'penalty': ['l1', 'l2']}
# knn_grid = {'n_neighbors': range(1, 10), 'algorithm': ['auto', 'ball_tree', 'kd_tree']}
linear_svc_grid = {'C': [0.01, 0.1, 1]}
sgdc_grid = {'alpha': [0.0001, 0.001, 0.01], 'loss': ['hinge', 'log']}
# random_forest_grid = {'n_estimators': [100, 200, 300], 'max_depth': [5, 10, 15]}

# Evaluate
models = [
    ('MultinomialNB', MultinomialNB),
    ('Logistic_Regression', LogisticRegression),
    # ('KNN', KNeighborsClassifier),
    ('Linear_SVC', LinearSVC),
    ('SGDC', SGDClassifier),
    # ('Random_Forest', RandomForestClassifier),
]

for name, model_class in models:
    grid_search_evaluate(name, model_class, eval(name.lower() + '_grid'))  # Use corresponding grid

print("Grid Search Evaluation Completed!")


MultinomialNB (Grid Search):
Best Accuracy: 0.86
Test Accuracy: 86.75%
Best Parameters: {'alpha': 1}
MultinomialNB classification report:
              precision    recall  f1-score   support

    Positive       0.86      0.88      0.87      7499
    Negative       0.87      0.86      0.87      7501

    accuracy                           0.87     15000
   macro avg       0.87      0.87      0.87     15000
weighted avg       0.87      0.87      0.87     15000

--------------------------------------------------------------------------------


15 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



Logistic_Regression (Grid Search):
Best Accuracy: 0.89
Test Accuracy: 89.43%
Best Parameters: {'C': 1, 'penalty': 'l2'}
Logistic_Regression classification report:
              precision    recall  f1-score   support

    Positive       0.91      0.88      0.89      7499
    Negative       0.88      0.91      0.90      7501

    accuracy                           0.89     15000
   macro avg       0.89      0.89      0.89     15000
weighted avg       0.89      0.89      0.89     15000

--------------------------------------------------------------------------------
Linear_SVC (Grid Search):
Best Accuracy: 0.89
Test Accuracy: 89.59%
Best Parameters: {'C': 0.1}
Linear_SVC classification report:
              precision    recall  f1-score   support

    Positive       0.91      0.88      0.89      7499
    Negative       0.88      0.91      0.90      7501

    accuracy                           0.90     15000
   macro avg       0.90      0.90      0.90     15000
weighted avg       0.90    



SGDC (Grid Search):
Best Accuracy: 0.89
Test Accuracy: 89.55%
Best Parameters: {'alpha': 0.0001, 'loss': 'hinge'}
SGDC classification report:
              precision    recall  f1-score   support

    Positive       0.91      0.88      0.89      7499
    Negative       0.88      0.91      0.90      7501

    accuracy                           0.90     15000
   macro avg       0.90      0.90      0.90     15000
weighted avg       0.90      0.90      0.90     15000

--------------------------------------------------------------------------------
Grid Search Evaluation Completed!


Saving the TF-IDF Vectorizer to Googlr Drive for Project Implementation

In [None]:
from google.colab import drive
drive.mount('/content/drive')
# Assuming "MinorProjectDatasets" is in the root of your drive
basePath = '/content/drive/MyDrive/MinorProjectDatasets'
import pickle

# Save the TF-IDF vectorizer
with open(f'{basePath}/tfidf.pkl', 'wb') as f:
    pickle.dump(tfidf, f)


# Saving the Models to Google Drive

In [None]:
# Save models
for name, clf in classifiers.items():
    model_path = f'/content/drive/MyDrive/MinorProjectDatasets/{name}.pkl'
    with open(model_path, 'wb') as f:
        pickle.dump(clf, f)

# Random Forest Model

1.   Model Creation,training and testing
2.   Model Evaluation





In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from google.colab import drive
import pickle
drive.mount('/content/drive',force_remount=True)
df1 = pd.read_csv('/content/drive/MyDrive/MinorProjectDatasets/IMDB_Dataset_Cleaned.csv')

# Drop rows with missing values
df1 = df1.dropna(subset=['cleaned_text'])
df1 = df1.dropna(subset=['sentiment'])
# Split data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(df1['cleaned_text'], df1['sentiment'], test_size=0.30, random_state=2)

# Convert sentiments to 1 and 0
y_train = (y_train.replace({'positive': 1, 'negative': 0})).values
y_test = (y_test.replace({'positive': 1, 'negative': 0})).values

# # Initialize TF-IDF vectorizer
tfidf = TfidfVectorizer(strip_accents=None,
                         lowercase=True,
                         preprocessor=None,  # applied preprocessor in Data Cleaning
                         tokenizer=None,
                         use_idf=True,
                         norm='l2',
                         smooth_idf=True)


# Transform text data
x_train_tfidf = tfidf.fit_transform(x_train)
x_test_tfidf = tfidf.transform(x_test)
# Model training and evaluation
classifiers = {
    # "Naive_Bayes": MultinomialNB(),
    # "Logistic_Regression": LogisticRegression(),
    # "KNN": KNeighborsClassifier(),
    # "Linear_SVC": LinearSVC(),
    # "Stochastic_Gradient_Classifier": SGDClassifier(),
    "Random_Forest": RandomForestClassifier()
}

print("With Data Pre-Processing:")
for name, clf in classifiers.items():
    clf.fit(x_train_tfidf, y_train)
    y_pred = clf.predict(x_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} accuracy_score = {accuracy * 100:.2f}%")



# Classification report (ensure consistent target names)
    report = classification_report(y_test, y_pred, target_names=['Positive', 'Negative'])
    print(f"{name} classification report:\n{report}")

Mounted at /content/drive
With Data Pre-Processing:
Random_Forest accuracy_score = 85.77%
Random_Forest classification report:
              precision    recall  f1-score   support

    Positive       0.86      0.86      0.86      7499
    Negative       0.86      0.86      0.86      7501

    accuracy                           0.86     15000
   macro avg       0.86      0.86      0.86     15000
weighted avg       0.86      0.86      0.86     15000



Performing Hyperparameter Tuning on RF Model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import clone  # Import clone function


def grid_search_evaluate(model_name, model_class, param_grid):
    """
    Performs Grid Search for hyperparameter tuning and evaluates the model.

    Args:
        model_name: Name of the model being evaluated.
        model_class: The model class (e.g., MultinomialNB).
        param_grid: Dictionary containing hyperparameter names and their values to explore.

    Returns:
        None (prints results)
    """
    # Perform Grid Search
    grid_search = GridSearchCV(model_class(), param_grid, cv=5, scoring='accuracy')
    grid_search.fit(x_train_tfidf, y_train)
    best_model = grid_search.best_estimator_
    best_accuracy = grid_search.best_score_

    # Evaluate and print results
    y_pred = best_model.predict(x_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} (Grid Search):")
    print(f"Best Accuracy: {best_accuracy:.2f}")
    print(f"Test Accuracy: {accuracy * 100:.2f}%")
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"{model_name} classification report:\n{classification_report(y_test, y_pred, target_names=['Positive', 'Negative'])}")
    print("-" * 80)  # Separator for clarity

# Define hyperparameter grids (adjust values and parameters as needed)
# multinomialnb_grid = {'alpha': [0.01, 0.1, 1]}
# logistic_regression_grid = {'C': [0.01, 0.1, 1], 'penalty': ['l1', 'l2']}
# knn_grid = {'n_neighbors': range(1, 10), 'algorithm': ['auto', 'ball_tree', 'kd_tree']}
# linear_svc_grid = {'C': [0.01, 0.1, 1]}
# sgdc_grid = {'alpha': [0.0001, 0.001, 0.01], 'loss': ['hinge', 'log']}
random_forest_grid = {'n_estimators': [100, 200, 300], 'max_depth': [5, 10, 15]}

# Evaluate
models = [
    # ('MultinomialNB', MultinomialNB),
    # ('Logistic_Regression', LogisticRegression),
    #  ('KNN', KNeighborsClassifier),
    # ('Linear_SVC', LinearSVC),
    # ('SGDC', SGDClassifier),
     ('Random_Forest', RandomForestClassifier),
]

for name, model_class in models:
    grid_search_evaluate(name, model_class, eval(name.lower() + '_grid'))  # Use corresponding grid

print("Grid Search Evaluation Completed!")


Random_Forest (Grid Search):
Best Accuracy: 0.85
Test Accuracy: 84.89%
Best Parameters: {'max_depth': 15, 'n_estimators': 300}
Random_Forest classification report:
              precision    recall  f1-score   support

    Positive       0.87      0.82      0.84      7499
    Negative       0.83      0.88      0.85      7501

    accuracy                           0.85     15000
   macro avg       0.85      0.85      0.85     15000
weighted avg       0.85      0.85      0.85     15000

--------------------------------------------------------------------------------
Grid Search Evaluation Completed!


# Saving RF Model

In [None]:
# Save models
for name, clf in classifiers.items():
    model_path = f'/content/drive/MyDrive/MinorProjectDatasets/{name}.pkl'
    with open(model_path, 'wb') as f:
        pickle.dump(clf, f)

# Training and Testing the KNN Model and evaluating the model


In [None]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.linear_model import LogisticRegression
# from sklearn.linear_model import SGDClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.svm import LinearSVC
# from sklearn import metrics
# from sklearn.feature_extraction.text import TfidfVectorizer
# from google.colab import drive
# import pickle
# drive.mount('/content/drive',force_remount=True)
# df1 = pd.read_csv('/content/drive/MyDrive/MinorProjectDatasets/IMDB_Dataset_Cleaned.csv')

# # Drop rows with missing values
# df1 = df1.dropna(subset=['cleaned_text'])
# df1 = df1.dropna(subset=['sentiment'])
# # Split data into train and test sets
# x_train, x_test, y_train, y_test = train_test_split(df1['cleaned_text'], df1['sentiment'], test_size=0.30, random_state=2)

# # Convert sentiments to 1 and 0
# y_train = (y_train.replace({'positive': 1, 'negative': 0})).values
# y_test = (y_test.replace({'positive': 1, 'negative': 0})).values

# # Initialize TF-IDF vectorizer
# tfidf = TfidfVectorizer(strip_accents=None,
#                          lowercase=True,
#                          preprocessor=None,  # applied preprocessor in Data Cleaning
#                          tokenizer=None,
#                          use_idf=True,
#                          norm='l2',
#                          smooth_idf=True)


# # Transform text data
# x_train_tfidf = tfidf.fit_transform(x_train)
# x_test_tfidf = tfidf.transform(x_test)
# # Model training and evaluation
# classifiers = {
#     # "Naive_Bayes": MultinomialNB(),
#     # "Logistic_Regression": LogisticRegression(),
#     "KNN": KNeighborsClassifier(),
#     # "Linear_SVC": LinearSVC(),
#     # "Stochastic_Gradient_Classifier": SGDClassifier(),
#     # "Random_Forest": RandomForestClassifier()
# }

# print("With Data Pre-Processing:")
# for name, clf in classifiers.items():
#     clf.fit(x_train_tfidf, y_train)
#     y_pred = clf.predict(x_test_tfidf)
#     accuracy = accuracy_score(y_test, y_pred)
#     print(f"{name} accuracy_score = {accuracy * 100:.2f}%")



# # for name, model_class in models:
# #     grid_search_evaluate(name, model_class, eval(name.lower() + '_grid'))  # Use corresponding grid

# # print("Grid Search Evaluation Completed!")



Mounted at /content/drive
With Data Pre-Processing:


# Saving the KNN Model

In [None]:
# # Save models
# for name, clf in classifiers.items():
#     model_path = f'/content/drive/MyDrive/MinorProjectDatasets/{name}.pkl'
#     with open(model_path, 'wb') as f:
#         pickle.dump(clf, f)