In [58]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


In [59]:
# Read the raw sentiment-analysis CSV into a DataFrame.
# NOTE(review): latin-1 is presumably needed because the file is not valid UTF-8 -- confirm.
df = pd.read_csv(
    "sentiment_analysis_data.csv",
    encoding="latin-1",
)

In [60]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer whose vocabulary is capped at 1000 features
# (max_features is the maximum vocabulary size to consider).
cv = CountVectorizer(
    max_features=1000,
)

In [61]:
# Learn the vocabulary from the "comment" column, count the terms, and
# convert the result to a dense array of input features.
# NOTE(review): the vocabulary is fitted on the full dataset, i.e. before the
# train/test split is made.
comment_counts = cv.fit_transform(df["comment"])
x = comment_counts.toarray()

In [62]:

# Target labels as a plain NumPy array, aligned row-for-row with x.
y = df["label"].to_numpy()

In [63]:
# This function is used to split datasets into training and testing sets
from sklearn.model_selection import train_test_split

In [64]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# x holds the input features and y the target labels; the fixed
# random_state keeps the split the same from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

In [65]:
# Display the dimensions of the training feature matrix as a sanity check on the split.
x_train.shape

(32915, 1000)

In [66]:
# Import the Logistic Regression estimator (it is instantiated in a later cell).
# max_iter: Maximum number of iterations for the optimization algorithm to converge
from sklearn.linear_model import LogisticRegression

In [67]:
# Logistic Regression classifier, allowed up to 1000 iterations to converge.
model = LogisticRegression(
    max_iter=1000,
)

In [68]:
# Train the Logistic Regression model on the training split.
model.fit(X=x_train, y=y_train)

In [69]:
# Accuracy of the Logistic Regression model on the data it was trained on.
#   x_train_prediction: labels the model predicts for the training rows
#   y_train:            the actual labels of those rows
x_train_prediction = model.predict(X=x_train)
training_data_accuracy = accuracy_score(
    y_true=y_train,
    y_pred=x_train_prediction,
)
training_data_accuracy

0.6809053622968252

In [70]:
# Accuracy of the Logistic Regression model on the held-out test split.
#   x_test_prediction: labels the model predicts for the test rows
#   y_test:            the actual labels of those rows
x_test_prediction = model.predict(X=x_test)
test_data_accuracy = accuracy_score(
    y_true=y_test,
    y_pred=x_test_prediction,
)
test_data_accuracy

0.6480738850407097

### Evaluating the model on both the training and testing datasets gives insight into its performance and helps detect issues such as overfitting. If the training-set accuracy is high but the testing-set accuracy is low, the model is overfitting, and we may need to adjust the model or collect more diverse training data. Here the two accuracies are close (about 0.68 on the training set versus 0.65 on the testing set), so the model is not overfitting.

### Calculate common performance metrics (accuracy, precision, recall, and F1-score) using scikit-learn: Logistic Regression

In [79]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)

# Score the Logistic Regression test-set predictions (x_test_prediction)
# against the true test labels with each summary metric.
accuracy = accuracy_score(y_true=y_test, y_pred=x_test_prediction)
precision = precision_score(y_true=y_test, y_pred=x_test_prediction)
recall = recall_score(y_true=y_test, y_pred=x_test_prediction)
f1 = f1_score(y_true=y_test, y_pred=x_test_prediction)

# Per-class precision / recall / F1 table.
class_report = classification_report(y_true=y_test, y_pred=x_test_prediction)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("Classification Report:\n", class_report)

Accuracy: 0.6480738850407097
Precision: 0.6231561369329252
Recall: 0.5921713832319492
F1-score: 0.607268782207757
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.70      0.68      4448
           1       0.62      0.59      0.61      3781

    accuracy                           0.65      8229
   macro avg       0.65      0.64      0.64      8229
weighted avg       0.65      0.65      0.65      8229



In [71]:
# Random forest of 100 trees. random_state is fixed (same seed as the
# train/test split) so that the randomness used while building the trees --
# and therefore the accuracies reported below -- is repeatable on
# Restart & Run All instead of changing on every run.
classifier = RandomForestClassifier(n_estimators=100, random_state=2)

In [72]:
# Train the random forest on the training split.
classifier.fit(X=x_train, y=y_train)

In [73]:
# Accuracy of the random forest on its own training data.
# Note: this reuses (and overwrites) the x_train_prediction and
# training_data_accuracy names that earlier held the Logistic Regression values.
x_train_prediction = classifier.predict(X=x_train)
training_data_accuracy = accuracy_score(
    y_true=y_train,
    y_pred=x_train_prediction,
)
training_data_accuracy


0.9604739480480025

In [74]:
# Accuracy of the random forest on the held-out test split; the trailing "c"
# in the names keeps these values separate from the Logistic Regression ones.
x_test_predictionc = classifier.predict(X=x_test)
test_data_accuracyc = accuracy_score(
    y_true=y_test,
    y_pred=x_test_predictionc,
)
test_data_accuracyc

0.5520719406975331

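### The model is overfitting: its training accuracy is about 0.96, but its testing accuracy is only about 0.55. The next step is to improve the model, for example by tuning its hyperparameters with GridSearchCV.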

### Calculate common performance metrics (accuracy, precision, recall, and F1-score) using scikit-learn: Random Forest classifier

In [80]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)

# Evaluate the random-forest test predictions (x_test_predictionc) with each
# scalar metric in turn: accuracy, precision, recall, then F1-score.
accuracy, precision, recall, f1 = (
    metric(y_test, x_test_predictionc)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Per-class breakdown of the same predictions.
class_report = classification_report(y_test, x_test_predictionc)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("Classification Report:\n", class_report)

Accuracy: 0.5520719406975331
Precision: 0.5129392536093708
Recall: 0.49801639777836554
F1-score: 0.5053676865271068
Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.60      0.59      4448
           1       0.51      0.50      0.51      3781

    accuracy                           0.55      8229
   macro avg       0.55      0.55      0.55      8229
weighted avg       0.55      0.55      0.55      8229



In [75]:
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB

# One default-configured estimator per Naive Bayes variant:
#   clf1 -> Gaussian, clf2 -> Multinomial, clf3 -> Bernoulli
clf1, clf2, clf3 = GaussianNB(), MultinomialNB(), BernoulliNB()


In [76]:
# Train all three Naive Bayes variants on the same training split.
clf1.fit(X=x_train, y=y_train)  # Gaussian
clf2.fit(X=x_train, y=y_train)  # Multinomial
clf3.fit(X=x_train, y=y_train)  # Bernoulli


In [77]:
# Predict test-set labels with each fitted Naive Bayes model, in the order
# Gaussian (y_pred), Multinomial (y_pred2), Bernoulli (y_pred3).
y_pred, y_pred2, y_pred3 = (
    nb_model.predict(x_test) for nb_model in (clf1, clf2, clf3)
)

In [78]:
# Report the test-set accuracy of each Naive Bayes variant
# (Gaussian, then Multinomial, then Bernoulli).
print(
    "Gaussian Naive Bayes classifier :",
    accuracy_score(y_true=y_test, y_pred=y_pred),
)
print(
    "Multinomial Naive Bayes classifier :",
    accuracy_score(y_true=y_test, y_pred=y_pred2),
)
print(
    " Bernoulli Naive Bayes classifier",
    accuracy_score(y_true=y_test, y_pred=y_pred3),
)

Gaussian Naive Bayes classifier : 0.5876777251184834
Multinomial Naive Bayes classifier : 0.6498967067687447
 Bernoulli Naive Bayes classifier 0.6424838984080691


### Calculate common performance metrics (accuracy, precision, recall, and F1-score) using scikit-learn: Gaussian Naive Bayes classifier

In [81]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)

# Summary metrics for the Gaussian Naive Bayes test predictions (y_pred).
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
precision = precision_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred)

# Per-class precision / recall / F1 table for the same predictions.
class_report = classification_report(y_true=y_test, y_pred=y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("Classification Report:\n", class_report)

Accuracy: 0.5876777251184834
Precision: 0.5593635250917993
Recall: 0.48346998148637926
F1-score: 0.5186551283870052
Classification Report:
               precision    recall  f1-score   support

           0       0.61      0.68      0.64      4448
           1       0.56      0.48      0.52      3781

    accuracy                           0.59      8229
   macro avg       0.58      0.58      0.58      8229
weighted avg       0.58      0.59      0.58      8229



### Calculate common performance metrics (accuracy, precision, recall, and F1-score) using scikit-learn: Multinomial Naive Bayes classifier

In [82]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)

# Evaluate the Multinomial Naive Bayes test predictions (y_pred2) with each
# scalar metric in turn: accuracy, precision, recall, then F1-score.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred2)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Per-class breakdown of the same predictions.
class_report = classification_report(y_test, y_pred2)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("Classification Report:\n", class_report)

Accuracy: 0.6498967067687447
Precision: 0.6386321626617375
Recall: 0.5482676540597725
F1-score: 0.5900099615767753
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.74      0.69      4448
           1       0.64      0.55      0.59      3781

    accuracy                           0.65      8229
   macro avg       0.65      0.64      0.64      8229
weighted avg       0.65      0.65      0.65      8229



### Calculate common performance metrics (accuracy, precision, recall, and F1-score) using scikit-learn: Bernoulli Naive Bayes classifier

In [83]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)

# Summary metrics for the Bernoulli Naive Bayes test predictions (y_pred3).
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred3)
precision = precision_score(y_true=y_test, y_pred=y_pred3)
recall = recall_score(y_true=y_test, y_pred=y_pred3)
f1 = f1_score(y_true=y_test, y_pred=y_pred3)

# Per-class precision / recall / F1 table for the same predictions.
class_report = classification_report(y_true=y_test, y_pred=y_pred3)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("Classification Report:\n", class_report)

Accuracy: 0.6424838984080691
Precision: 0.6322926521602018
Recall: 0.5302829939169532
F1-score: 0.576812428078251
Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.74      0.69      4448
           1       0.63      0.53      0.58      3781

    accuracy                           0.64      8229
   macro avg       0.64      0.63      0.63      8229
weighted avg       0.64      0.64      0.64      8229



## "Overall, I chose the Multinomial Naive Bayes classifier, as it is a popular and effective choice for sentiment analysis tasks, especially when dealing with text data. It also achieved the highest test accuracy of the models compared above (about 0.65, narrowly ahead of Logistic Regression). Its simplicity, efficiency, and good performance make it well-suited for analyzing sentiment in text."