# Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
import nltk
import re
from nltk.corpus import stopwords
import string
from google.colab import drive
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix,classification_report

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# files stored there can be read by path.
drive.mount('/content/drive')

Mounted at /content/drive


# Read Data

In [None]:
# Location of the IMDB review CSV under the mounted Google Drive folder.
DATASET_PATH = '/content/drive/MyDrive/DSM COURSE/NOTEBOOK/NLP/Sentiment Anslysist/IMDB Dataset.csv'

# Load the reviews and their sentiment labels into a DataFrame.
data = pd.read_csv(DATASET_PATH)

In [None]:
# Preview the first rows to check which columns were parsed from the CSV.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Count missing values per column.
data.isnull().sum()

Unnamed: 0,0
review,0
sentiment,0


# Text Cleansing

In [None]:
# Download the NLTK stop-word corpus required by stopwords.words() below.
nltk.download('stopwords')
# English Snowball stemmer, used by clean() to reduce each word to its stem.
stemmer = nltk.SnowballStemmer("english")
# English stop words held in a set so clean() can test membership per token.
stopword=set(stopwords.words('english'))

def clean(text):
    """Normalize a raw review into a space-separated string of stemmed tokens.

    Steps, in order: lowercase; drop ``[bracketed]`` spans and URLs; replace
    HTML tags and newlines with a space; delete punctuation; drop tokens
    that contain a digit; remove stop words; stem the remaining tokens.

    Relies on the module-level ``stopword`` set and ``stemmer`` object.

    Parameters
    ----------
    text : any
        Value to clean; it is converted with ``str()`` first.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces (empty if nothing survives).
    """
    text = str(text).lower()
    # Raw strings keep regex escapes such as \S and \w out of Python's own
    # string-escape processing.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Tags become a space (not '') so "good.<br />bad" does not fuse into
    # the single token "goodbad" once punctuation is stripped.
    text = re.sub(r'<.*?>+', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Same reasoning for line breaks: keep words on adjacent lines separate.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # split() without an argument collapses whitespace runs, so no empty
    # tokens reach the stemmer or the joined result.
    tokens = [stemmer.stem(word) for word in text.split() if word not in stopword]
    return " ".join(tokens)
# Replace each raw review with its cleaned form.
# NOTE(review): this overwrites the "review" column in place, so re-running
# this cell applies clean() to text that has already been cleaned.
data["review"] = data["review"].apply(clean)

In [None]:
# Cleaned review text and its sentiment labels as NumPy arrays.
x = np.array(data['review'])
y = np.array(data['sentiment'])

# Split the raw text BEFORE vectorizing so the vocabulary is learned from
# the training reviews only; fitting on the full corpus would leak
# information about the test set into the feature space.
x_train_text, x_test_text, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=42
)

# Bag-of-words counts: fit on the training text, then reuse that fitted
# vocabulary to encode the held-out reviews.
cv = CountVectorizer()
X_train = cv.fit_transform(x_train_text)
X_test = cv.transform(x_test_text)

# Modeling

## SGDClassifier

In [None]:
# Linear classifier trained with stochastic gradient descent on the
# bag-of-words training matrix. random_state is pinned because the
# optimizer shuffles the training data; without it, the fitted model and
# the metrics computed from it differ from run to run.
sgdmodel = SGDClassifier(random_state=42)
sgdmodel.fit(X_train,y_train)

In [None]:
# Predict sentiment labels for the held-out test features with the SGD model.
y_pred = sgdmodel.predict(X_test)

In [None]:
# Score the SGD predictions against the held-out labels.
accuracy = accuracy_score(y_test, y_pred)
# Precision, recall and F1 all use the same support-weighted averaging.
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Headline scores, two decimals each.
for template, score in (
    ("Accuracy: {:.2f}", accuracy),
    ("Precision: {:.2f}", precision),
    ("Recall: {:.2f}", recall),
    ("F1 Score: {:.2f}", f1),
):
    print(template.format(score))

# Detailed breakdowns, each under its own heading.
for heading, details in (
    ("\nConfusion Matrix:", conf_matrix),
    ("\nClassification Report:", class_report),
):
    print(heading)
    print(details)

Accuracy: 0.87
Precision: 0.87
Recall: 0.87
F1 Score: 0.87

Confusion Matrix:
[[4204  757]
 [ 517 4522]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.89      0.85      0.87      4961
    positive       0.86      0.90      0.88      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



## MultinomialNB

In [None]:
# Multinomial naive Bayes model, fitted on the same training matrix and
# labels as the SGD classifier so the two can be compared.
mnbmodel = MultinomialNB()
mnbmodel.fit(X_train,y_train)

In [None]:
# Predict sentiment labels for the held-out test features with the naive Bayes model.
ypred2 = mnbmodel.predict(X_test)

In [None]:
# Score the naive Bayes predictions against the held-out labels.
weighted = {"average": "weighted"}  # shared averaging mode for the three scores below
accuracy = accuracy_score(y_test, ypred2)
precision = precision_score(y_test, ypred2, **weighted)
recall = recall_score(y_test, ypred2, **weighted)
f1 = f1_score(y_test, ypred2, **weighted)
conf_matrix = confusion_matrix(y_test, ypred2)
class_report = classification_report(y_test, ypred2)

# Headline scores, one per line, two decimals each.
summary_lines = [
    "Accuracy: {:.2f}".format(accuracy),
    "Precision: {:.2f}".format(precision),
    "Recall: {:.2f}".format(recall),
    "F1 Score: {:.2f}".format(f1),
]
print("\n".join(summary_lines))

# Heading and body are emitted on separate lines via sep="\n".
print("\nConfusion Matrix:", conf_matrix, sep="\n")
print("\nClassification Report:", class_report, sep="\n")

Accuracy: 0.85
Precision: 0.85
Recall: 0.85
F1 Score: 0.85

Confusion Matrix:
[[4320  641]
 [ 821 4218]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.84      0.87      0.86      4961
    positive       0.87      0.84      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



After comparing the two models, we decided to use SGDClassifier, which achieved the higher accuracy (0.87 vs. 0.85 for MultinomialNB).

In [None]:
# Classify a review typed in by the user with the selected SGD model.
newdata = input('Enter new sentyment: ')
# Run the input through the same clean() used on the training text so its
# tokens (lowercased, stop words removed, stemmed) match the vocabulary the
# vectorizer was fitted on. The result gets its own name rather than
# reusing `data`, which holds the review DataFrame needed by earlier cells.
new_features = cv.transform([clean(newdata)])
# The sparse matrix is passed as-is; the model was fitted on sparse input.
output = sgdmodel.predict(new_features)
print(output)

Enter new sentyment:  really don't like the movie. it is so bad
['negative']
