In [19]:
import numpy as np
import pandas as pd
import re
import sklearn
import nltk
import bs4
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
import nltk
nltk.download('wordnet')
from sklearn.metrics import f1_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
from google.colab import files
uploaded = files.upload()

Saving Part4_Dataset.csv to Part4_Dataset.csv


In [6]:
df = pd.read_csv("Part4_Dataset.csv")

In [7]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [9]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [10]:
df['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [11]:
sentiment_values=pd.get_dummies(df['sentiment'])
sentiment_values=sentiment_values.iloc[:,1].values

In [14]:
lemmatizer=WordNetLemmatizer()
corpus=[]
stop_words = set(stopwords.words('english'))
for i in range (0,len(df)):
    soup = bs4.BeautifulSoup(df ['review'][i], "html.parser")
    review=re.sub('\[[^]]*\]',' ',df['review'][i])
    review=re.sub('[^a-zA-Z]',' ',df['review'][i])
    review=review.lower()
    review= review. split()
    review=[word for word in review if not word in stop_words]
    review = [lemmatizer.lemmatize(word) for word in review]
    review=' '.join (review)
    corpus.append(review)

In [15]:
tfidfvect=TfidfVectorizer(ngram_range=(1, 3))
tfidf_review=tfidfvect.fit_transform(corpus)

In [26]:
tfidf_train,tfidf_test,sentiment_values_train,sentiment_values_test=train_test_split(tfidf_review,sentiment_values,test_size=0.20,random_state=0)

**SVC**

In [21]:
xt,xcv,yt,ycv=train_test_split(tfidf_train,sentiment_values_train,test_size=0.20,random_state=0)

In [23]:
f1_scores=[]
C=[0.05,0.5,1,10,100]
for c in C:
  inear_svc = LinearSVC(C=c, random_state=42)
  linear_svc.fit(xt, yt)
  predict = linear_svc.predict(xcv)
  x=f1_score(ycv, predict, average='macro')
  f1_scores.append(x)



In [24]:
f1_scores

[0.8984723590997649,
 0.8984723590997649,
 0.8984723590997649,
 0.8984723590997649,
 0.8984723590997649]

In [27]:
linear_svc = LinearSVC(C=0.5, random_state=42)
linear_svc.fit(tfidf_train, sentiment_values_train)

predict = linear_svc.predict(tfidf_test)

In [28]:
report=classification_report(sentiment_values_test, predict,target_names=['Negative','Positive'])
print("Classification Report: \n",report)

Classification Report: 
               precision    recall  f1-score   support

    Negative       0.91      0.89      0.90      5035
    Positive       0.89      0.91      0.90      4965

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [20]:
f1_score(sentiment_values_test, predict, average='macro')

0.8964988728727256

In [29]:
matrix=confusion_matrix(sentiment_values_test, predict)
print("Confusion Matrix: \n",matrix)

Confusion Matrix: 
 [[4466  569]
 [ 466 4499]]


In [30]:
accuracy=accuracy_score(sentiment_values_test, predict)
print("Accuracy: \n", accuracy)

Accuracy: 
 0.8965


**Naive Bayes**

In [32]:
from sklearn.naive_bayes import MultinomialNB

In [33]:
xt,xcv,yt,ycv=train_test_split(tfidf_train,sentiment_values_train,test_size=0.20,random_state=0)

In [36]:
alphas =[0.001, 0.01, 0.1, 1, 10, 100,1000]
f1_scores_cv=[]
f1_scores_tr=[]
for alphap in alphas:
  mnb = MultinomialNB(alpha=alphap)
  mnb.fit(xt, yt)
  ypred_cv =mnb.predict(xcv)
  ypred_tr =mnb.predict(xt)
  x_cv=f1_score(ycv, ypred_cv, average='macro')
  x_tr=f1_score(yt, ypred_tr, average='macro')
  f1_scores_cv.append(x_cv)
  f1_scores_tr.append(x_tr)


In [37]:
f1_scores_tr

[1.0,
 1.0,
 1.0,
 0.9974687321402244,
 0.9534062117330313,
 0.9056180802568294,
 0.7844585108914207]

In [38]:
f1_scores_cv

[0.8701135842017365,
 0.883855747275357,
 0.8953672163018251,
 0.8911249846894509,
 0.8753609294174225,
 0.8536957851443826,
 0.7456667460131402]

In [40]:
mnb=MultinomialNB(alpha=1)
mnb.fit(tfidf_train, sentiment_values_train)

MultinomialNB(alpha=1)

In [41]:
predict=mnb.predict(tfidf_test)
report=classification_report(sentiment_values_test, predict,target_names=['Negative','Positive'])
print("Classification Report: \n",report)
accuracy=accuracy_score(sentiment_values_test, predict)
print("Accuracy: \n", accuracy)

Classification Report: 
               precision    recall  f1-score   support

    Negative       0.89      0.89      0.89      5035
    Positive       0.89      0.88      0.89      4965

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

Accuracy: 
 0.8867


**Logistic** **Regression**

In [42]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()


In [None]:
logisticRegr.fit(tfidf_train, sentiment_values_train)

In [43]:
alphas =[0.001, 0.01, 0.1, 1, 10, 100,1000]
f1_scores_cv=[]
f1_scores_tr=[]
for alphap in alphas:
  lr = LogisticRegression(C=alphap)
  lr.fit(xt, yt)
  ypred_cv =lr.predict(xcv)
  ypred_tr =lr.predict(xt)
  x_cv=f1_score(ycv, ypred_cv, average='macro')
  x_tr=f1_score(yt, ypred_tr, average='macro')
  f1_scores_cv.append(x_cv)
  f1_scores_tr.append(x_tr)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [44]:
f1_scores_cv

[0.45814209162688074,
 0.7938794942243843,
 0.8428258755385385,
 0.8824431024615914,
 0.8976,
 0.9018561057668617,
 0.901732747955563]

In [45]:
f1_scores_tr

[0.477415570838235,
 0.8313436320535321,
 0.8907643576665859,
 0.9698407151925138,
 0.9999062488309485,
 1.0,
 1.0]

In [46]:
logisticRegr = LogisticRegression(C=100)
logisticRegr.fit(tfidf_train, sentiment_values_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=100)

In [47]:
predict=logisticRegr.predict(tfidf_test)
report=classification_report(sentiment_values_test, predict,target_names=['Negative','Positive'])
print("Classification Report: \n",report)
accuracy=accuracy_score(sentiment_values_test, predict)
print("Accuracy: \n", accuracy)

Classification Report: 
               precision    recall  f1-score   support

    Negative       0.91      0.89      0.90      5035
    Positive       0.89      0.91      0.90      4965

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

Accuracy: 
 0.8996


**KNN**

In [48]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
neigh = KNeighborsClassifier(n_neighbors=2)

In [49]:
ns =[1,2,3,4,5]
f1_scores_cv=[]
f1_scores_tr=[]
for n in ns:
  neigh = KNeighborsClassifier(n_neighbors=n)
  neigh.fit(xt, yt)
  ypred_cv =neigh.predict(xcv)
  ypred_tr =neigh.predict(xt)
  x_cv=f1_score(ycv, ypred_cv, average='macro')
  x_tr=f1_score(yt, ypred_tr, average='macro')
  f1_scores_cv.append(x_cv)
  f1_scores_tr.append(x_tr)


In [50]:
f1_scores_tr

[1.0,
 0.8729982389171069,
 0.8922972099770499,
 0.8522724093122265,
 0.8545563708452725]

In [51]:
f1_scores_cv

[0.7339957439319029,
 0.7220258875835933,
 0.7578590149115313,
 0.7468825015676779,
 0.7568748138572794]

In [52]:
neigh = KNeighborsClassifier(n_neighbors=5)
pred = neigh.fit(tfidf_train, sentiment_values_train)
pred = neigh.predict(tfidf_test)

In [53]:
report=classification_report(sentiment_values_test, pred,target_names=['Negative','Positive'])
print("Classification Report: \n",report)
accuracy=accuracy_score(sentiment_values_test, pred)
print("Accuracy: \n", accuracy)

Classification Report: 
               precision    recall  f1-score   support

    Negative       0.78      0.76      0.77      5035
    Positive       0.76      0.78      0.77      4965

    accuracy                           0.77     10000
   macro avg       0.77      0.77      0.77     10000
weighted avg       0.77      0.77      0.77     10000

Accuracy: 
 0.7694


**Decision Tree Classifier**

In [54]:
from sklearn.tree import DecisionTreeClassifier

In [55]:
clf = DecisionTreeClassifier(random_state=0)
clf.fit(tfidf_train, sentiment_values_train)
pred = clf.predict(tfidf_test)

In [56]:
report=classification_report(sentiment_values_test, pred,target_names=['Negative','Positive'])
print("Classification Report: \n",report)
accuracy=accuracy_score(sentiment_values_test, pred)
print("Accuracy: \n", accuracy)

Classification Report: 
               precision    recall  f1-score   support

    Negative       0.73      0.73      0.73      5035
    Positive       0.72      0.72      0.72      4965

    accuracy                           0.73     10000
   macro avg       0.73      0.73      0.73     10000
weighted avg       0.73      0.73      0.73     10000

Accuracy: 
 0.7255


Since NLP data after vectorization would have high dimension data there is high probability for finding a linear searating hyperplane hence used SVM and logistic regression

Since Naive Bayes is popular Baseline for Text classification used Naive Bayes 

For varaibility of models used instace based KNN and Decision Trees 

Best performance observed  in logreg and SVM

Researchec on various text vectorizers like BOW ,TFIDF,glov,WORD2VEC vectorizers and preprocessing  required 