## Data Classes

In [1]:
class Review:
    def __init__(self, text, sentiment):
        self.text = text
        self.sentiment = sentiment


## Load Data

In [4]:
import pandas as pd

#loads csv file fi dataframe(pandas object ki table)
df = pd.read_csv("IMDB_DATASET.csv")
#only took 5000 from the dataset that has 50K
df = df.sample(n=5000, random_state=42) 


reviews = []

for _, row in df.iterrows():
    reviews.append(Review(row['review'],  row['sentiment']))
        
len(reviews)


5000

## Prep Data

In [5]:
from sklearn.model_selection import train_test_split

training, test = train_test_split (reviews, test_size=0.33, random_state=81)



In [6]:
train_x = [x.text for x in training]
train_y = [x.sentiment for x in training]

test_x = [x.text for x in test]
test_y = [x.sentiment for x in test]

### Bag of words vectorization

In [33]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#vectorizer = CountVectorizer()
vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)


## Classifiers

### Linear SVM

In [34]:
from sklearn import svm

clf_svm = svm.SVC(kernel=('linear'))
clf_svm.fit(train_x_vectors, train_y)
#print(test_x[0])
clf_svm.predict(test_x_vectors[0])


array(['negative'], dtype='<U8')

### Decision Tree

In [35]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)
clf_dec.predict(test_x_vectors[0])


array(['negative'], dtype='<U8')

### Naive Bayes

In [36]:
from sklearn.naive_bayes import GaussianNB

train_x_dense = train_x_vectors.toarray()
test_x_dense = test_x_vectors.toarray()

clf_gnb = GaussianNB()
clf_gnb.fit(train_x_dense, train_y)
clf_gnb.predict(test_x_dense[0].reshape(1, -1))

array(['negative'], dtype='<U8')

### Logistic Regression

In [37]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)
clf_log.predict(test_x_vectors[0])

array(['negative'], dtype='<U8')

## Evaluation

In [38]:
# Mean Accuracy
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_gnb.score(test_x_dense, test_y))
print(clf_log.score(test_x_vectors, test_y))

0.8606060606060606
0.6757575757575758
0.6139393939393939
0.8484848484848485


In [39]:
# F1 Scores
from sklearn.metrics import f1_score

f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=["positive", "negative"])
f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, labels=["positive", "negative"])
f1_score(test_y, clf_gnb.predict(test_x_dense), average=None, labels=["positive", "negative"])
f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels=["positive", "negative"])

array([0.85154394, 0.84529703])

In [31]:
test_set=["shit", "i don't know about this", "my mother agrees"]
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['negative', 'negative', 'positive'], dtype='<U8')

### Tuning our model (with Gread Search)

In [40]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel':('linear', 'rbf'), 'C': (1,4,8,16,32)}

svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)

0,1,2
,estimator,SVC()
,param_grid,"{'C': (1, ...), 'kernel': ('linear', ...)}"
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,C,4
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


## Saving model

In [48]:
import pickle

with open('./models/sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)


## Load model

In [49]:
with open('./models/sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

In [51]:
print(test_x[0])

loaded_clf.predict(test_x_vectors[0])

My wife and I thought that with this cast and director, the movie would have to be at least worth watching. We were wrong. In fact, we gave up on it after 45 minutes. The idea that Crawford, Young and Tone are British but speak with American accents was, for me, impossible to get past -- hard to believe this is England when no one talks with a British accent. There is zero chemistry between Crawford and anyone, and to echo a previous comment, the idea that Cooper and Crawford suddenly declare their love for one another without any reason is ludicrous. There is no reason to care about any of the characters, which is why we threw in the towel halfway through. I found it hard to believe that Hawks directed this, as none of the actors spoke with the trademark Hawksian rat-a-tat delivery. So save your time, and skip this one.


array(['negative'], dtype='<U8')