## Movie Review Classification

In [1]:
import numpy as np
import pandas as pd

# Load the tab-separated review corpus into a DataFrame and preview the first rows.
df = pd.read_csv('moviereviews.tsv', delimiter='\t')
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [3]:
# Number of rows (reviews) loaded
df.shape[0]

2000

In [4]:
# Count missing values per column (the output below reports 35 missing reviews).
df.isna().sum()

label      0
review    35
dtype: int64

In [6]:
# Drop every row that has a missing value in any column.
# This only catches NaN; reviews that are empty or whitespace-only strings
# are not touched by dropna.
df.dropna(axis=0, how='any', inplace=True)

# Confirm no nulls remain and show how many rows are left.
print(df.isnull().sum(), len(df), sep='\n')

label     0
review    0
dtype: int64
1965


In [13]:
## Removal of empty / whitespace-only reviews
# dropna() only removes NaN, so reviews that are "" or made up solely of
# whitespace are still present here. Stripping whitespace and comparing with ''
# (by value, via ==) matches both cases in one vectorised pass; any non-string
# entry becomes NaN after .str.strip() and therefore never matches.
is_blank = df['review'].str.strip().eq('')

# Index labels of the blank reviews
blanks = df.loc[is_blank].index.tolist()

# Rebind rather than mutate in place so the cell gives the same result when re-run.
df = df.drop(index=blanks)
len(df)


1938

### Model

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
#from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Features are the raw review texts; targets are the labels.
X, y = df['review'], df['label']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [28]:
# Linear SVM on TF-IDF features
# Both steps live in a single Pipeline, so fit() builds the TF-IDF vocabulary
# from X_train only and predict() applies that same fitted vectorizer to X_test.
# LinearSVC is given the same seed as the train/test split so that its
# solver's random shuffling is fixed and repeated runs give the same metrics.
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC(random_state=42)),
])
text_clf.fit(X_train, y_train)
y_pred = text_clf.predict(X_test)

# Metrics on the held-out test set
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[235  47]
 [ 41 259]]
              precision    recall  f1-score   support

         neg       0.85      0.83      0.84       282
         pos       0.85      0.86      0.85       300

    accuracy                           0.85       582
   macro avg       0.85      0.85      0.85       582
weighted avg       0.85      0.85      0.85       582

0.8487972508591065
