In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn import metrics

In [2]:
# Load the labeled movie-review dataset; the path is relative to the
# notebook's working directory.
df = pd.read_csv("IMDB_movie_reviews_labeled.csv")

In [3]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
df.shape

(10000, 2)

In [4]:
# Preview the first few rows to see what the review text and labels look like.
df.head()

Unnamed: 0,review,sentiment
0,"From the beginning of this film,with it's ""The...",negative
1,1 hour and 40 minutes of talking--boring talki...,negative
2,I watched 40 minutes and couldn't bear it any ...,negative
3,Jim Carrey is back to much the same role that ...,positive
4,This is a silly spoof of private eye thrillers...,negative


In [5]:
# Count missing values per column before building features.
df.isna().sum()

review       0
sentiment    0
dtype: int64

In [6]:
# Class balance of the target label across the whole dataset.
df["sentiment"].value_counts()

negative    5000
positive    5000
Name: sentiment, dtype: int64

In [7]:
# Features: a one-column DataFrame holding the raw review text.
X = df[["review"]]
# Target: the sentiment label as a Series.
y = df["sentiment"]

In [8]:
# Hold out 30% of the reviews for testing, stratified on the label so both
# splits keep the dataset's class balance. A fixed random_state makes the
# split (and every number derived from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

In [9]:
# Verify that the stratified split kept the classes balanced in the training set.
y_train.value_counts()

negative    3500
positive    3500
Name: sentiment, dtype: int64

In [10]:
# Plain Python list of the training review strings, ready for the vectorizer.
X_train_docs = X_train["review"].tolist()

In [11]:
# Bag-of-words vectorizer over unigrams, bigrams and trigrams, with English
# stop words removed and the vocabulary capped at 1000 entries.
# It is fitted on the training documents only.
# NOTE(review): the feature listing printed further down contains tokens such
# as 'br' and 'br br' -- presumably leftovers of HTML line-break tags in the
# raw reviews; consider stripping markup before vectorizing.
vect = CountVectorizer(
    ngram_range=(1, 3),
    stop_words="english",
    max_features=1000,
)
vect.fit(X_train_docs)

In [12]:
# Encode the training documents as a document-term count matrix using the
# vocabulary learned by `vect`.
X_train_features = vect.transform(X_train_docs)

In [13]:
# Show the matrix summary (shape, dtype, stored elements) instead of its contents.
print("X_train_features:\n{}".format(repr(X_train_features)))

X_train_features:
<7000x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 332253 stored elements in Compressed Sparse Row format>


In [14]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older installs.
# `.tolist()` keeps `feature_names` a plain list of str, so the slicing and
# printing below behave exactly as before.
if hasattr(vect, "get_feature_names_out"):
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()




In [15]:
# Inspect the learned vocabulary: its size, the first 100 entries, and an
# evenly spaced sample (every 100th entry) across the whole list.
print("Number of features: {}".format(len(feature_names)))
print("First 100 features:\n{}".format(feature_names[:100]))
print("Every 100th feature:\n{}".format(feature_names[::100]))


Number of features: 1000
First 100 features:
['10', '100', '15', '20', '30', '80', '90', 'able', 'absolutely', 'act', 'acted', 'acting', 'action', 'actor', 'actors', 'actress', 'actual', 'actually', 'add', 'admit', 'adventure', 'age', 'ago', 'agree', 'air', 'alive', 'amazing', 'america', 'american', 'animated', 'animation', 'annoying', 'anti', 'apart', 'apparently', 'appear', 'appearance', 'appears', 'appreciate', 'aren', 'art', 'aside', 'ask', 'atmosphere', 'attempt', 'attempts', 'attention', 'audience', 'available', 'average', 'avoid', 'away', 'awful', 'baby', 'background', 'bad', 'bad movie', 'badly', 'band', 'barely', 'based', 'basically', 'battle', 'beautiful', 'beauty', 'begin', 'beginning', 'begins', 'believable', 'believe', 'best', 'better', 'big', 'biggest', 'bit', 'bizarre', 'black', 'blood', 'body', 'book', 'bored', 'boring', 'box', 'boy', 'boys', 'br', 'br 10', 'br br', 'br br 10', 'br br film', 'br br movie', 'br br plot', 'br br story', 'br don', 'br film', 'br movie', 'b

In [16]:
# Linear SVM classifier. max_iter is set well above the library default so the
# solver has room to converge on the sparse count features. A fixed
# random_state keeps the solver's internal data shuffling, and therefore the
# reported scores, reproducible between runs.
lin_svc = LinearSVC(max_iter=120000, random_state=42)

In [17]:
# 5-fold cross-validation on the training features only; the test split is
# not touched here.
scores = cross_val_score(estimator=lin_svc, X=X_train_features, y=y_train, cv=5)
print("Mean cross-validation accuracy: {:.2f}".format(scores.mean()))

In [None]:
# Fit the classifier on the full training set (cross_val_score above works on
# clones and leaves `lin_svc` itself unfitted).
lin_svc.fit(X_train_features, y_train)

In [None]:
# Encode the held-out reviews with the vocabulary already fitted on the
# training data (transform only -- no refitting on test documents).
X_test_docs = X_test["review"].tolist()
X_test_features = vect.transform(X_test_docs)

In [None]:
# Predict sentiment labels for the held-out test features.
y_test_pred = lin_svc.predict(X_test_features)


In [None]:
# Fraction of test reviews whose predicted label matches the true label.
metrics.accuracy_score(y_test, y_test_pred)