In [70]:
import numpy as np
import pandas as pd

In [94]:
df = pd.read_csv('Datasets/IMDB_Dataset.csv')
df = df[:10000]
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [95]:
df['review'][1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [96]:
df['sentiment'].value_counts()

sentiment
positive    5028
negative    4972
Name: count, dtype: int64

In [97]:
len(df)

10000

In [98]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [99]:
df.duplicated().sum()

17

In [100]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

In [101]:
# Basic Preprocessing
# Remove tags
# lowercase
# remove stopwords

In [102]:
import re
def remove_tags (text):
    return re.compile(r'<[^>]+>').sub('', text)

In [103]:
df['review'] = df['review'].apply(lambda x: remove_tags(x))
df['review'][1]

'A wonderful little production. The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well done.'

In [104]:
df['review'] = df['review'].apply(lambda x: x.lower())
df['review'][1]

'a wonderful little production. the filming technique is very unassuming- very old-time-bbc fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. the actors are extremely well chosen- michael sheen not only "has got all the polari" but he has all the voices down pat too! you can truly see the seamless editing guided by the references to williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. a masterful production about one of the great master\'s of comedy and his life. the realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. it plays on our knowledge and our senses, particularly with the scenes concerning orton and halliwell and the sets (particularly of their flat with halliwell\'s murals decorating every surface) are terribly well done.'

In [105]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
df['review'] = df['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df['review'][1]

'wonderful little production. filming technique unassuming- old-time-bbc fashion gives comforting, sometimes discomforting, sense realism entire piece. actors extremely well chosen- michael sheen "has got polari" voices pat too! truly see seamless editing guided references williams\' diary entries, well worth watching terrificly written performed piece. masterful production one great master\'s comedy life. realism really comes home little things: fantasy guard which, rather use traditional \'dream\' techniques remains solid disappears. plays knowledge senses, particularly scenes concerning orton halliwell sets (particularly flat halliwell\'s murals decorating every surface) terribly well done.'

In [106]:
X = df['review']
y = df['sentiment']

In [108]:
X

0       one reviewers mentioned watching 1 oz episode ...
1       wonderful little production. filming technique...
2       thought wonderful way spend time hot summer we...
3       basically there's family little boy (jake) thi...
4       petter mattei's "love time money" visually stu...
                              ...                        
9995    fun, entertaining movie wwii german spy (julie...
9996    give break. anyone say "good hockey movie"? kn...
9997    movie bad movie. watching endless series bad h...
9998    movie probably made entertain middle school, e...
9999    smashing film film-making. shows intense stran...
Name: review, Length: 9983, dtype: object

In [109]:
y

0       positive
1       positive
2       positive
3       negative
4       positive
          ...   
9995    positive
9996    negative
9997    negative
9998    negative
9999    positive
Name: sentiment, Length: 9983, dtype: object

In [110]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(y)
print(y,', ',len(y))

[1 1 1 ... 0 0 1] ,  9983


In [111]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [112]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(7986,) (7986,) (1997,) (1997,)


In [113]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [124]:
X_train_bow = cv.fit_transform(X_train).toarray()
X_test_bow = cv.transform(X_test).toarray()

In [125]:
print(X_train_bow.shape, X_test_bow.shape)

(7986, 48382) (1997, 48382)


In [126]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

gnb.fit(X_train_bow,y_train)

In [127]:
y_pred = gnb.predict(X_test_bow)

from sklearn.metrics import accuracy_score,confusion_matrix
accuracy_score(y_test,y_pred)

0.6339509263895844

In [128]:
confusion_matrix(y_test,y_pred)

array([[716, 265],
       [466, 550]], dtype=int64)

In [129]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test,y_pred)

0.8582874311467201

In [None]:
cv = CountVectorizer(max_features=3000)

In [131]:
X_train_bow = cv.fit_transform(X_train).toarray()
X_test_bow = cv.transform(X_test).toarray()

rf = RandomForestClassifier()

rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test,y_pred)

0.8467701552328493

In [132]:
cv = CountVectorizer(ngram_range=(1,2),max_features=5000)

X_train_bow = cv.fit_transform(X_train).toarray()
X_test_bow = cv.transform(X_test).toarray()

rf = RandomForestClassifier()

rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test,y_pred)

0.8482724086129194

In [133]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

In [135]:
X_train_tfidf = tfidf.fit_transform(X_train).toarray()
X_test_tfidf = tfidf.transform(X_test)

In [136]:
rf = RandomForestClassifier()

rf.fit(X_train_tfidf,y_train)
y_pred = rf.predict(X_test_tfidf)

accuracy_score(y_test,y_pred)

0.8532799198798198

In [137]:
df = pd.read_csv('Datasets/IMDB_Dataset.csv')
df = df[:10000]
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [138]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

In [139]:
import re
def remove_tags (text):
    return re.compile(r'<[^>]+>').sub('', text)

In [140]:
df['review'] = df['review'].apply(lambda x: remove_tags(x))
df['review'][1]
df['review'] = df['review'].apply(lambda x: x.lower())
df['review'][1]
from nltk.corpus import stopwords
stop = stopwords.words('english')
df['review'] = df['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
# df['review'][1]
# X = df['review']
# y = df['sentiment']
# from sklearn.preprocessing import LabelEncoder
# encoder = LabelEncoder()
# y = encoder.fit_transform(y)
# print(y,', ',len(y))
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [141]:
import gensim
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [142]:
complete_story = []
for doc in df['review']:
    raw_sentence = sent_tokenize(doc)
    for sent in raw_sentence:
        complete_story.append(simple_preprocess(sent))

In [144]:
model = gensim.models.Word2Vec(
    window = 10,
    min_count = 2
)

In [145]:
model.build_vocab(complete_story)

In [146]:
model.train(complete_story, total_examples=model.corpus_count, epochs=10)

(11750955, 12424280)

In [147]:
len(model.wv.index_to_key)

31845

In [148]:
def document_vector (doc):
    # remove OOV words
    doc = [word for word in doc.split() if word in model.wv.index_to_key]
    return np.mean(model.wv[doc], axis=0)

In [149]:
document_vector(df['review'].values[0])

array([-0.17618898,  0.59432995, -0.11193826,  0.14323118,  0.00098438,
       -0.32391328,  0.16502604,  0.8171667 , -0.4220515 , -0.12245898,
       -0.22702357, -0.1976225 ,  0.06246341,  0.05112233,  0.08156557,
        0.02737912, -0.01729523, -0.14920142, -0.03246663, -0.2124446 ,
        0.06619079,  0.15811004, -0.03739693, -0.25731114, -0.50885874,
        0.10593191, -0.2353422 ,  0.18804525, -0.25493976, -0.0231316 ,
        0.21920486,  0.04989646,  0.1241619 , -0.3820471 ,  0.12655595,
        0.3245479 , -0.07314532, -0.15809363, -0.14909288, -0.78981394,
        0.13707037, -0.11837059,  0.16108017, -0.1960084 ,  0.6015243 ,
       -0.07642274, -0.11371854, -0.08831155, -0.0797421 ,  0.37961861,
        0.03769771, -0.24970976, -0.18644282,  0.05408705,  0.04727628,
        0.3247071 ,  0.06981197,  0.29022694, -0.15173371,  0.14540112,
        0.22821134, -0.0560642 ,  0.15510869,  0.07803314, -0.2595126 ,
        0.06241371, -0.02643391,  0.33648688, -0.27920726,  0.25

In [150]:
from tqdm import tqdm

In [151]:
X = []
for doc in tqdm(df['review'].values):
    X.append(document_vector(doc))

100%|██████████| 9983/9983 [05:56<00:00, 28.03it/s]  


In [152]:
X = np.array(X)

In [153]:
X.shape

(9983, 100)

In [154]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(df['sentiment'])
print(y,', ',len(y))

[1 1 1 ... 0 0 1] ,  9983


In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [155]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

gnb.fit(X_train_bow,y_train)

In [156]:
y_pred = gnb.predict(X_test_bow)

from sklearn.metrics import accuracy_score,confusion_matrix
accuracy_score(y_test,y_pred)

0.8077115673510266

In [157]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

rf.fit(X_train_bow,y_train)

In [158]:
y_pred = rf.predict(X_test_bow)

from sklearn.metrics import accuracy_score,confusion_matrix
accuracy_score(y_test,y_pred)

0.8522784176264396