In [136]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from string import punctuation, digits
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score, classification_report

import numpy as np

In [2]:
train = pd.read_csv("train.csv", sep='|')
test = pd.read_csv("test_review.csv", sep='|')

In [3]:
train.head()

Unnamed: 0,overall,reviewText
0,0,Entertaining enough for those who don't think ...
1,1,I bought it yesterday havent started watching ...
2,1,This movie tells the story of three kids who g...
3,1,You wanna know what its like for a Black perso...
4,1,Warner Archive has finally released an epic fi...


In [5]:
train.overall.value_counts()

1    25000
0    15000
Name: overall, dtype: int64

In [34]:
X = train.reviewText
y = train.overall

In [36]:
y.head()

0    0
1    1
2    1
3    1
4    1
Name: overall, dtype: int64

In [6]:
test.head()

Unnamed: 0,index,reviewText
0,0,So many reviewers have already given you the p...
1,1,The film opens up with an ideal family. When t...
2,2,Why does everyone like this stupid movie so mu...
3,3,Fair watch. Unlike some shows it got better wi...
4,4,I thought I was buying the DVD of the Season 4...


In [37]:
X_test = test.reviewText

In [69]:
type(X_test)

pandas.core.series.Series

In [67]:
X_test.shape

(22500,)

In [177]:
# prepare punctuation and digits list for removal
translator = str.maketrans('', '', punctuation + digits)

# basic preprocessing:
def clean_data(data):
    processed = data.str.replace('\n', '').str.replace('\r', '').str.replace('\t', '')
    processed = processed.str.replace("n't", " not").str.replace("'re", " are").str.replace("'s", " s")
    processed = processed.str.replace("'ve", " have").str.replace("'ll", " will").str.replace("'d", " d")
    processed = processed.str.translate(translator).str.strip().str.lower()
    return processed

In [178]:
X_train = clean_data(X)
X_test = clean_data(X_test)

In [179]:
X_train.head()

0    entertaining enough for those who do not think too muchthere i said it  im sure i will get the u...
1    i bought it yesterday havent started watching it yet but im going to  another great season and  ...
2    this movie tells the story of three kids who grow up in a slum area of rio de janeiro hopefully ...
3    you wanna know what its like for a black person as a kid check this out its a sign of the times ...
4    warner archive has finally released an epic film which has gotten sporadic showings on turner cl...
Name: reviewText, dtype: object

In [180]:
X_train, X_my_test, y_train, y_my_test = train_test_split(X_train, y,test_size=0.2)

In [28]:
pipeline_lr = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1,2))),
                     ('clf_lr', LogisticRegression(class_weight='balanced'))])

# find hyperparameters

In [89]:
params_lr = dict(clf_lr__C=[0.01, 0.1, 1, 10])
grid_search_lr = GridSearchCV(pipeline_lr, param_grid=params_lr, cv=3, scoring='f1', n_jobs=3, verbose=3)
%time grid_search_lr.fit(X_train, y_train)

CPU times: user 2min 4s, sys: 3.12 s, total: 2min 7s
Wall time: 10min 41s


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
       ...ty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=3,
       param_grid={'clf_lr__C': [0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='f1', verbose=0)

In [105]:
grid_search_lr.best_score_

0.90691523595020362

# new pipeline based on optimized hyperparameters

In [189]:
pipeline_lr_optimized = Pipeline([('vectorizer', TfidfVectorizer(sublinear_tf=True, ngram_range=(1,2))),
    ('clf_lr', LogisticRegression(C=5, class_weight='balanced'))])

In [190]:
%%time
pipeline_lr_optimized.fit(X_train, y_train)

CPU times: user 54.2 s, sys: 8.12 s, total: 1min 2s
Wall time: 52.9 s


Pipeline(steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=Tr...ty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

# calculate cross_val_score on "test" data

In [191]:
%%time
y_predicted_lr_optimized = pipeline_lr_optimized.predict(X_my_test)
print(classification_report(y_my_test, y_predicted_lr_optimized, digits=10))

             precision    recall  f1-score   support

          0  0.8450426789 0.8623115578 0.8535897861      2985
          1  0.9170367380 0.9058823529 0.9114254188      5015

avg / total  0.8901739547 0.8896250000 0.8898454983      8000

CPU times: user 4.18 s, sys: 8 ms, total: 4.18 s
Wall time: 4.18 s


In [80]:
scores

array([ 0.91372852,  0.91063476,  0.91173503])

In [81]:
scores.mean()

0.91203276957636892

# 0.8993747350

# look into incorrectly classified stuff

In [192]:
for idx, value in enumerate(zip(y_predicted_lr_optimized, y)):
    if (value[0] != value[1]) and (idx < 10):
        print (X_my_test.iloc[idx])
        print ("predicted {}".format(value[0]))
        print ("true {}".format(value[1]))
        print ("\n")


i did not expect to love this movie i like kevin james but i was not excited about this movie based off the tv trailers i d seen honestly the trailers did not do it justice it really is funny and so sweet it had me laughing and teared up in all the right moments the cast is great the story is solid and the fight scenes were done very well  as a fan of mma i was not disappointed henry winkler is genius in this and kevin james probably does his best work to date i cannot even put it into words how much i loved this movie other than im buying the dvd immediately to add to my collection  so you should really download it or buy it too  you likely wo not regret it
predicted 1
true 0


a friend bought this film out of curiosity watched it and was going to toss it but first asked me if i wanted to see it so i borrowed it i eventually tossed it my first reaction was that it was laughable next reaction it begs for a new flm rating  pwp  or  porn with plot one reviewer here asked where they could

# predict results and submit

In [193]:
pipeline_lr_optimized.fit(clean_data(X), y)

Pipeline(steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=Tr...ty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [194]:
test["overall"] = pipeline_lr_optimized.predict(X_test)

In [195]:
pd.options.display.max_colwidth = 100
test.head()

Unnamed: 0,index,reviewText,overall
0,0,"So many reviewers have already given you the premise of this movie, so I will keep my review bri...",1
1,1,The film opens up with an ideal family. When the father dies in an era where seat belts are opti...,1
2,2,Why does everyone like this stupid movie so much? Why can't everyone just be called people inst...,0
3,3,Fair watch. Unlike some shows it got better with each season. Diana is great. Bad girl you love ...,1
4,4,I thought I was buying the DVD of the Season 4 of Downton Abbey... I did not read it clearly.......,0


In [196]:
for ad in range(5, 10):
    print(test.values[ad])

[5
 'I turned it off when Artie is in the filthy, nasty public restroom singing his grandson into completion of the task at hand. During the 58 minutes I endured, I never cracked a smile, much less laughed. LOSER MOVIE. $5 wasted.'
 0]
[6
 "What television series has maintained quality as consistantly as Midsomer Murders? There might be disagreement as to which episodes are the best but who cares? The best series have characters you care about and Midsomer has that. Then they add interesting characters to react to. Set 17 continues all the fine traditions.Fortunately when the final episodes reach the U.S. there will be enough to go back to Badger's Drift and start over. I am of the age where I can't remember the murderers that far back so I will have fun all over again."
 1]
[7
 'I cannot say anything about the quality of the product as I have not received it from the sender yet.'
 1]
[8
 'When DeMornay goes off in a bathroom stall and beats the heck out of it....I get scared!  This mo

In [197]:
test[["index","overall"]].to_csv("zibrov.csv", sep="|")

In [None]:
#result on test – 0.7988