In [1]:
from sklearn.feature_extraction.text import CountVectorizer

# Single-document toy corpus used for the CountVectorizer experiments below.
# NOTE(review): several words are fused together (e.g. "dawnwas", "theopen",
# "theconductor") and end up as single vocabulary entries -- presumably spaces
# were lost where line breaks were removed when pasting; confirm this is intended.
sandbox = ["The train from Frisco was very late. It should have arrived at Hugsons siding at midnight, but it was already five oclock and the gray dawnwas breaking in the east when the little train slowly rumbled up to theopen shed that served for the station-house. As it came to a stop theconductor called out in a loud voice: At once a little girl rose from her seat and walked to the door of thecar, carrying a wicker suit-case in one hand and a round bird-cagecovered up with newspapers in the other, while a parasol was tuckedunder her arm. The conductor helped her off the car and then theengineer started his train again, so that it puffed and groaned andmoved slowly away up the track. The reason he was so late was becauseall through the night there were times when the solid earth shook andtrembled under him, and the engineer was afraid that at any moment therails might spread apart and an accident happen to his passengers. So hemoved the cars slowly and with caution.The little girl stood still to watch until the train had disappearedaround a curve; then she turned to see where she was."]
# Learn every 1- to 5-word n-gram of the toy corpus. fit() returns the
# vectorizer itself, so construction and fitting are chained.
v = CountVectorizer(ngram_range=(1, 5)).fit(sandbox)

# Mapping from each learned n-gram to its column index.
v.vocabulary_

{'the': 602,
 'train': 748,
 'from': 229,
 'frisco': 224,
 'was': 813,
 'very': 798,
 'late': 362,
 'it': 345,
 'should': 513,
 'have': 272,
 'arrived': 82,
 'at': 92,
 'hugsons': 324,
 'siding': 518,
 'midnight': 388,
 'but': 129,
 'already': 15,
 'five': 214,
 'oclock': 413,
 'and': 25,
 'gray': 247,
 'dawnwas': 184,
 'breaking': 124,
 'in': 329,
 'east': 204,
 'when': 848,
 'little': 371,
 'slowly': 523,
 'rumbled': 478,
 'up': 785,
 'to': 718,
 'theopen': 693,
 'shed': 503,
 'that': 589,
 'served': 492,
 'for': 219,
 'station': 564,
 'house': 319,
 'as': 87,
 'came': 144,
 'stop': 579,
 'theconductor': 674,
 'called': 139,
 'out': 443,
 'loud': 383,
 'voice': 803,
 'once': 428,
 'girl': 238,
 'rose': 468,
 'her': 292,
 'seat': 483,
 'walked': 808,
 'door': 194,
 'of': 418,
 'thecar': 669,
 'carrying': 154,
 'wicker': 864,
 'suit': 584,
 'case': 164,
 'one': 433,
 'hand': 262,
 'round': 473,
 'bird': 119,
 'cagecovered': 134,
 'with': 869,
 'newspapers': 403,
 'other': 438,
 'while'

In [2]:
import spacy

# Load the "en_core_web_sm" spaCy pipeline once at module level; preprocess()
# reuses this object for every call.
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Reduce ``text`` to a space-separated string of lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Stop
    words, punctuation and whitespace-only tokens are discarded; the lemma
    of every remaining token is kept in document order.

    Args:
        text: Raw text to normalise.

    Returns:
        The surviving lemmas joined by single spaces, or an empty string
        when no token survives the filter.
    """
    doc = nlp(text)

    # Whitespace tokens (runs of spaces, newlines) are neither stop words nor
    # punctuation, so without the is_space check their "lemma" would leak
    # stray whitespace into the joined output.
    kept_lemmas = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return " ".join(kept_lemmas)

# Quick sanity check of preprocess() on a single sentence.
preprocess("He walked out of the door angry; little did he know that he is walking into danger.")

'walk door angry little know walk danger'

In [3]:
# Run every document of the toy corpus through preprocess(), keeping order.
sandbox_preprocessed = list(map(preprocess, sandbox))
sandbox_preprocessed

['train Frisco late arrive hugson side midnight oclock gray dawnwas break east little train slowly rumble theopen shed serve station house come stop theconductor call loud voice little girl rise seat walk door thecar carry wicker suit case hand round bird cagecovere newspaper parasol tuckedunder arm conductor help car theengineer start train puff groan andmove slowly away track reason late becauseall night time solid earth shake andtremble engineer afraid moment therail spread apart accident happen passenger hemove car slowly caution little girl stand watch train disappearedaround curve turn']

In [4]:
# Re-learn the vocabulary (unigrams and bigrams) on the preprocessed corpus.
# fit() returns the vectorizer, so `v` is the fitted instance.
v = CountVectorizer(ngram_range=(1, 2)).fit(sandbox_preprocessed)

# Mapping from each learned n-gram to its column index.
v.vocabulary_

{'train': 148,
 'frisco': 53,
 'late': 74,
 'arrive': 12,
 'hugson': 72,
 'side': 114,
 'midnight': 82,
 'oclock': 90,
 'gray': 58,
 'dawnwas': 41,
 'break': 20,
 'east': 49,
 'little': 77,
 'slowly': 116,
 'rumble': 104,
 'theopen': 140,
 'shed': 112,
 'serve': 108,
 'station': 128,
 'house': 70,
 'come': 35,
 'stop': 130,
 'theconductor': 136,
 'call': 24,
 'loud': 80,
 'voice': 156,
 'girl': 55,
 'rise': 100,
 'seat': 106,
 'walk': 158,
 'door': 45,
 'thecar': 134,
 'carry': 29,
 'wicker': 162,
 'suit': 132,
 'case': 31,
 'hand': 62,
 'round': 102,
 'bird': 18,
 'cagecovere': 22,
 'newspaper': 86,
 'parasol': 92,
 'tuckedunder': 153,
 'arm': 10,
 'conductor': 37,
 'help': 66,
 'car': 26,
 'theengineer': 138,
 'start': 126,
 'puff': 96,
 'groan': 60,
 'andmove': 4,
 'away': 14,
 'track': 146,
 'reason': 98,
 'becauseall': 16,
 'night': 88,
 'time': 144,
 'solid': 120,
 'earth': 47,
 'shake': 110,
 'andtremble': 6,
 'engineer': 51,
 'afraid': 2,
 'moment': 84,
 'therail': 142,
 'sprea

In [5]:
# Count matrix of the preprocessed toy corpus, converted to a dense array
# for display (one row per document, one column per vocabulary entry).
v.transform(sandbox_preprocessed).toarray()

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int64)

In [6]:
import pandas as pd
# Load the labelled reviews from CSV.
# NOTE(review): absolute, machine-specific path -- consider a configurable
# data directory so the notebook runs on other machines.
df = pd.read_csv(filepath_or_buffer='C:/Users/admin/Desktop/archive/Book2.csv')

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
# Number of reviews per sentiment label in the raw data.
df.sentiment.value_counts()

negative    231
positive    208
Name: sentiment, dtype: int64

In [8]:
# Downsample every class to the size of the rarest one so the classes are
# balanced. The size is derived from the data instead of being hard-coded, so
# the cell keeps working if the CSV changes (asking sample() for more rows
# than a class contains raises ValueError).
min_sample = int(df.sentiment.value_counts().min())

# The fixed random_state keeps the drawn subsets reproducible across runs.
df_positive = df[df.sentiment=="positive"].sample(min_sample, random_state=2022)
df_negative = df[df.sentiment=="negative"].sample(min_sample, random_state=2022)

In [9]:
# Inspect the sampled positive reviews.
df_positive

Unnamed: 0,review,sentiment
48,Preston Sturgis' THE POWER AND THE GLORY was u...,positive
286,There have been many documentaries that I have...,positive
44,"This movie struck home for me. Being 29, I rem...",positive
73,I am not a golf fan by any means. On May 26 ab...,positive
235,Reese Witherspoon first outing on the big scre...,positive
...,...,...
389,"""Crossfire"" is a justifiably famous 1947 noir ...",positive
376,Doctor Mordrid is one of those rare films that...,positive
248,"Meryl Streep is such a genius. Well, at least ...",positive
370,"Of all the movies of the seventies, none captu...",positive


In [10]:
# Stack the two equally sized samples row-wise (positives first, then
# negatives); the original row labels are kept.
df_balanced = pd.concat(objs=[df_positive, df_negative], axis="index")
df_balanced

Unnamed: 0,review,sentiment
48,Preston Sturgis' THE POWER AND THE GLORY was u...,positive
286,There have been many documentaries that I have...,positive
44,"This movie struck home for me. Being 29, I rem...",positive
73,I am not a golf fan by any means. On May 26 ab...,positive
235,Reese Witherspoon first outing on the big scre...,positive
...,...,...
170,"I can't emphasize it enough, do *NOT* get this...",negative
252,Insignificant and low-brained (haha!) 80's hor...,negative
34,"I watched this film not really expecting much,...",negative
165,An American Werewolf in London had some funny ...,negative


In [11]:
# Number of reviews per sentiment label after balancing.
df_balanced.sentiment.value_counts()

positive    208
negative    208
Name: sentiment, dtype: int64

In [12]:
# Numeric encoding of the label: positive -> 1, negative -> 0.
df_balanced['binary_rep'] = df_balanced["sentiment"].map({"positive": 1, "negative": 0})

df_balanced

Unnamed: 0,review,sentiment,binary_rep
48,Preston Sturgis' THE POWER AND THE GLORY was u...,positive,1
286,There have been many documentaries that I have...,positive,1
44,"This movie struck home for me. Being 29, I rem...",positive,1
73,I am not a golf fan by any means. On May 26 ab...,positive,1
235,Reese Witherspoon first outing on the big scre...,positive,1
...,...,...,...
170,"I can't emphasize it enough, do *NOT* get this...",negative,0
252,Insignificant and low-brained (haha!) 80's hor...,negative,0
34,"I watched this film not really expecting much,...",negative,0
165,An American Werewolf in London had some funny ...,negative,0


In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the balanced raw reviews for testing. Stratifying on the
# label keeps the class proportions equal in both splits; the fixed seed makes
# the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df_balanced["review"],
    df_balanced["binary_rep"],
    test_size=0.2,
    stratify=df_balanced["binary_rep"],
    random_state=2022,
)

In [14]:
# Size of the training split, then a look at its first seven reviews.
print(x_train.shape)
x_train.head(7)

(332,)


159    This movie was not so much promoted here in Gr...
78     The few scenes that actually attempt a depicti...
352    Tell the truth Iâ€™m a bit stun to see all the...
390    AWWWW, I just love this movie to bits. Me and ...
8      Encouraged by the positive comments about this...
154    I don't think I've ever gave something a 1/10 ...
235    Reese Witherspoon first outing on the big scre...
Name: review, dtype: object

In [15]:
# Number of training labels per class.
y_train.value_counts()

1    166
0    166
Name: binary_rep, dtype: int64

In [16]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Baseline: CountVectorizer with default settings feeding a multinomial
# Naive Bayes classifier.
clf = Pipeline(steps=[
    ('myvect4bow', CountVectorizer()),
    ('Multi NB lol', MultinomialNB()),
])

# Pipeline.fit returns the fitted pipeline, so training and prediction chain.
y_pred = clf.fit(x_train, y_train).predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.64      0.86      0.73        42
           1       0.79      0.52      0.63        42

    accuracy                           0.69        84
   macro avg       0.71      0.69      0.68        84
weighted avg       0.71      0.69      0.68        84



In [17]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Same model as the baseline, but with 1- to 3-word n-gram count features.
clf = Pipeline(steps=[
    ('myvect4bow', CountVectorizer(ngram_range=(1, 3))),
    ('Multi NB lol', MultinomialNB()),
])

# Pipeline.fit returns the fitted pipeline, so training and prediction chain.
y_pred = clf.fit(x_train, y_train).predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.81      0.76        42
           1       0.78      0.69      0.73        42

    accuracy                           0.75        84
   macro avg       0.75      0.75      0.75        84
weighted avg       0.75      0.75      0.75        84



In [18]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Widen the n-gram window to 1-8 words to see whether longer phrases help.
clf = Pipeline(steps=[
    ('myvect4bow', CountVectorizer(ngram_range=(1, 8))),
    ('Multi NB lol', MultinomialNB()),
])

# Pipeline.fit returns the fitted pipeline, so training and prediction chain.
y_pred = clf.fit(x_train, y_train).predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.76      0.75        42
           1       0.76      0.74      0.75        42

    accuracy                           0.75        84
   macro avg       0.75      0.75      0.75        84
weighted avg       0.75      0.75      0.75        84



In [19]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Widest n-gram window tried: 1-12 words.
clf = Pipeline(steps=[
    ('myvect4bow', CountVectorizer(ngram_range=(1, 12))),
    ('Multi NB lol', MultinomialNB()),
])

# Pipeline.fit returns the fitted pipeline, so training and prediction chain.
y_pred = clf.fit(x_train, y_train).predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.76      0.75        42
           1       0.76      0.74      0.75        42

    accuracy                           0.75        84
   macro avg       0.75      0.75      0.75        84
weighted avg       0.75      0.75      0.75        84



In [20]:
# First five held-out reviews.
x_test[:5]

432    Days of Heaven is one of the most painfully bo...
280    I have decided to not believe what famous movi...
394    Helena Bonham Carter is the center of this mov...
73     I am not a golf fan by any means. On May 26 ab...
110    Apparently, the people that wrote the back of ...
Name: review, dtype: object

In [21]:
# True labels of the first five held-out reviews.
y_test[:5]

432    0
280    1
394    1
73     1
110    0
Name: binary_rep, dtype: int64

In [22]:
# First five predictions from the most recently fitted pipeline, for
# comparison with the true labels.
y_pred[:5]

array([0, 1, 1, 1, 0], dtype=int64)

In [23]:
# Apply preprocess() to every review (one spaCy pass per row) and store the
# result alongside the raw text.
df_balanced['preprocessed_reviews'] = df_balanced["review"].apply(preprocess)

# Only the last expression of a cell is displayed, so a single preview suffices.
df_balanced.head()

Unnamed: 0,review,sentiment,binary_rep,preprocessed_reviews
48,Preston Sturgis' THE POWER AND THE GLORY was u...,positive,1,Preston Sturgis power GLORY unseen public near...
286,There have been many documentaries that I have...,positive,1,documentary see appear law wrong fence thin bl...
44,"This movie struck home for me. Being 29, I rem...",positive,1,movie strike home 29 remember 80 father work f...
73,I am not a golf fan by any means. On May 26 ab...,positive,1,golf fan mean 26 10:30 pm movie start scene la...
235,Reese Witherspoon first outing on the big scre...,positive,1,reese Witherspoon outing big screen memorable ...


In [24]:
from sklearn.model_selection import train_test_split

# Redo the 80/20 stratified split on the preprocessed text, with the same
# seed as the split on the raw reviews.
x_train, x_test, y_train, y_test = train_test_split(
    df_balanced["preprocessed_reviews"],
    df_balanced["binary_rep"],
    test_size=0.2,
    stratify=df_balanced["binary_rep"],
    random_state=2022,
)

In [25]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# 1- to 3-word n-gram model again, this time trained on the preprocessed
# reviews produced by the latest split.
clf = Pipeline(steps=[
    ('myvect4bow', CountVectorizer(ngram_range=(1, 3))),
    ('Multi NB lol', MultinomialNB()),
])

# Pipeline.fit returns the fitted pipeline, so training and prediction chain.
y_pred = clf.fit(x_train, y_train).predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.74      0.75        42
           1       0.74      0.76      0.75        42

    accuracy                           0.75        84
   macro avg       0.75      0.75      0.75        84
weighted avg       0.75      0.75      0.75        84

