In [3]:
import pandas as pd
import string
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
df = pd.read_csv('aux/imdb.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df.isna().sum()

review       0
sentiment    0
dtype: int64

In [6]:
df['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [7]:
le = LabelEncoder()
df['sentiment'] = le.fit_transform(df['sentiment'])
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [9]:
def clean(text):
    text = text.lower()
    text = re.sub(r'http\S+', " ", text) 
    text = re.sub(r'#\w+', ' ', text) 
    text = re.sub(r'\d+', ' ', text) 
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'<.*?>',' ', text) 
    return text

In [10]:
text = df['review']
text.head(5)

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. <br /><br />The...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object

In [11]:
treated = text.apply(clean)
treated

0        one of the other reviewers has mentioned that ...
1        a wonderful little production br br the filmin...
2        i thought this was a wonderful way to spend ti...
3        basically theres a family where a little boy j...
4        petter matteis love in the time of money is a ...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    i am a catholic taught in parochial elementary...
49998    im going to have to disagree with the previous...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

In [12]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

In [13]:
wn_lemmatizer = WordNetLemmatizer()

In [15]:
lemmatized_text = []
for review in treated:
    lemmatized_text.append(' '.join([wn_lemmatizer.lemmatize(word) for word in review.split()]))

In [17]:
reg_tokenizer = RegexpTokenizer('\w+')

In [18]:
tokenized_text = reg_tokenizer.tokenize_sents(lemmatized_text)
tokenized_text[56]

['the',
 'hill',
 'have',
 'eye',
 'ii',
 'is',
 'what',
 'you',
 'would',
 'expect',
 'it',
 'to',
 'be',
 'and',
 'nothing',
 'more',
 'of',
 'course',
 'it',
 'not',
 'going',
 'to',
 'be',
 'an',
 'oscar',
 'nominated',
 'film',
 'it',
 'just',
 'pure',
 'entertainment',
 'which',
 'you',
 'can',
 'just',
 'lose',
 'yourself',
 'in',
 'for',
 'minutesbr',
 'br',
 'the',
 'plot',
 'is',
 'basically',
 'about',
 'a',
 'group',
 'of',
 'national',
 'guard',
 'trainee',
 'who',
 'find',
 'themselves',
 'battling',
 'against',
 'the',
 'notorious',
 'mutated',
 'hillbilly',
 'on',
 'their',
 'last',
 'day',
 'of',
 'training',
 'in',
 'the',
 'desert',
 'it',
 'just',
 'them',
 'fighting',
 'back',
 'throughout',
 'the',
 'whole',
 'film',
 'which',
 'includes',
 'a',
 'lot',
 'of',
 'violence',
 'which',
 'is',
 'basically',
 'the',
 'whole',
 'film',
 'a',
 'blood',
 'and',
 'gut',
 'are',
 'constantly',
 'flying',
 'around',
 'throughout',
 'the',
 'whole',
 'thing',
 'and',
 'also',

In [19]:
sw = stopwords.words('english')

In [20]:
clean_tokenized_tweets = [] 
for i, element in enumerate(tokenized_text):
    clean_tokenized_tweets.append(' '.join([word for word in element if word not in sw]))

In [21]:
new_df = pd.concat([pd.Series(clean_tokenized_tweets, name='review'), 
                pd.Series(df['sentiment'], name='sentiment')], 
               axis=1)
new_df.head()

Unnamed: 0,review,sentiment
0,one reviewer ha mentioned watching oz episode ...,1
1,wonderful little production br br filming tech...,1
2,thought wa wonderful way spend time hot summer...,1
3,basically family little boy jake think zombie ...,0
4,petter matteis love time money visually stunni...,1


In [22]:
cvec = CountVectorizer(ngram_range=(1, 1))
tfid = TfidfVectorizer(ngram_range=(1, 1))

In [23]:
cvec_representation = cvec.fit_transform(pd.Series(clean_tokenized_tweets))
tfid_representation = tfid.fit_transform(pd.Series(clean_tokenized_tweets))

In [24]:
""" from sklearn.ensemble import RandomForestClassifier """
from sklearn.linear_model import LogisticRegression
""" from sklearn.naive_bayes import GaussianNB
from catboost import CatBoostClassifier """

' from sklearn.naive_bayes import GaussianNB\nfrom catboost import CatBoostClassifier '

In [25]:
""" rfc = RandomForestClassifier() """
lrc = LogisticRegression(max_iter=1000)
""" gnb = GaussianNB()
cbc = CatBoostClassifier() """

' gnb = GaussianNB()\ncbc = CatBoostClassifier() '

In [26]:
""" X_train, X_test, y_train, y_test = train_test_split(cvec_representation, new_df['sentiment'], test_size=0.2, random_state=900) """

" X_train, X_test, y_train, y_test = train_test_split(cvec_representation, new_df['sentiment'], test_size=0.2, random_state=900) "

In [27]:
""" rfc.fit(X_train, y_train)
cvec_rfc_pred = rfc.predict(X_test)
cvec_rfc_accuracy = accuracy_score(y_test, cvec_rfc_pred)
cvec_rfc_accuracy """

' rfc.fit(X_train, y_train)\ncvec_rfc_pred = rfc.predict(X_test)\ncvec_rfc_accuracy = accuracy_score(y_test, cvec_rfc_pred)\ncvec_rfc_accuracy '

In [28]:
""" lrc.fit(X_train, y_train)
cvec_lrc_pred = lrc.predict(X_test)
cvec_lrc_accuracy = accuracy_score(y_test, cvec_lrc_pred)
cvec_lrc_accuracy """

' lrc.fit(X_train, y_train)\ncvec_lrc_pred = lrc.predict(X_test)\ncvec_lrc_accuracy = accuracy_score(y_test, cvec_lrc_pred)\ncvec_lrc_accuracy '

In [29]:
""" cbc.fit(X_train, y_train)
cvec_cbc_pred = cbc.predict(X_test)
cvec_cbc_accuracy = accuracy_score(y_test, cvec_cbc_pred)
cvec_cbc_accuracy """

' cbc.fit(X_train, y_train)\ncvec_cbc_pred = cbc.predict(X_test)\ncvec_cbc_accuracy = accuracy_score(y_test, cvec_cbc_pred)\ncvec_cbc_accuracy '

In [30]:
""" gnb.fit(X_train.toarray(), y_train)
cvec_gnb_pred = gnb.predict(X_test)
cvec_gnb_accuracy = accuracy_score(y_test, cvec_gnb_pred)
cvec_gnb_accuracy """

' gnb.fit(X_train.toarray(), y_train)\ncvec_gnb_pred = gnb.predict(X_test)\ncvec_gnb_accuracy = accuracy_score(y_test, cvec_gnb_pred)\ncvec_gnb_accuracy '

In [31]:
X_train, X_test, y_train, y_test = train_test_split(tfid_representation, new_df['sentiment'], test_size=0.2, random_state=900)

In [32]:
""" rfc.fit(X_train, y_train)
tfid_rfc_pred = rfc.predict(X_test)
tfid_rfc_accuracy = accuracy_score(y_test, tfid_rfc_pred)
tfid_rfc_accuracy """

' rfc.fit(X_train, y_train)\ntfid_rfc_pred = rfc.predict(X_test)\ntfid_rfc_accuracy = accuracy_score(y_test, tfid_rfc_pred)\ntfid_rfc_accuracy '

In [33]:
lrc = LogisticRegression(max_iter=1000)
lrc.fit(X_train, y_train)
tfid_lrc_pred = lrc.predict(X_test)
tfid_lrc_accuracy = accuracy_score(y_test, tfid_lrc_pred)
tfid_lrc_accuracy

0.8951

In [34]:
""" cbc.fit(X_train, y_train)
tfid_cbc_pred = cbc.predict(X_test)
tfid_cbc_accuracy = accuracy_score(y_test, tfid_cbc_pred)
tfid_cbc_accuracy """

' cbc.fit(X_train, y_train)\ntfid_cbc_pred = cbc.predict(X_test)\ntfid_cbc_accuracy = accuracy_score(y_test, tfid_cbc_pred)\ntfid_cbc_accuracy '

In [35]:
""" gnb.fit(X_train, y_train)
cvec_gnb_pred = gnb.predict(X_test)
cvec_gnb_accuracy = accuracy_score(y_test, cvec_gnb_pred)
cvec_gnb_accuracy """

' gnb.fit(X_train, y_train)\ncvec_gnb_pred = gnb.predict(X_test)\ncvec_gnb_accuracy = accuracy_score(y_test, cvec_gnb_pred)\ncvec_gnb_accuracy '

In [36]:
from joblib import dump
dump(tfid_representation, 'tfid_representation.pkl')
dump(lrc, 'lrc.pkl')
dump(tfid, 'tfid.pkl')

['tfid.pkl']

In [37]:
tfid_for_bot = tfid.fit(pd.Series(clean_tokenized_tweets))

In [38]:
dump(tfid_for_bot, 'tfid_for_bot.pkl')

['tfid_for_bot.pkl']