https://www.cs.cornell.edu/people/pabo/movie-review-data/

In [1]:
import pandas as pd
import numpy as np 
import nltk
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

from wordcloud import WordCloud
import matplotlib.pyplot as plt, seaborn as sb
import os
from glob import glob

# nltk.download('stopwords')
# nltk.download('wordnet')
import warnings
warnings.filterwarnings("ignore")

# !pip install swifter
import swifter

In [None]:
# !pip install nltk

In [6]:
pos_files = glob('./Data/txt_sentoken/pos/*')
neg_files = glob('./Data/txt_sentoken/neg/*')

def read_txt(FilePath):
    with open(FilePath,'r') as fp:
        txt = fp.read()
    return txt

POS_TXTS = [read_txt(i) for i in pos_files]
NEG_TXTS = [read_txt(i) for i in neg_files]

In [7]:
len(POS_TXTS) , len(NEG_TXTS)

(1000, 1000)

In [None]:
pd.DataFrame()

In [10]:
Datax = pd.DataFrame(list(zip(POS_TXTS,[1]*len(POS_TXTS))) + list(zip(NEG_TXTS,[0]*len(NEG_TXTS))), columns = ['text','target'])
Datax.head()

Unnamed: 0,text,target
0,films adapted from comic books have had plenty...,1
1,every now and then a movie comes along from a ...,1
2,you've got mail works alot better than it dese...,1
3,""" jaws "" is a rare film that grabs your atten...",1
4,moviemaking is a lot like being the general ma...,1


In [11]:
Datax.target.value_counts()

1    1000
0    1000
Name: target, dtype: int64

In [18]:
## Basic Cleaning
Datax['text'] = Datax['text'].str.lower()

Datax['text'] = Datax['text'].str.replace('[^a-z]',' ',regex=True)
Datax['text'] = Datax['text'].str.replace(' +',' ',regex=True)
Datax['text'] = Datax['text'].str.strip()
Datax.iloc[0]['text']

'films adapted from comic books have had plenty of success whether they re about superheroes batman superman spawn or geared toward kids casper or the arthouse crowd ghost world but there s never really been a comic book like from hell before for starters it was created by alan moore and eddie campbell who brought the medium to a whole new level in the mid s with a part series called the watchmen to say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd the book or graphic novel if you will is over pages long and includes nearly more that consist of nothing but footnotes in other words don t dismiss this film because of its source if you can get past the whole comic book thing you might find another stumbling block in from hell s directors albert and allen hughes getting the hughes brothers to direct this seems almost as ludicrous as casting carrot top in well anything but riddle me this who bett

In [20]:
sw = stopwords.words('english')
lem = WordNetLemmatizer()
ps = nltk.stem.PorterStemmer()

In [32]:
from nltk.corpus import wordnet
def get_pos(x):
    if x.startswith('J'):
        return wordnet.ADJ
    elif x.startswith('V'):
        return wordnet.VERB
    elif x.startswith('N'):
        return wordnet.NOUN
    elif x.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
def text_process(text,stem=True):
    tokens = word_tokenize(text)
    tokens = list(filter(lambda x:x not in sw,tokens))
    if stem:
        tokens = [ps.stem(t) for t in tokens]
    else:
        tokens = [lem.lemmatize(t,get_pos(pt)) for t,pt in pos_tag(tokens)]
    
    processed_text = ' '.join(tokens)
    return processed_text

In [34]:
# text_process('I need the books', stem=False)

In [39]:
Datax['text'] = Datax['text'].swifter.apply(lambda x:text_process(x,stem=False))

Pandas Apply:   0%|          | 0/2000 [00:00<?, ?it/s]

In [42]:
POS_WORDS = ' '.join(Datax[Datax.target==1].text.values.tolist())
POS_WORDS = word_tokenize(POS_WORDS)

NEG_WORDS = ' '.join(Datax[Datax.target==0].text.values.tolist())
NEG_WORDS = word_tokenize(NEG_WORDS)

In [43]:
len(NEG_WORDS) , len(POS_WORDS)

(330448, 371976)

In [44]:
len(set(NEG_WORDS)) , len(set(POS_WORDS))

(22871, 24580)

In [46]:
nltk.FreqDist(POS_WORDS).most_common(10)

[('film', 6168),
 ('movie', 3163),
 ('one', 3156),
 ('make', 2175),
 ('character', 2064),
 ('like', 1931),
 ('see', 1788),
 ('get', 1734),
 ('time', 1576),
 ('go', 1478)]

In [47]:
nltk.FreqDist(NEG_WORDS).most_common(10)

[('film', 4980),
 ('movie', 3818),
 ('one', 2874),
 ('make', 2085),
 ('like', 2009),
 ('get', 1993),
 ('character', 1815),
 ('go', 1564),
 ('time', 1427),
 ('even', 1402)]

In [56]:
POS_DIST = pd.DataFrame(nltk.FreqDist(POS_WORDS).items(),columns = ['word','wordcount_pos'])
POS_DIST = POS_DIST.set_index('word')

NEG_DIST = pd.DataFrame(nltk.FreqDist(NEG_WORDS).items(),columns = ['word','wordcount_neg'])
NEG_DIST = NEG_DIST.set_index('word')


COM_DIST = POS_DIST.join(NEG_DIST)

In [62]:
COM_DIST = COM_DIST.fillna(1)
COM_DIST = COM_DIST[(COM_DIST.wordcount_pos+COM_DIST.wordcount_neg)>100]
COM_DIST['ratio'] = np.log(COM_DIST.wordcount_pos/COM_DIST.wordcount_neg)

COM_DIST.sort_values(by='ratio', ascending=True).head(15)

Unnamed: 0_level_0,wordcount_pos,wordcount_neg,ratio
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
godzilla,14,120.0,-2.148434
lame,15,102.0,-1.916923
waste,42,229.0,-1.696052
ridiculous,22,118.0,-1.679642
awful,21,111.0,-1.665008
dull,24,112.0,-1.540445
stupid,46,213.0,-1.532651
boring,22,99.0,-1.504077
terrible,28,115.0,-1.412728
harry,43,144.0,-1.208613


In [63]:
COM_DIST.sort_values(by='ratio', ascending=False).head(15)

Unnamed: 0_level_0,wordcount_pos,wordcount_neg,ratio
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
truman,152,11.0,2.625985
titanic,112,23.0,1.583005
cameron,133,30.0,1.489152
spielberg,102,24.0,1.446919
jackie,229,54.0,1.444738
pulp,86,21.0,1.409825
memorable,118,29.0,1.403389
toy,122,30.0,1.402824
tarantino,97,24.0,1.396657
terrific,96,24.0,1.386294



## Text Preprocessing

The text needs to be transformed to vectors so as the algorithms will be able make predictions. In this case it will be used the Term Frequency – Inverse Document Frequency (TFIDF) weight to evaluate __how important a word is to a document in a collection of documents__.

After removing __punctuation__ and __lower casing__ the words, importance of a word is determined in terms of its frequency.



### “Term Frequency – Inverse Document Frequency 

__TF-IDF__ is the product of the __TF__ and __IDF__ scores of the term.<br><br> $$\text{TF-IDF}=\frac{\text{TF}}{\text{IDF}}$$<br>

__Term Frequency :__ This summarizes how often a given word appears within a document.

$$\text{TF} = \frac{\text{Number of times the term appears in the doc}}{\text{Total number of words in the doc}}$$<br><br>
__Inverse Document Frequency:__ This downscales words that appear a lot across documents. A term has a high IDF score if it appears in a few documents. Conversely, if the term is very common among documents (i.e., “the”, “a”, “is”), the term would have a low IDF score.<br>

$$\text{IDF} = \ln\left(\frac{\text{Number of docs}}{\text{Number docs the term appears in}} \right)$$<br>

TF-IDF are word frequency scores that try to highlight words that are more interesting, e.g. frequent in a document but not across documents. The higher the TFIDF score, the rarer the term is. For instance, in a Mortgage complaint the word _mortgage_ would be mentioned fairly often. However, if we look at other complaints, _mortgage_ probably would not show up in many of them. We can infer that _mortgage_ is most probably an important word in Mortgage complaints as compared to the other products. Therefore, _mortgage_ would have a high TF-IDF score for Mortgage complaints.

TfidfVectorizer class can be initialized with the following parameters:
* __min_df__: remove the words from the vocabulary which have occurred in less than ‘min_df’ number of files.
* __max_df__: remove the words from the vocabulary which have occurred in more than _‘max_df’ * total number of files in corpus_.
* __sublinear_tf__: set to True to scale the term frequency in logarithmic scale.
* __stop_words__: remove the predefined stop words in 'english'.
* __use_idf__: weight factor must use inverse document frequency.
* __ngram_range__: (1, 2) to indicate that unigrams and bigrams will be considered
* __max_features__: maximum number of features


In [70]:
x = Datax.text.values
y = Datax.target.values

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)

In [71]:
td = TfidfVectorizer(max_features = 500)
x_train = td.fit_transform(x_train)
x_test = td.transform(x_test)

In [72]:
x_train.shape , x_test.shape

((1600, 500), (400, 500))

In [78]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()

## Train
clf.fit(x_train,y_train)

## Test
y_pred = clf.predict(x_test)

In [79]:
from sklearn.metrics import accuracy_score,classification_report

clf_report = classification_report(y_test,y_pred)
print(F'ACC : {accuracy_score(y_test,y_pred)}')
print(clf_report)

ACC : 0.7675
              precision    recall  f1-score   support

           0       0.76      0.79      0.77       200
           1       0.78      0.75      0.76       200

    accuracy                           0.77       400
   macro avg       0.77      0.77      0.77       400
weighted avg       0.77      0.77      0.77       400

