# The data

In [2]:
import pandas as pd

# Read the IMDB reviews CSV into a DataFrame; the path is relative to the
# notebook's working directory.
csv_file = "data/IMDB Dataset.csv"
reviews_df = pd.read_csv(filepath_or_buffer=csv_file)
# A bare trailing expression makes the notebook render the frame.
reviews_df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


# Data pre-processing

In [3]:
import nltk
import string

def preProcess(df, feature, new_column=None):
    """Clean, tokenize, filter and stem the text stored in ``df[feature]``.

    Every entry has its ``<br />`` tags replaced by a space, is lower-cased
    and split with ``WordPunctTokenizer``. English stop words and tokens made
    up solely of ASCII punctuation are dropped; the remaining tokens are
    stemmed with the English Snowball stemmer and joined with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. It is modified in place.
    feature : str
        Name of the column that contains the raw text (string values).
    new_column : str, optional
        Name of the column that receives the processed text. When omitted,
        the processed text overwrites ``feature``.

    Returns
    -------
    None
        The result is written to ``df[new_column]``.
    """
    # Restrict the stop words to English (the stemmer below is English too):
    # calling ``words()`` without a language loads every language's list,
    # which also discards English content words that happen to be stop words
    # elsewhere. Sets make the per-token membership checks O(1).
    stop_words = set(nltk.corpus.stopwords.words('english'))
    punctuation = set(string.punctuation)

    tokenizer = nltk.tokenize.WordPunctTokenizer()
    stemmer = nltk.SnowballStemmer('english')

    pre_processed_data = []
    for text in df[feature]:
        # Swap the tag for a space so the words on either side of a line
        # break are not glued into a single token.
        text = text.replace('<br />', ' ').lower()

        stems = [
            stemmer.stem(token)
            for token in tokenizer.tokenize(text)
            # The tokenizer emits runs of punctuation such as "..." or "!!",
            # so test every character rather than the token as a whole.
            if token not in stop_words
            and not all(char in punctuation for char in token)
        ]
        pre_processed_data.append(' '.join(stems))

    if new_column is None:
        new_column = feature
    df[new_column] = pre_processed_data

In [None]:
# Store the processed text in its own column so the raw reviews stay available
# for the comparison made in the model-training section.
preProcess(df=reviews_df, feature='review', new_column='pre-processed-review')
reviews_df

# Data visualization

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

def getWords(df, feature, target, target_cut=None):
    """Concatenate the texts of ``df[feature]`` into one space-separated string.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text and label columns. It is not modified.
    feature : str
        Name of the column whose string values are concatenated.
    target : str
        Name of the label column used for filtering.
    target_cut : optional
        When given, only rows whose ``target`` value equals ``target_cut``
        contribute to the result. When ``None``, every row is used.

    Returns
    -------
    str
        The selected texts joined by single spaces (``''`` if no row matches).
    """
    if target_cut is not None:
        # A boolean mask works for any column name and any label value; a
        # string-built ``query`` expression breaks on names containing spaces
        # or hyphens and on labels containing double quotes.
        df = df[df[target] == target_cut]

    return ' '.join(df[feature])
    
def showWordCloud(words):
    """Build a word cloud from the text in ``words`` and display it.

    Parameters
    ----------
    words : str
        Text passed to ``WordCloud.generate``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    cloud = WordCloud(
        width=800,
        height=500,
        max_font_size=110,
        collocations=False,
    ).generate(words)

    # Draw on an explicit Axes and hide the axis decorations, which carry no
    # meaning for an image.
    _, axes = plt.subplots(figsize=(14, 7))
    axes.imshow(cloud, interpolation='bilinear')
    axes.axis('off')
    plt.show()

In [None]:
# Word cloud over every review, regardless of sentiment.
all_words = getWords(df=reviews_df, feature='pre-processed-review', target='sentiment')
showWordCloud(words=all_words)

In [None]:
# Word cloud restricted to the reviews labelled "positive".
positive_words = getWords(
    df=reviews_df,
    feature='pre-processed-review',
    target='sentiment',
    target_cut='positive',
)
showWordCloud(words=positive_words)

In [None]:
# Word cloud restricted to the reviews labelled "negative".
negative_words = getWords(
    df=reviews_df,
    feature='pre-processed-review',
    target='sentiment',
    target_cut='negative',
)
showWordCloud(words=negative_words)

In [None]:
import seaborn as sns

def getFreqDist(words):
    """Count how often each token occurs in ``words``.

    Parameters
    ----------
    words : str
        Text to split with ``WordPunctTokenizer``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct token, with the token in column ``token`` and
        its number of occurrences in column ``freq``.
    """
    counts = nltk.FreqDist(nltk.tokenize.WordPunctTokenizer().tokenize(words))
    # Both columns iterate the same mapping, so tokens and counts stay aligned.
    return pd.DataFrame({
        'token': [token for token in counts],
        'freq': [counts[token] for token in counts],
    })

def showPareto(df, n=None):
    """Show a bar chart of token frequencies.

    Only the bars are drawn; no cumulative line is added.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``token`` column (x axis) and a ``freq`` column (y axis).
    n : int, optional
        When given, only the ``n`` rows with the largest ``freq`` are plotted.
        When ``None``, every row is plotted.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    top = df if n is None else df.nlargest(n=n, columns='freq')

    _, axes = plt.subplots(figsize=(10, 7))
    sns.barplot(data=top, x='token', y='freq', color='gray', ax=axes)
    plt.show()

In [None]:
# Plot the ten most frequent tokens across all pre-processed reviews.
all_words_freq_df = getFreqDist(words=all_words)
showPareto(df=all_words_freq_df, n=10)

# Model training

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

def classify(df, feature, target):
    """Train a TF-IDF + logistic-regression classifier and return its accuracy.

    The rows are split into train and test partitions with a fixed seed, the
    TF-IDF vocabulary and weights are learned from the training texts only,
    and a logistic regression is fitted on the resulting vectors.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text and label columns. It is not modified.
    feature : str
        Name of the column containing the texts to vectorize.
    target : str
        Name of the column containing the class labels.

    Returns
    -------
    float
        Value of ``model.score`` (mean accuracy) on the held-out test rows.
    """
    # Split BEFORE vectorizing: fitting TF-IDF on the full dataset would let
    # the test documents influence the vocabulary and IDF weights (data
    # leakage) and bias the reported score.
    train_text, test_text, train_target, test_target = train_test_split(
        df[feature], df[target], random_state=59
    )

    vectorizer = TfidfVectorizer(lowercase=False)
    train_data = vectorizer.fit_transform(train_text)
    # Reuse the vocabulary/IDF learned from the training texts.
    test_data = vectorizer.transform(test_text)

    model = LogisticRegression(solver='lbfgs')
    model.fit(train_data, train_target)

    return model.score(test_data, test_target)

In [None]:
# Compare test accuracy on the raw reviews against the pre-processed reviews.
print(
    "Without pre-processing:",
    classify(df=reviews_df, feature='review', target='sentiment'),
)
print(
    "Pre-processed:",
    classify(df=reviews_df, feature='pre-processed-review', target='sentiment'),
)