In [49]:
import pandas as pd 
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import pymysql

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bandy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [50]:
db_connection = pymysql.connect(host='localhost', database='movie_reviews', user='bandytan', password='bt2102')

db_cursor = db_connection.cursor()

# movie1 = '10 Cloverfield Lane 2016'
# movie2 = '10 Things I Hate About You 1999'
# movie3 = '12 Angry Men 1957'

db_cursor.execute("SELECT * FROM review_data WHERE movie = '12 Angry Men 1957'")

table_rows = db_cursor.fetchall()

movie1 = pd.DataFrame(table_rows)

In [51]:
movie1.set_axis(['index', 'movie', 'rating', 'helpful', 'review'], axis=1, inplace=True)
movie1.isnull().values.any() #Check if data has any null values

False

In [52]:
movie1.head()

Unnamed: 0,index,movie,rating,helpful,review
0,507437,12 Angry Men 1957,1.0,764,Excellent An excellent courtroom drama with a...
1,507438,12 Angry Men 1957,1.0,590,"No bombs, no car chases but edge of the seat ..."
2,507439,12 Angry Men 1957,0.777778,508,Simple but great. '12 Angry Men' is an outsta...
3,507440,12 Angry Men 1957,1.0,363,If you only ever see one Black and White movi...
4,507441,12 Angry Men 1957,1.0,418,Should be in everyone's top ten list of great...


In [53]:
def process_review(column):
    column_processed = []
    for row in column:
        row_processed = re.sub(r'[^\w\s]', '', row) # remove punctuation
        column_processed.append(row_processed)
    return column_processed
movie1["review"] = process_review(movie1["review"])
# movie1.iloc[0,4]

In [54]:
class LemmatizeTokenizer(object):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
    def __call__(self, text):
        return [self.lemmatizer.lemmatize(word) for word in word_tokenize(text)]

In [55]:
def generate_nlp_features(df):
    vect = TfidfVectorizer( 
        tokenizer=LemmatizeTokenizer(),
        lowercase=True,
        analyzer='word', 
        ngram_range=(1,3), # unigram, bigram and trigram 
        max_features=100, # vocabulary that only consider the top max_features ordered by term frequency across the corpus
        min_df=10, # minimum word frequency required to be in model
        stop_words=stopwords.words('english') # remove stopwords
        )

    review = pd.Series(df["review"])
    tfidf_fit_review = vect.fit(review)
    tfidf_array = tfidf_fit_review.transform(review).toarray()
    tfidf_df = pd.DataFrame(tfidf_array)
    tfidf_df.columns = list(map(lambda x : "review_" + str(x), tfidf_df.columns))
    df = pd.merge(df, tfidf_df , left_index=True, right_index=True)

    feature_array = np.array(vect.get_feature_names())
    tfidf_sorting = np.argsort(tfidf_array).flatten()[::-1]
    n = 100 # display 100 most important words
    top_n = feature_array[tfidf_sorting][:n]
    tfidf_feature_names = list(zip(range(100), top_n))
    return (df, tfidf_feature_names)

In [56]:
df,  tfidf_feature_names= generate_nlp_features(movie1)
tfidf_feature_names



[(0, 'actor'),
 (1, 'make'),
 (2, 'film'),
 (3, 'doe'),
 (4, 'end'),
 (5, 'murder'),
 (6, 'performance'),
 (7, 'dont'),
 (8, 'much'),
 (9, 'every'),
 (10, 'even'),
 (11, 'case'),
 (12, 'story'),
 (13, 'good'),
 (14, 'ha'),
 (15, 'time'),
 (16, 'character'),
 (17, 'jury'),
 (18, 'movie'),
 (19, 'fonda'),
 (20, 'first'),
 (21, 'get'),
 (22, 'go'),
 (23, 'great'),
 (24, 'feel'),
 (25, 'guilty'),
 (26, 'henry'),
 (27, 'father'),
 (28, 'fact'),
 (29, 'henry fonda'),
 (30, 'evidence'),
 (31, 'young'),
 (32, 'drama'),
 (33, 'ever'),
 (34, 'best'),
 (35, '12 angry'),
 (36, '12 angry men'),
 (37, '8'),
 (38, 'accused'),
 (39, 'acting'),
 (40, 'also'),
 (41, 'angry'),
 (42, 'angry men'),
 (43, 'boy'),
 (44, 'juror'),
 (45, 'cast'),
 (46, 'classic'),
 (47, 'cobb'),
 (48, 'could'),
 (49, 'decision'),
 (50, 'dialogue'),
 (51, 'different'),
 (52, 'doubt'),
 (53, 'j'),
 (54, 'lee'),
 (55, 'juror 8'),
 (56, 'say'),
 (57, 'seen'),
 (58, 'show'),
 (59, 'sidney'),
 (60, 'simple'),
 (61, 'still'),
 (62, '

In [57]:
#Polarity is float which lies in the range of [-1,1] where 1 means positive statement and -1 means a negative statement.
df["review_polarity"] = df["review"].apply(lambda x: TextBlob(x).sentiment.polarity)  
df.head()

Unnamed: 0,index,movie,rating,helpful,review,review_0,review_1,review_2,review_3,review_4,...,review_91,review_92,review_93,review_94,review_95,review_96,review_97,review_98,review_99,review_polarity
0,507437,12 Angry Men 1957,1.0,764,Excellent An excellent courtroom drama with a...,0.073196,0.0,0.0,0.127861,0.042115,...,0.085835,0.0,0.086137,0.0,0.106556,0.073959,0.0,0.0,0.170188,0.203678
1,507438,12 Angry Men 1957,1.0,590,No bombs no car chases but edge of the seat s...,0.0,0.0,0.0,0.0,0.0,...,0.0,0.068311,0.0,0.0,0.0,0.097973,0.0,0.0,0.0,0.109524
2,507439,12 Angry Men 1957,0.777778,508,Simple but great 12 Angry Men is an outstandi...,0.085728,0.103645,0.105328,0.0,0.073988,...,0.0,0.090595,0.0,0.0,0.0624,0.064967,0.0,0.07161,0.0,0.091697
3,507440,12 Angry Men 1957,1.0,363,If you only ever see one Black and White movi...,0.0,0.0,0.0,0.0,0.0,...,0.0,0.085618,0.0,0.0,0.0,0.0,0.0,0.067676,0.070641,0.177669
4,507441,12 Angry Men 1957,1.0,418,Should be in everyones top ten list of greate...,0.084692,0.051196,0.052027,0.0,0.0,...,0.0,0.2685,0.0,0.057044,0.061646,0.0,0.0,0.0,0.0,0.279818


In [58]:
print(min(df.review_polarity))
print(max(df.review_polarity))

-1.0
1.0


In [59]:
# Export feature names as txt
with open('movie3_tfidf_feature_names.txt', 'a') as f:
    f.write(str(tfidf_feature_names))

# Export dataframe as csv
df.to_csv("movie3.csv", index=False)
