<a href="https://colab.research.google.com/github/adamquah/NLP-Coursework/blob/main/Sentiment_analysis_(movie).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Load Libraries

In [None]:
import os
import numpy as np
import pandas as pd
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import regex as re
from string import punctuation
import math

import nltk
nltk.download("omw-1.4")
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score


[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
data = pd.read_csv('Movies.csv')

In [None]:
data.head(10)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [None]:
data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [None]:
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

# 1.0 Data Cleaning

In [None]:
# Show reviews whose text already appeared earlier in the frame.
# duplicated() returns a boolean mask, so it can index the frame directly.
data[data['review'].duplicated()]

Unnamed: 0,review,sentiment
3537,Quite what the producers of this appalling ada...,negative
3769,My favourite police series of all time turns t...,positive
4391,"Beautiful film, pure Cassavetes style. Gena Ro...",positive
6352,If you liked the Grinch movie... go watch that...,negative
6479,I want very much to believe that the above quo...,negative
...,...,...
49912,This is an incredible piece of drama and power...,positive
49950,This was a very brief episode that appeared in...,negative
49984,Hello it is I Derrick Cannon and I welcome you...,negative
49986,This movie is a disgrace to the Major League F...,negative


In [None]:
data.drop_duplicates(subset='review', inplace=True)

In [None]:
data.describe()

Unnamed: 0,review,sentiment
count,49582,49582
unique,49582,2
top,One of the other reviewers has mentioned that ...,positive
freq,1,24884


In [None]:
def remove_punc(series):
    """Strip HTML line breaks and punctuation from one review string.

    Despite the parameter name, this receives a single string per call
    (it is used with ``Series.apply``).

    Args:
        series: Raw review text.

    Returns:
        The text with every run of ``<br>`` / ``<br/>`` / ``<br />`` tags
        (and the whitespace around it) collapsed to one space, and every
        character of ``string.punctuation`` removed.
    """
    # Remove the tags BEFORE stripping punctuation. Doing it afterwards turns
    # "word.<br /><br />Next" into "wordbr br Next": the tag text is glued to
    # the preceding word and can no longer be matched reliably.
    temp = re.sub(r'\s*(?:<br\s*/?>\s*)+', ' ', series, flags=re.IGNORECASE)
    # re.escape keeps ']', '\\', '^' and '-' literal inside the character class
    # instead of relying on their position in string.punctuation.
    temp = re.sub(f'[{re.escape(punctuation)}]', '', temp)
    # Legacy clean-ups kept from the original implementation for text whose
    # markup was already stripped down to bare "br" / "n" tokens.
    temp = re.sub(' br br ', ' ', temp)
    temp = re.sub(' n ', ' ', temp)
    return temp

data['review'] = data['review'].apply(remove_punc)

In [None]:
data[['review']].head()


Unnamed: 0,review
0,One of the other reviewers has mentioned that ...
1,A wonderful little production The filming tech...
2,I thought this was a wonderful way to spend ti...
3,Basically theres a family where a little boy J...
4,Petter Matteis Love in the Time of Money is a ...


In [None]:
def remove_stop(series):
    """Lower-case one review string and drop its stop words.

    Args:
        series: Review text; it is split on single space characters.

    Returns:
        The remaining lower-cased words joined back together with single
        spaces. Words found in wordcloud's ``STOPWORDS`` set are omitted.
    """
    kept = []
    for word in series.split(' '):
        lowered = word.lower()
        if lowered not in STOPWORDS:
            kept.append(lowered)
    return ' '.join(kept)

data['review'] = data['review'].apply(remove_stop)

In [None]:
data[['review']]

Unnamed: 0,review
0,one reviewers mentioned watching 1 oz episode ...
1,wonderful little production filming technique ...
2,thought wonderful way spend time hot summer we...
3,basically theres family little boy jake thinks...
4,petter matteis love time money visually stunni...
...,...
49995,thought movie right good job wasnt creative or...
49996,bad plot bad dialogue bad acting idiotic direc...
49997,catholic taught parochial elementary schools n...
49998,im going disagree previous comment side maltin...


# 2.0 Tokenization

In [None]:
pip install nltk



In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from nltk.tokenize import word_tokenize

def tokenize(text):
    """Split one review string into word tokens with NLTK's ``word_tokenize``.

    Args:
        text: Cleaned review text.

    Returns:
        Whatever ``word_tokenize`` produces for ``text`` (a list of token strings).
    """
    tokens = word_tokenize(text)
    return tokens

data['tokens'] = data['review'].apply(tokenize)


In [None]:
data['token_len'] = data['tokens'].apply(len)
data[['tokens','token_len']].head()

Unnamed: 0,tokens,token_len
0,"[one, reviewers, mentioned, watching, 1, oz, e...",167
1,"[wonderful, little, production, filming, techn...",84
2,"[thought, wonderful, way, spend, time, hot, su...",88
3,"[basically, theres, family, little, boy, jake,...",71
4,"[petter, matteis, love, time, money, visually,...",128


In [None]:
data.describe()

Unnamed: 0,token_len
count,49582.0
mean,120.137268
std,90.716578
min,3.0
25%,64.0
50%,89.0
75%,146.0
max,1432.0


## 2.1 Padding

In [None]:
# Pad/truncate length: the mean token count per review, rounded up.
# Read the mean straight from the column. `data.describe().values[1]` is a
# 1-element array, and handing that to math.ceil triggers NumPy's
# "conversion of an array with ndim > 0 to a scalar" deprecation warning.
MAX_LEN = math.ceil(data['token_len'].mean())
print(MAX_LEN)

121


  MAX_LEN = math.ceil(data.describe().values[1])


In [None]:
def pad_token(series, max_len=None, pad_value='<END>'):
    """Return a copy of a token list padded or truncated to a fixed length.

    Args:
        series: List of tokens for one review.
        max_len: Target length. Defaults to the notebook-level ``MAX_LEN``.
        pad_value: Filler token appended to short lists.

    Returns:
        A new list of exactly ``max_len`` tokens. The input list is never
        modified.
    """
    if max_len is None:
        max_len = MAX_LEN
    tokens = list(series)
    if len(tokens) < max_len:
        # Build a new list instead of calling series.extend(): extend mutates
        # the list object stored in the source column, so the un-padded
        # tokens would silently gain the filler entries as well.
        return tokens + [pad_value] * (max_len - len(tokens))
    return tokens[:max_len]

data['paded_tokens'] = data['tokens'].apply(pad_token)

In [None]:
print(data['paded_tokens'].values[10])

['phil', 'alien', 'one', 'quirky', 'films', 'humour', 'based', 'around', 'oddness', 'everything', 'rather', 'actual', 'punchlinesbr', 'br', 'first', 'odd', 'pretty', 'funny', 'movie', 'progressed', 'didnt', 'find', 'jokes', 'oddness', 'funny', 'anymorebr', 'br', 'low', 'budget', 'film', 'thats', 'never', 'problem', 'pretty', 'interesting', 'characters', 'eventually', 'lost', 'interestbr', 'br', 'imagine', 'film', 'appeal', 'stoner', 'currently', 'partakingbr', 'br', 'something', 'similar', 'better', 'try', 'brother', 'another', 'planet', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>

In [None]:
data[['tokens','paded_tokens']]

Unnamed: 0,tokens,paded_tokens
0,"[one, reviewers, mentioned, watching, 1, oz, e...","[one, reviewers, mentioned, watching, 1, oz, e..."
1,"[wonderful, little, production, filming, techn...","[wonderful, little, production, filming, techn..."
2,"[thought, wonderful, way, spend, time, hot, su...","[thought, wonderful, way, spend, time, hot, su..."
3,"[basically, theres, family, little, boy, jake,...","[basically, theres, family, little, boy, jake,..."
4,"[petter, matteis, love, time, money, visually,...","[petter, matteis, love, time, money, visually,..."
...,...,...
49995,"[thought, movie, right, good, job, wasnt, crea...","[thought, movie, right, good, job, wasnt, crea..."
49996,"[bad, plot, bad, dialogue, bad, acting, idioti...","[bad, plot, bad, dialogue, bad, acting, idioti..."
49997,"[catholic, taught, parochial, elementary, scho...","[catholic, taught, parochial, elementary, scho..."
49998,"[im, going, disagree, previous, comment, side,...","[im, going, disagree, previous, comment, side,..."


# 3.0 Normalization

## 3.1 Lemmatization

In [None]:
lemmatizer = WordNetLemmatizer()

In [None]:
import nltk

In [None]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
def lemma(series):
    """Lemmatize every token of one review.

    Args:
        series: Iterable of token strings.

    Returns:
        A new list holding ``lemmatizer.lemmatize(token)`` for each token,
        in the original order.
    """
    lemmatized = []
    for token in series:
        lemmatized.append(lemmatizer.lemmatize(token))
    return lemmatized

data['lemma_tokens'] = data['paded_tokens'].apply(lemma)

In [None]:
data[['tokens','lemma_tokens']]

Unnamed: 0,tokens,lemma_tokens
0,"[one, reviewers, mentioned, watching, 1, oz, e...","[one, reviewer, mentioned, watching, 1, oz, ep..."
1,"[wonderful, little, production, filming, techn...","[wonderful, little, production, filming, techn..."
2,"[thought, wonderful, way, spend, time, hot, su...","[thought, wonderful, way, spend, time, hot, su..."
3,"[basically, theres, family, little, boy, jake,...","[basically, there, family, little, boy, jake, ..."
4,"[petter, matteis, love, time, money, visually,...","[petter, matteis, love, time, money, visually,..."
...,...,...
49995,"[thought, movie, right, good, job, wasnt, crea...","[thought, movie, right, good, job, wasnt, crea..."
49996,"[bad, plot, bad, dialogue, bad, acting, idioti...","[bad, plot, bad, dialogue, bad, acting, idioti..."
49997,"[catholic, taught, parochial, elementary, scho...","[catholic, taught, parochial, elementary, scho..."
49998,"[im, going, disagree, previous, comment, side,...","[im, going, disagree, previous, comment, side,..."


## 3.2 Stemming

In [None]:
stemmer = PorterStemmer()

In [None]:
def stem(series):
    """Stem every token of one review.

    Args:
        series: Iterable of token strings.

    Returns:
        A new list holding ``stemmer.stem(token)`` for each token, in the
        original order.
    """
    return list(map(stemmer.stem, series))

data['stem_tokens'] = data['tokens'].apply(stem)

In [None]:
data[['tokens','stem_tokens']]

Unnamed: 0,tokens,stem_tokens
0,"[one, reviewers, mentioned, watching, 1, oz, e...","[one, review, mention, watch, 1, oz, episod, y..."
1,"[wonderful, little, production, filming, techn...","[wonder, littl, product, film, techniqu, unass..."
2,"[thought, wonderful, way, spend, time, hot, su...","[thought, wonder, way, spend, time, hot, summe..."
3,"[basically, theres, family, little, boy, jake,...","[basic, there, famili, littl, boy, jake, think..."
4,"[petter, matteis, love, time, money, visually,...","[petter, mattei, love, time, money, visual, st..."
...,...,...
49995,"[thought, movie, right, good, job, wasnt, crea...","[thought, movi, right, good, job, wasnt, creat..."
49996,"[bad, plot, bad, dialogue, bad, acting, idioti...","[bad, plot, bad, dialogu, bad, act, idiot, dir..."
49997,"[catholic, taught, parochial, elementary, scho...","[cathol, taught, parochi, elementari, school, ..."
49998,"[im, going, disagree, previous, comment, side,...","[im, go, disagre, previou, comment, side, malt..."


# 4.0 POS Tagging

In [None]:
import nltk

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
from nltk import pos_tag

In [None]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [None]:
def pos_t(series):
    """Part-of-speech tag the tokens of one review.

    Args:
        series: List of token strings.

    Returns:
        The result of ``nltk.pos_tag`` with the ``'universal'`` tagset:
        ``(token, tag)`` pairs in token order.
    """
    tagged = nltk.pos_tag(series, tagset='universal')
    return tagged

data['pos_tag_tokens'] = data['tokens'].apply(pos_t)

In [None]:
data[['tokens','pos_tag_tokens']]

Unnamed: 0,tokens,pos_tag_tokens
0,"[one, reviewers, mentioned, watching, 1, oz, e...","[(one, NUM), (reviewers, NOUN), (mentioned, VE..."
1,"[wonderful, little, production, filming, techn...","[(wonderful, ADJ), (little, ADJ), (production,..."
2,"[thought, wonderful, way, spend, time, hot, su...","[(thought, VERB), (wonderful, ADJ), (way, NOUN..."
3,"[basically, theres, family, little, boy, jake,...","[(basically, ADV), (theres, NOUN), (family, NO..."
4,"[petter, matteis, love, time, money, visually,...","[(petter, NOUN), (matteis, ADV), (love, ADJ), ..."
...,...,...
49995,"[thought, movie, right, good, job, wasnt, crea...","[(thought, VERB), (movie, NOUN), (right, ADV),..."
49996,"[bad, plot, bad, dialogue, bad, acting, idioti...","[(bad, ADJ), (plot, NOUN), (bad, ADJ), (dialog..."
49997,"[catholic, taught, parochial, elementary, scho...","[(catholic, ADJ), (taught, VERB), (parochial, ..."
49998,"[im, going, disagree, previous, comment, side,...","[(im, NOUN), (going, VERB), (disagree, ADJ), (..."


# 5.0 Word Embeddings