In [170]:
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import pandas as pd
import nltk
import string
import re
import seaborn as sns
import matplotlib.pyplot as plt

!pip install transformers

import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch


# Download the NLTK data packages requested by this notebook. Each call is a
# network/disk side effect; the last call is also the cell's final expression.
nltk.download('punkt')      # tokenizer models
nltk.download('stopwords')  # stop-word lists
nltk.download('wordnet')    # WordNet data
nltk.download('omw-1.4')    # Open Multilingual WordNet data

## Load data

In [None]:
# Load the IMDB review dataset from the current working directory.
df = pd.read_csv(filepath_or_buffer="IMDB Dataset.csv")

**Check the head of df**

In [132]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [133]:
# Preview the last five rows of the loaded frame.
df.tail(n=5)

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [134]:
# Class balance check: number of reviews per sentiment label.
df.loc[:, 'sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

## Data Split


In [135]:
# Separate the label column from the remaining (feature) columns.
y = df['sentiment']
X = df.drop(columns=['sentiment'])

#### Train Test Validation Split

In [136]:
# Partition the frame by label so that each class can be split on its own
# in the next cell.
df_pos = df.loc[df['sentiment'].eq('positive')]
df_neg = df.loc[df['sentiment'].eq('negative')]

# For each class: features are every column except the label, labels are the
# 'sentiment' column alone.
Xp, yp = df_pos.drop('sentiment', axis='columns'), df_pos['sentiment']
Xn, yn = df_neg.drop('sentiment', axis='columns'), df_neg['sentiment']

In [137]:
def _train_val_test_split(features, labels, seed=123):
    """Split one class into train / validation / test parts.

    A 20% test set is held out first; 12.5% of the remaining rows (i.e. 10%
    of the input) then become the validation set. Both calls use the same
    ``random_state``.

    Returns:
        (X_train, X_val, X_test, y_train, y_val, y_test)
    """
    X_rest, X_te, y_rest, y_te = train_test_split(
        features, labels, test_size=0.2, random_state=seed
    )
    X_tr, X_va, y_tr, y_va = train_test_split(
        X_rest, y_rest, test_size=0.125, random_state=seed
    )
    return X_tr, X_va, X_te, y_tr, y_va, y_te


# Split each class separately so every subset keeps the same class balance.
Xp_train, Xp_val, Xp_test, yp_train, yp_val, yp_test = _train_val_test_split(Xp, yp)
Xn_train, Xn_val, Xn_test, yn_train, yn_val, yn_test = _train_val_test_split(Xn, yn)

# Recombine the per-class pieces (positives first, then negatives; not shuffled).
X_train, y_train = pd.concat([Xp_train, Xn_train]), pd.concat([yp_train, yn_train])
x_val, y_val = pd.concat([Xp_val, Xn_val]), pd.concat([yp_val, yn_val])
X_test, y_test = pd.concat([Xp_test, Xn_test]), pd.concat([yp_test, yn_test])

# Report the label distribution of each subset.
for split_labels in (y_train, y_val, y_test):
    print(split_labels.value_counts())

print(x_val)

positive    17500
negative    17500
Name: sentiment, dtype: int64
positive    2500
negative    2500
Name: sentiment, dtype: int64
positive    5000
negative    5000
Name: sentiment, dtype: int64
                                                  review
22894  Anna (Ursula Andress) is brought in as an offi...
18574  THE SEA INSIDE a film by Alejandro Amenabar.<b...
4777   "Nuovomondo" was a great experience. Many film...
4056   It was hard for me to believe all of the negat...
21054  endearing tale........ voted ten against all a...
...                                                  ...
27591  Ron Howard and his "editors" only had one job ...
19521  This is a pale imitation of 'Officer and a Gen...
38950  All I can really say is that I'm glad that I w...
49564  They say David Duchovny took six days to write...
22069  i'm ask... what a f*** are whit the real-TV ne...

[5000 rows x 1 columns]


## Pre-processing

In [138]:
# Show the raw training reviews before any text normalisation.
print(X_train.loc[:, 'review'])

26383    I own this movie. I've seen it over 20 times a...
15271    Good movie, all elements of a good movie was t...
47179    This is a gem, a real piece of Americana for a...
2643     A group of teens that have broken into a huge ...
15601    This was a highly original decent movie, and a...
                               ...                        
21863    This film is bad, yes, but had the producers u...
31280    Look,I'm reading and reading this comments and...
4963     First, the current IMDb plot description seems...
18631    Movies like this make me wonder what modern ho...
47455    I do not believe all the praise for this movie...
Name: review, Length: 35000, dtype: object


#### Lowercasing

In [140]:
# Lower-case every training review, overwriting the 'review' column in place.
# NOTE(review): only X_train is lower-cased in this cell; x_val and X_test are
# left untouched here.
X_train['review'] = X_train['review'].str.lower()
X_train['review']

26383    i own this movie. i've seen it over 20 times a...
15271    good movie, all elements of a good movie was t...
47179    this is a gem, a real piece of americana for a...
2643     a group of teens that have broken into a huge ...
15601    this was a highly original decent movie, and a...
                               ...                        
21863    this film is bad, yes, but had the producers u...
31280    look,i'm reading and reading this comments and...
4963     first, the current imdb plot description seems...
18631    movies like this make me wonder what modern ho...
47455    i do not believe all the praise for this movie...
Name: review, Length: 35000, dtype: object

#### Tokenizing

In [107]:
# Word-tokenize each training review: `tokens` holds one list of token strings
# per review. Only word-level tokens are consumed by the following cells, so
# no separate sentence-tokenizing pass is run.
tokens = [nltk.word_tokenize(review_text) for review_text in X_train['review']]
tokens

[['it',
  'would',
  'be',
  'something',
  'to',
  'try',
  'and',
  'tell',
  'someone',
  'what',
  'fata',
  'morgana',
  'is',
  'very',
  'simply',
  'about',
  '.',
  'or',
  ',',
  'maybe',
  'it',
  'is',
  "n't",
  ':',
  'herzog',
  'goes',
  'to',
  'the',
  'sahara',
  'desert',
  'and',
  'nearby',
  'villages',
  'to',
  'film',
  'assorted',
  'landscapes',
  'and',
  'the',
  'locals',
  '.',
  'but',
  'this',
  'is',
  'just',
  'the',
  'broadest',
  'stroke',
  '.',
  'it',
  "'s",
  'a',
  'feat',
  'that',
  'you',
  'either',
  'surrender',
  'yourself',
  'to',
  ',',
  'or',
  'you',
  'do',
  "n't",
  '.',
  'he',
  'gets',
  'into',
  'the',
  'form',
  'of',
  'the',
  'world',
  'around',
  'him',
  'entirely',
  ',',
  'without',
  'a',
  'story',
  ',',
  'bound',
  'only',
  'to',
  'certain',
  'aspects',
  'of',
  'written',
  'poetry',
  ',',
  'as',
  'his',
  'camera',
  '(',
  'shooting',
  'on',
  'supposedly',
  'discarded',
  'film',
  'stock',

#### Removing Punctuation

In [108]:
# Character class matching any single character from string.punctuation.
regex = re.compile(f'[{re.escape(string.punctuation)}]')

# Strip punctuation from every token of every review and keep only the
# non-empty remainders. The result is a single flat list, so review
# boundaries are not preserved past this point.
res = [
    stripped
    for review_tokens in tokens
    for stripped in (regex.sub('', raw_token) for raw_token in review_tokens)
    if stripped
]
res

['it',
 'would',
 'be',
 'something',
 'to',
 'try',
 'and',
 'tell',
 'someone',
 'what',
 'fata',
 'morgana',
 'is',
 'very',
 'simply',
 'about',
 'or',
 'maybe',
 'it',
 'is',
 'nt',
 'herzog',
 'goes',
 'to',
 'the',
 'sahara',
 'desert',
 'and',
 'nearby',
 'villages',
 'to',
 'film',
 'assorted',
 'landscapes',
 'and',
 'the',
 'locals',
 'but',
 'this',
 'is',
 'just',
 'the',
 'broadest',
 'stroke',
 'it',
 's',
 'a',
 'feat',
 'that',
 'you',
 'either',
 'surrender',
 'yourself',
 'to',
 'or',
 'you',
 'do',
 'nt',
 'he',
 'gets',
 'into',
 'the',
 'form',
 'of',
 'the',
 'world',
 'around',
 'him',
 'entirely',
 'without',
 'a',
 'story',
 'bound',
 'only',
 'to',
 'certain',
 'aspects',
 'of',
 'written',
 'poetry',
 'as',
 'his',
 'camera',
 'shooting',
 'on',
 'supposedly',
 'discarded',
 'film',
 'stock',
 'wanders',
 'like',
 'in',
 'a',
 'pure',
 'travelogue',
 'one',
 'might',
 'even',
 'jump',
 'to',
 'that',
 'easy',
 'conclusion',
 'as',
 'he',
 'puts',
 'up',
 'th

#### Removing Stop Words

In [109]:
# NLTK's English stop-word list, extended with 'via'.
stop_words = stopwords.words('english')
stop_words.append('via')

# `res` contains every token of the training corpus, so test membership
# against a set (constant-time lookup) instead of scanning the list for each
# token. The filtered result is identical.
stop_word_set = set(stop_words)
words = [token for token in res if token not in stop_word_set]
words

['would',
 'something',
 'try',
 'tell',
 'someone',
 'fata',
 'morgana',
 'simply',
 'maybe',
 'nt',
 'herzog',
 'goes',
 'sahara',
 'desert',
 'nearby',
 'villages',
 'film',
 'assorted',
 'landscapes',
 'locals',
 'broadest',
 'stroke',
 'feat',
 'either',
 'surrender',
 'nt',
 'gets',
 'form',
 'world',
 'around',
 'entirely',
 'without',
 'story',
 'bound',
 'certain',
 'aspects',
 'written',
 'poetry',
 'camera',
 'shooting',
 'supposedly',
 'discarded',
 'film',
 'stock',
 'wanders',
 'like',
 'pure',
 'travelogue',
 'one',
 'might',
 'even',
 'jump',
 'easy',
 'conclusion',
 'puts',
 'immense',
 'landscapes',
 'moving',
 'rough',
 'civilized',
 'culture',
 'though',
 'actual',
 'normal',
 'culture',
 'point',
 'levels',
 'abstract',
 'able',
 'convey',
 'properly',
 'sometimes',
 'takes',
 'get',
 'along',
 'close',
 'purity',
 'creation',
 'section',
 'purity',
 'parts',
 'manipulated',
 'either',
 'nature',
 'brokendown',
 'machines',
 'soon',
 'narration',
 'readings',
 'pop

#### Removing Links

In [110]:
# Raw string: '\S' is not a valid Python string escape, so a non-raw literal
# triggers an invalid-escape-sequence warning on newer Python versions.
regex = re.compile(r'http\S+')

# Drop tokens that start with 'tc', remove everything from 'http' to the end
# of each remaining token, and keep only tokens that are still non-empty.
# NOTE(review): the 'tc' prefix filter also discards ordinary words beginning
# with "tc"; presumably a leftover short-link filter — confirm it is wanted.
tokens_without_links = []
for candidate in words:
    if candidate.startswith('tc'):
        continue
    cleaned = regex.sub('', candidate)  # substitute once per token
    if cleaned:
        tokens_without_links.append(cleaned)
tokens_without_links

['would',
 'something',
 'try',
 'tell',
 'someone',
 'fata',
 'morgana',
 'simply',
 'maybe',
 'nt',
 'herzog',
 'goes',
 'sahara',
 'desert',
 'nearby',
 'villages',
 'film',
 'assorted',
 'landscapes',
 'locals',
 'broadest',
 'stroke',
 'feat',
 'either',
 'surrender',
 'nt',
 'gets',
 'form',
 'world',
 'around',
 'entirely',
 'without',
 'story',
 'bound',
 'certain',
 'aspects',
 'written',
 'poetry',
 'camera',
 'shooting',
 'supposedly',
 'discarded',
 'film',
 'stock',
 'wanders',
 'like',
 'pure',
 'travelogue',
 'one',
 'might',
 'even',
 'jump',
 'easy',
 'conclusion',
 'puts',
 'immense',
 'landscapes',
 'moving',
 'rough',
 'civilized',
 'culture',
 'though',
 'actual',
 'normal',
 'culture',
 'point',
 'levels',
 'abstract',
 'able',
 'convey',
 'properly',
 'sometimes',
 'takes',
 'get',
 'along',
 'close',
 'purity',
 'creation',
 'section',
 'purity',
 'parts',
 'manipulated',
 'either',
 'nature',
 'brokendown',
 'machines',
 'soon',
 'narration',
 'readings',
 'pop

#### Stemming

In [111]:
# Reduce every remaining token to its Porter stem.
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, tokens_without_links))
stemmed_words

['would',
 'someth',
 'tri',
 'tell',
 'someon',
 'fata',
 'morgana',
 'simpli',
 'mayb',
 'nt',
 'herzog',
 'goe',
 'sahara',
 'desert',
 'nearbi',
 'villag',
 'film',
 'assort',
 'landscap',
 'local',
 'broadest',
 'stroke',
 'feat',
 'either',
 'surrend',
 'nt',
 'get',
 'form',
 'world',
 'around',
 'entir',
 'without',
 'stori',
 'bound',
 'certain',
 'aspect',
 'written',
 'poetri',
 'camera',
 'shoot',
 'supposedli',
 'discard',
 'film',
 'stock',
 'wander',
 'like',
 'pure',
 'travelogu',
 'one',
 'might',
 'even',
 'jump',
 'easi',
 'conclus',
 'put',
 'immens',
 'landscap',
 'move',
 'rough',
 'civil',
 'cultur',
 'though',
 'actual',
 'normal',
 'cultur',
 'point',
 'level',
 'abstract',
 'abl',
 'convey',
 'properli',
 'sometim',
 'take',
 'get',
 'along',
 'close',
 'puriti',
 'creation',
 'section',
 'puriti',
 'part',
 'manipul',
 'either',
 'natur',
 'brokendown',
 'machin',
 'soon',
 'narrat',
 'read',
 'popol',
 'vuh',
 'way',
 'music',
 'film',
 'gradual',
 'process'

#### Lemmatizing

In [112]:
# Run the WordNet lemmatizer (default part of speech) over the stemmed tokens.
# NOTE(review): the input here is the already-stemmed list; lemmatizing the
# unstemmed tokens may be what was intended — confirm.
lemmatizer = WordNetLemmatizer()
lemmatized_words = list(map(lemmatizer.lemmatize, stemmed_words))
lemmatized_words

['would',
 'someth',
 'tri',
 'tell',
 'someon',
 'fata',
 'morgana',
 'simpli',
 'mayb',
 'nt',
 'herzog',
 'goe',
 'sahara',
 'desert',
 'nearbi',
 'villag',
 'film',
 'assort',
 'landscap',
 'local',
 'broadest',
 'stroke',
 'feat',
 'either',
 'surrend',
 'nt',
 'get',
 'form',
 'world',
 'around',
 'entir',
 'without',
 'stori',
 'bound',
 'certain',
 'aspect',
 'written',
 'poetri',
 'camera',
 'shoot',
 'supposedli',
 'discard',
 'film',
 'stock',
 'wander',
 'like',
 'pure',
 'travelogu',
 'one',
 'might',
 'even',
 'jump',
 'easi',
 'conclus',
 'put',
 'immens',
 'landscap',
 'move',
 'rough',
 'civil',
 'cultur',
 'though',
 'actual',
 'normal',
 'cultur',
 'point',
 'level',
 'abstract',
 'abl',
 'convey',
 'properli',
 'sometim',
 'take',
 'get',
 'along',
 'close',
 'puriti',
 'creation',
 'section',
 'puriti',
 'part',
 'manipul',
 'either',
 'natur',
 'brokendown',
 'machin',
 'soon',
 'narrat',
 'read',
 'popol',
 'vuh',
 'way',
 'music',
 'film',
 'gradual',
 'process'

## Classification Using BERT

In [None]:
# Load the WordPiece tokenizer that matches the pretrained BERT checkpoint.
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Sample text for inspecting the tokenizer output. It is taken from the
# training reviews so the cell runs on a fresh kernel without relying on a
# variable defined outside the notebook's top-to-bottom flow.
sample_txt = X_train['review'].iloc[0]

# NOTE(review): this rebinds `tokens`, which earlier held the NLTK tokens of
# every training review.
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)