In [33]:
import pandas as pd
import numpy as np
import nltk
import pickle

In [2]:
df=pd.read_csv("/content/drive/MyDrive/Data science Datasets/IMDB Dataset.csv")

In [3]:
df.shape

(50000, 2)

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df.sentiment.value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


#Process the text data

Punctuation Removal:


In [6]:
df['review_processed'] = df['review'].str.replace("[^a-zA-Z0-9]", " ", regex=True)

In [7]:
df[['review_processed', 'review']]

Unnamed: 0,review_processed,review
0,One of the other reviewers has mentioned that ...,One of the other reviewers has mentioned that ...
1,A wonderful little production br br The...,A wonderful little production. <br /><br />The...
2,I thought this was a wonderful way to spend ti...,I thought this was a wonderful way to spend ti...
3,Basically there s a family where a little boy ...,Basically there's a family where a little boy ...
4,Petter Mattei s Love in the Time of Money is...,"Petter Mattei's ""Love in the Time of Money"" is..."
...,...,...
49995,I thought this movie did a down right good job...,I thought this movie did a down right good job...
49996,Bad plot bad dialogue bad acting idiotic di...,"Bad plot, bad dialogue, bad acting, idiotic di..."
49997,I am a Catholic taught in parochial elementary...,I am a Catholic taught in parochial elementary...
49998,I m going to have to disagree with the previou...,I'm going to have to disagree with the previou...


Lower case

In [8]:
df['review_processed'] = df['review_processed'].str.lower()

Stripping extra cases

In [9]:
df['review_processed'] = df['review_processed'].str.replace("  ", " ")
df['review_processed']

Unnamed: 0,review_processed
0,one of the other reviewers has mentioned that ...
1,a wonderful little production br br the fil...
2,i thought this was a wonderful way to spend ti...
3,basically there s a family where a little boy ...
4,petter mattei s love in the time of money is a...
...,...
49995,i thought this movie did a down right good job...
49996,bad plot bad dialogue bad acting idiotic direc...
49997,i am a catholic taught in parochial elementary...
49998,i m going to have to disagree with the previou...


StopWord Removal

In [10]:
import nltk
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk import word_tokenize

stop_words = stopwords.words('english') # extracting all the stop words in english language and storing it in a variable called stop_words -> set
stop_words

# # Making custom list of words to be removed
add_words = ['movie','br','go','film','ugh','one','make','even','see','movies','get','makes','making','time','watch','character', 'would','really', 'show', 'look']

# # Adding to the list of words
stop_words.extend(add_words)

# # Function to remove stop words
def remove_stopwords(row):
    review_tokenized = word_tokenize(row)
    rev_new = " ".join([word for word in review_tokenized if word not in stop_words])
    return rev_new

# # Removing stopwords
df['review_processed'] = [remove_stopwords(r) for r in df['review_processed']]

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [11]:
df['review_processed']

Unnamed: 0,review_processed
0,reviewers mentioned watching 1 oz episode hook...
1,wonderful little production filming technique ...
2,thought wonderful way spend hot summer weekend...
3,basically family little boy jake thinks zombie...
4,petter mattei love money visually stunning mr ...
...,...
49995,thought right good job creative original first...
49996,bad plot bad dialogue bad acting idiotic direc...
49997,catholic taught parochial elementary schools n...
49998,going disagree previous comment side maltin se...


Lemmatisation

In [12]:
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# function to convert nltk tag to wordnet tag
lemmatizer = WordNetLemmatizer()

# Finds the part of speech tag
# Convert the detailed POS tag into a shallow information
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

# lemmatize sentence using pos tag
def lemmatize_sentence(sentence):
  # word tokenize -> pos tag (detailed) -> wordnet tag (shallow pos) -> lemmatizer -> root word
    #tokenize the sentence and find the POS tag for each token
    tokenised_words = nltk.word_tokenize(sentence)
    nltk_tagged = nltk.pos_tag(tokenised_words)  # output will be a list of tuples -> [(word,detailed_tag)]
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged) # output -> [(word,shallow_tag)]
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

df['review_processed'] = df['review_processed'].apply(lambda x: lemmatize_sentence(x))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


In [13]:
sentence = "batch is awesome going"
tokenised_words = word_tokenize(sentence) # list of words
nltk_tags = nltk.pos_tag(tokenised_words)
wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tags) # output -> [(word,shallow_tag)]
for word, tag in wordnet_tagged:
  if tag is None:
    continue
  print(word, tag)
  print(lemmatizer.lemmatize(word, pos = tag))
  print()

batch n
batch

is v
be

awesome a
awesome

going v
go



In [14]:
df['review_processed']

Unnamed: 0,review_processed
0,reviewer mention watch 1 oz episode hook right...
1,wonderful little production film technique una...
2,think wonderful way spend hot summer weekend s...
3,basically family little boy jake think zombie ...
4,petter mattei love money visually stun mr matt...
...,...
49995,thought right good job creative original first...
49996,bad plot bad dialogue bad act idiotic direct a...
49997,catholic teach parochial elementary school nun...
49998,go disagree previous comment side maltin secon...


#Bag of words - Encoding

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Creating matrix of top 2500 tokens
tfidf = TfidfVectorizer(max_features=3500)

X = tfidf.fit_transform(df.review_processed).toarray()
y = df.sentiment.map({'positive' : 1, 'negative' : 0}).values

featureNames = tfidf.get_feature_names_out()

X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

Split and Train

In [43]:
with open ("/content/drive/MyDrive/Data science Datasets/Nlp_tfidf.pkl", "wb") as file:
  pickle.dump(tfidf, file)

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 1)
X_train.shape, X_test.shape

((37500, 3500), (12500, 3500))

#Scaling - Not mandatory

Model

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

logistic_model = LogisticRegression()

logistic_model.fit(X_train, y_train)

logistic_model.score(X_test, y_test)

f1_score(logistic_model.predict(X_test), y_test)

0.8860276585598474

In [None]:
result=logistic_model.predict(X_test)

In [40]:
with open ("/content/drive/MyDrive/Data science Datasets/Nlp_logistic_model.pkl", "wb") as file:
  pickle.dump(logistic_model, file)

In [21]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score,classification_report

for depth in range(1,11):
  model = DecisionTreeClassifier(max_depth=depth)
  model.fit(X_train, y_train)
  cv_score = np.mean(cross_val_score(model, X_train, y_train, cv = 5))
  y_pred=model.predict(X_test)
  print(f"Depth : {depth}, Cross-Val Score: {cv_score},F1_score : {f1_score(y_test,y_pred)},Accuracy_score : {accuracy_score(y_test,y_pred)}")

Depth : 1, Cross-Val Score: 0.6541066666666666,F1_score : 0.7144624601159081,Accuracy_score : 0.6492
Depth : 2, Cross-Val Score: 0.6844,F1_score : 0.728762339668256,Accuracy_score : 0.67688
Depth : 3, Cross-Val Score: 0.6839733333333333,F1_score : 0.7287976382179281,Accuracy_score : 0.67664
Depth : 4, Cross-Val Score: 0.6995733333333334,F1_score : 0.7365989537332699,Accuracy_score : 0.68984
Depth : 5, Cross-Val Score: 0.7113866666666666,F1_score : 0.7449043478260869,Accuracy_score : 0.70664
Depth : 6, Cross-Val Score: 0.7189066666666666,F1_score : 0.7446931407942238,Accuracy_score : 0.71712
Depth : 7, Cross-Val Score: 0.7270933333333334,F1_score : 0.7519003836965178,Accuracy_score : 0.72584
Depth : 8, Cross-Val Score: 0.7355733333333332,F1_score : 0.7570637926012235,Accuracy_score : 0.73312
Depth : 9, Cross-Val Score: 0.7406666666666666,F1_score : 0.7597173144876325,Accuracy_score : 0.73888
Depth : 10, Cross-Val Score: 0.7400533333333333,F1_score : 0.7612071491356578,Accuracy_score : 0

In [41]:
#Best model
Decision_model = DecisionTreeClassifier(max_depth=9)
Decision_model.fit(X_train, y_train)
cv_score = np.mean(cross_val_score(Decision_model, X_train, y_train, cv = 5))
y_pred=Decision_model.predict(X_test)
print(f"Depth : {9}")
print("")
print(f"Cross-Val Score: {cv_score:.2f},F1_score : {f1_score(y_test,y_pred):.2f},Accuracy_score : {accuracy_score(y_test,y_pred):.2f}")
print("")
print(f"classification_report:\n {classification_report(y_test,y_pred)}")
print("")
print(f"confusion_matrix :\n {confusion_matrix(y_test,y_pred)}")

Depth : 9

Cross-Val Score: 0.74,F1_score : 0.76,Accuracy_score : 0.74

classification_report:
               precision    recall  f1-score   support

           0       0.80      0.65      0.71      6302
           1       0.70      0.83      0.76      6198

    accuracy                           0.74     12500
   macro avg       0.75      0.74      0.74     12500
weighted avg       0.75      0.74      0.74     12500


confusion_matrix :
 [[4071 2231]
 [1038 5160]]


In [42]:
with open ("/content/drive/MyDrive/Data science Datasets/Nlp_decision_tree_model.pkl", "wb") as file:
  pickle.dump(Decision_model, file)

In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score,classification_report

for estimator in range(5, 11):
  Random_model = RandomForestClassifier(n_estimators=estimator)
  Random_model.fit(X_train, y_train)
  cv_score = np.mean(cross_val_score(model, X_train, y_train, cv = 5))
  y_pred=model.predict(X_test)
  print(f"n_estimators : {estimator}")
  print(f"Cross-Val Score: {cv_score:.4f},F1_score : {f1_score(y_test,y_pred):.4f},Accuracy_score : {accuracy_score(y_test,y_pred):.4f}")


n_estimators : 5
Cross-Val Score: 0.7405,F1_score : 0.7595,Accuracy_score : 0.7384
n_estimators : 6
Cross-Val Score: 0.7401,F1_score : 0.7595,Accuracy_score : 0.7384
n_estimators : 7
Cross-Val Score: 0.7409,F1_score : 0.7595,Accuracy_score : 0.7384
n_estimators : 8
Cross-Val Score: 0.7407,F1_score : 0.7595,Accuracy_score : 0.7384
n_estimators : 9
Cross-Val Score: 0.7404,F1_score : 0.7595,Accuracy_score : 0.7384
n_estimators : 10
Cross-Val Score: 0.7407,F1_score : 0.7595,Accuracy_score : 0.7384


In [37]:
Random_model = RandomForestClassifier(n_estimators=7)
Random_model.fit(X_train, y_train)
cv_score = np.mean(cross_val_score(model, X_train, y_train, cv = 5))
y_pred=model.predict(X_test)
print(f"n_estimators : {7}, Cross-Val Score: {cv_score:.4f},F1_score : {f1_score(y_test,y_pred):.4f},Accuracy_score : {accuracy_score(y_test,y_pred):.4f}")
print(f"classification_report:\n {classification_report(y_test,y_pred)}")
print(f"confusion_matrix :\n {confusion_matrix(y_test,y_pred)}")

n_estimators : 7, Cross-Val Score: 0.7405,F1_score : 0.7595,Accuracy_score : 0.7384
classification_report:
               precision    recall  f1-score   support

           0       0.80      0.65      0.71      6302
           1       0.70      0.83      0.76      6198

    accuracy                           0.74     12500
   macro avg       0.75      0.74      0.74     12500
weighted avg       0.75      0.74      0.74     12500

confusion_matrix :
 [[4068 2234]
 [1036 5162]]


In [38]:
with open ("/content/drive/MyDrive/Data science Datasets/Nlp_random_forest_model.pkl", "wb") as file:
  pickle.dump(Random_model, file)