# Sentiment Analysis of IMDb Movie Reviews

## Importing Libraries


In [1]:
import os
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer


## Loading the Data 


* In this stage of loading the data, we have a folder called "aclImdb" that contains the IMDB dataset for sentiment analysis. The dataset is divided into two main folders: "train" and "test". Each of these folders further contains two subfolders, "positive" and "negative". The "positive" folder contains movie reviews with positive sentiment, while the "negative" folder contains reviews with negative sentiment.


In [2]:
import os
import pandas as pd

def load_train_test_imdb_data(data_dir):

    data = {}
    for split in ["train", "test"]:
        data[split] = []
        for sentiment in ["neg", "pos"]:
            score = 1 if sentiment == "pos" else 0

            path = os.path.join(data_dir, split, sentiment)
            file_names = os.listdir(path)
            for f_name in file_names:
                with open(os.path.join(path, f_name), "r", encoding="utf-8") as f:
                    review = f.read()
                    data[split].append([review, score])

    np.random.shuffle(data["train"])        
    data["train"] = pd.DataFrame(data["train"], columns=['text', 'sentiment'])
    print(data["train"])
    np.random.shuffle(data["test"])
    data["test"] = pd.DataFrame(data["test"], columns=['text', 'sentiment'])
    print(data["test"])
    return data["train"], data["test"]


In [3]:
train_data, test_data = load_train_test_imdb_data(
    data_dir="aclImdb/")

                                                    text  sentiment
0      Just saw the World Preem of Fido at the Toront...          1
1      So we compromised. This was a fairly charming ...          1
2      A wonderful surprise of the Spanish cinema. I ...          1
3      Psychotic transsexual Bobbi murders the patien...          1
4      The producers of this film offer to pay funera...          0
...                                                  ...        ...
24995  After watching this film last night on Sundanc...          1
24996  First things first, the female lead is too gor...          1
24997  I don't know what the previous reviewer was wa...          1
24998  OK the director remakes LOVE ACTUALLY The dire...          0
24999  This comic book style film is funny, has nicel...          1

[25000 rows x 2 columns]
                                                    text  sentiment
0      Like many a child born in the 1980's, I grew u...          1
1      The movie is go

In [11]:
train_data.head(40)

Unnamed: 0,text,sentiment
0,"Elia Kazan, one of the best theater directors ...",1
1,**Possible Spoilers Ahead**<br /><br />\tJason...,0
2,"For your own good, it would be best to disrega...",0
3,Well i do disagreed with the other comment pos...,1
4,"Yes, you guessed it. Another movie where ident...",0
5,"being a NI supporter, it's hard to objectively...",0
6,"There are bad movies, terrible movies even bor...",0
7,With part reconstruction and part direct shoot...,1
8,James Bishop (Matt Stasi) goes to a `mental il...,0
9,When I saw this movie in the theater when it c...,1


## Pre-Processing

In [9]:
import re
import nltk
from nltk.stem import PorterStemmer

def clean_text(text):
    
    # remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # remove the characters [\], ['] and ["]
    text = re.sub(r"\\", "", text)    
    text = re.sub(r"\'", "", text)    
    text = re.sub(r"\"", "", text)    
    
    # convert text to lowercase
    text = text.strip().lower()
    
    # replace punctuation characters with spaces
    filters='!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    translate_dict = dict((c, " ") for c in filters)
    translate_map = str.maketrans(translate_dict)
    text = text.translate(translate_map)
    
    # stemming
    stemmer = PorterStemmer()
    text = ' '.join([stemmer.stem(word) for word in text.split()])

    return text


####



## Model Building

* To further enhance the model's performance, we apply feature selection using a Genetic Algorithm. This algorithm utilizes evolutionary principles to iteratively select the most informative features from the dataset. We employ the GeneticSelectionCV class, which integrates the Genetic Algorithm with cross-validation for optimal feature selection.

In [13]:
pip install feature_selection_genetic

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement feature_selection_genetic (from versions: none)
ERROR: No matching distribution found for feature_selection_genetic


In [14]:
from feature_selection_genetic import GeneticSelectionCV

ModuleNotFoundError: No module named 'feature_selection_genetic'

In [None]:
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from feature_selection_genetic import GeneticSelectionCV


# Transform each text into a vector of word counts
vectorizere = CountVectorizer(stop_words="english",
                             preprocessor=clean_text
                            )


X = vectorizere.fit_transform(data["text"])
y = data["sentiment"]



# Feature selection using Genetic Algorithm
selector = GeneticSelectionCV(LinearSVC(),
                              cv=5,
                              verbose=1,
                              scoring="accuracy",
                              n_population=50,
                              crossover_proba=0.5,
                              mutation_proba=0.2,
                              n_generations=10,
                              crossover_independent_proba=0.5,
                              mutation_independent_proba=0.05,
                              tournament_size=3,
                              caching=True,
                              n_jobs=-1)
selector.fit(X, y)


# Select the best features
X_selected = selector.transform(X)



# Training
model = LinearSVC()
model.fit(X_selected, y)


# Testing
test_features = vectorizer.transform(test_data["text"])
test_features_selected = selector.transform(test_features)
y_pred = model.predict(test_features_selected)


# Evaluation
acc = accuracy_score(test_data["sentiment"], y_pred)
print("Accuracy on the IMDB dataset: {:.2f}".format(acc * 100))


## SVM- Classification 

***feature selection***


In [15]:
import re
# Transform each text into a vector of word counts
vectorizer = CountVectorizer(stop_words="english",
                             preprocessor=clean_text
                             )
training_features = vectorizer.fit_transform(train_data["text"]) 
print('Training Features' + str(training_features))
test_features = vectorizer.transform(test_data["text"])
print('Testing Features' + str(test_features))




Training Features  (0, 24626)	2
  (0, 40333)	1
  (0, 51674)	1
  (0, 36323)	1
  (0, 16586)	2
  (0, 47165)	1
  (0, 23395)	1
  (0, 16653)	5
  (0, 16510)	1
  (0, 46552)	2
  (0, 15057)	3
  (0, 50846)	1
  (0, 38736)	1
  (0, 18505)	1
  (0, 51186)	1
  (0, 46564)	1
  (0, 35505)	1
  (0, 8363)	2
  (0, 16985)	1
  (0, 15829)	1
  (0, 44750)	1
  (0, 18546)	1
  (0, 39272)	2
  (0, 46443)	4
  (0, 7660)	2
  :	:
  (24999, 26663)	1
  (24999, 26521)	1
  (24999, 51393)	1
  (24999, 34140)	1
  (24999, 10321)	1
  (24999, 42491)	1
  (24999, 10047)	1
  (24999, 41103)	1
  (24999, 44398)	1
  (24999, 43586)	1
  (24999, 43667)	1
  (24999, 31963)	1
  (24999, 3249)	2
  (24999, 40649)	1
  (24999, 9868)	1
  (24999, 25326)	1
  (24999, 4387)	1
  (24999, 37972)	1
  (24999, 13063)	1
  (24999, 34830)	1
  (24999, 43807)	1
  (24999, 42567)	1
  (24999, 44744)	1
  (24999, 5450)	1
  (24999, 43297)	1
Testing Features  (0, 402)	1
  (0, 882)	1
  (0, 1161)	1
  (0, 1331)	1
  (0, 1503)	1
  (0, 1518)	1
  (0, 2238)	1
  (0, 2627)	1
  (0, 3

###  
****Training****
* After performing feature selection, we train a Linear Support Vector Machine (SVM) model using the selected features. SVMs are known for their ability to handle high-dimensional data effectively.

In [16]:
import re

# Training

model = LinearSVC()
model.fit(training_features, train_data["sentiment"])
y_pred = model.predict(test_features)
print(y_pred)




[1 1 1 ... 0 0 0]


###  
***Evaluation***
* Because the IMDB dataset is balanced, we can evaluate our model using the accuracy score

In [17]:

acc = accuracy_score(test_data["sentiment"], y_pred)
print("Accuracy on the IMDB dataset: {:.2f}".format(acc*100))

Accuracy on the IMDB dataset: 82.60


* As you can see, we were able to reach as high as an 83.67% accuracy on the IMDB dataset.

## KNN- Classification 


In [18]:
import re
from sklearn.neighbors import KNeighborsClassifier

# Transform each text into a vector of word counts
vectorizer = CountVectorizer(stop_words="english",
                             preprocessor=clean_text
                             )
training_features = vectorizer.fit_transform(train_data["text"]) 
print('Training Features:', training_features)
test_features = vectorizer.transform(test_data["text"])
print('Testing Features:', test_features)






Training Features:   (0, 24626)	2
  (0, 40333)	1
  (0, 51674)	1
  (0, 36323)	1
  (0, 16586)	2
  (0, 47165)	1
  (0, 23395)	1
  (0, 16653)	5
  (0, 16510)	1
  (0, 46552)	2
  (0, 15057)	3
  (0, 50846)	1
  (0, 38736)	1
  (0, 18505)	1
  (0, 51186)	1
  (0, 46564)	1
  (0, 35505)	1
  (0, 8363)	2
  (0, 16985)	1
  (0, 15829)	1
  (0, 44750)	1
  (0, 18546)	1
  (0, 39272)	2
  (0, 46443)	4
  (0, 7660)	2
  :	:
  (24999, 26663)	1
  (24999, 26521)	1
  (24999, 51393)	1
  (24999, 34140)	1
  (24999, 10321)	1
  (24999, 42491)	1
  (24999, 10047)	1
  (24999, 41103)	1
  (24999, 44398)	1
  (24999, 43586)	1
  (24999, 43667)	1
  (24999, 31963)	1
  (24999, 3249)	2
  (24999, 40649)	1
  (24999, 9868)	1
  (24999, 25326)	1
  (24999, 4387)	1
  (24999, 37972)	1
  (24999, 13063)	1
  (24999, 34830)	1
  (24999, 43807)	1
  (24999, 42567)	1
  (24999, 44744)	1
  (24999, 5450)	1
  (24999, 43297)	1
Testing Features:   (0, 402)	1
  (0, 882)	1
  (0, 1161)	1
  (0, 1331)	1
  (0, 1503)	1
  (0, 1518)	1
  (0, 2238)	1
  (0, 2627)	1
  (

In [19]:

model = KNeighborsClassifier()
model.fit(training_features, train_data["sentiment"])
y_pred = model.predict(test_features)
print('Predicted Sentiments:', y_pred)



Predicted Sentiments: [1 1 1 ... 0 1 0]


In [20]:

acc = accuracy_score(test_data["sentiment"], y_pred)
print("Accuracy on the IMDB dataset: {:.2f}".format(acc * 100))


Accuracy on the IMDB dataset: 63.48


* with this model we were able to reach as high as an 63.48% accuracy on the IMDB dataset.

## Improving the Current Model

In [21]:
import re

# Transform each text into a vector of word counts
vectorizer = TfidfVectorizer(stop_words="english",
                             preprocessor=clean_text,
                             ngram_range=(1, 2))
training_features = vectorizer.fit_transform(train_data["text"]) 
print('Training Features' + str(training_features))
test_features = vectorizer.transform(test_data["text"])
print('Testing Features' + str(test_features))



model = LinearSVC()
model.fit(training_features, train_data["sentiment"])
y_pred = model.predict(test_features)
print(y_pred)


acc = accuracy_score(test_data["sentiment"], y_pred)
print("Accuracy on the IMDB dataset: {:.2f}".format(acc*100))



Training Features  (0, 645242)	0.06732247365603838
  (0, 422494)	0.07004452730214439
  (0, 944148)	0.06732247365603838
  (0, 405678)	0.0616342154456931
  (0, 191199)	0.04677762008373133
  (0, 217168)	0.06539114585846177
  (0, 666343)	0.07004452730214439
  (0, 550428)	0.06732247365603838
  (0, 198811)	0.07004452730214439
  (0, 789336)	0.07004452730214439
  (0, 1341147)	0.03957933010875477
  (0, 1416275)	0.06539114585846177
  (0, 2295)	0.07004452730214439
  (0, 993186)	0.04637062212124525
  (0, 1073266)	0.05102400356492786
  (0, 956672)	0.07004452730214439
  (0, 488849)	0.07004452730214439
  (0, 779648)	0.0616342154456931
  (0, 1341127)	0.03972449058206764
  (0, 822689)	0.02840436127543072
  (0, 1466467)	0.041968708664902175
  (0, 797951)	0.04709992567185625
  (0, 565694)	0.053648049480162684
  (0, 1165650)	0.07004452730214439
  (0, 628829)	0.07004452730214439
  :	:
  (24999, 1205922)	0.05677013565919056
  (24999, 635911)	0.045113965229896295
  (24999, 26469)	0.037325801907366146
  (2499

* We achieve an even higher accuracy score of 88.66% which is another 2% improvement over the last version of the model.