# Sentiment Analysis on Movie Reviews

## (1) Data Collection:

In [1]:
import os

base_dir = 'aclImdb/train/'

sub_folders = os.listdir(base_dir)

original_directories = []

for folder in sub_folders:
    folder_path = os.path.join(base_dir, folder)
    if os.path.isdir(folder_path):
        all_files_and_dirs = os.listdir(folder_path)
        only_files = [f for f in all_files_and_dirs if os.path.isfile(os.path.join(folder_path, f))]
        original_directories.append(folder_path)
        print(folder_path, 'has', len(only_files), 'entries')        

aclImdb/train/neg has 12500 entries
aclImdb/train/unsup has 50000 entries
aclImdb/train/pos has 12500 entries


In [2]:
#we will use this list directoies to access and preprocess data
original_directories

['aclImdb/train/neg', 'aclImdb/train/unsup', 'aclImdb/train/pos']

## (2) Data Preprocessing:

In [3]:
#example review
sentence= "Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly."

### (a) Removing Punctuations and Special Characters

In [4]:
import re
import string

def preprocess_clean(text):
    
    # removing HTML Tags
    htmltags = '<.*?>'
    text_without_html = re.sub(htmltags, '', text)
    
    # removing Punctuation
    punctuations = string.punctuation
    text_without_punct = re.sub(f"[{punctuations}]", '', text_without_html)
    
    #converting texts to lower case
    final = text_without_punct.lower()
    
    return final


# sample of function usage
sentence = preprocess_clean(sentence)
print("Cleaned Text:", sentence)

Cleaned Text: story of a man who has unnatural feelings for a pig starts out with a opening scene that is a terrific example of absurd comedy a formal orchestra audience is turned into an insane violent mob by the crazy chantings of its singers unfortunately it stays absurd the whole time with no general narrative eventually making it just too off putting even those from the era should be turned off the cryptic dialogue would make shakespeare seem easy to a third grader on a technical level its better than you might think with some good cinematography by future great vilmos zsigmond future stars sally kirkland and frederic forrest can be seen briefly


### (b) Converting the Sentences into tokens

In [5]:
from nltk.tokenize import word_tokenize

def tokenize_text(cleaned_text):
    # Tokenize the sentence
    tokenized_words = word_tokenize(cleaned_text)
    
    return tokenized_words

# sample of function usage
sentence = tokenize_text(sentence)
print("Tokenized Words from the Sentence are:", sentence)

Tokenized Words from the Sentence are: ['story', 'of', 'a', 'man', 'who', 'has', 'unnatural', 'feelings', 'for', 'a', 'pig', 'starts', 'out', 'with', 'a', 'opening', 'scene', 'that', 'is', 'a', 'terrific', 'example', 'of', 'absurd', 'comedy', 'a', 'formal', 'orchestra', 'audience', 'is', 'turned', 'into', 'an', 'insane', 'violent', 'mob', 'by', 'the', 'crazy', 'chantings', 'of', 'its', 'singers', 'unfortunately', 'it', 'stays', 'absurd', 'the', 'whole', 'time', 'with', 'no', 'general', 'narrative', 'eventually', 'making', 'it', 'just', 'too', 'off', 'putting', 'even', 'those', 'from', 'the', 'era', 'should', 'be', 'turned', 'off', 'the', 'cryptic', 'dialogue', 'would', 'make', 'shakespeare', 'seem', 'easy', 'to', 'a', 'third', 'grader', 'on', 'a', 'technical', 'level', 'its', 'better', 'than', 'you', 'might', 'think', 'with', 'some', 'good', 'cinematography', 'by', 'future', 'great', 'vilmos', 'zsigmond', 'future', 'stars', 'sally', 'kirkland', 'and', 'frederic', 'forrest', 'can', 'be'

### (c) Removing the stopwords

In [6]:
from nltk.corpus import stopwords

# Initialize the set of stopwords
stop_words = set(stopwords.words('english'))

#in order to modify your stop_words set, you can append/edit this list
"""stop_words.append('new words')"""

def remove_stopwords(text):
    # initialization of return variable
    text_without_stopwords = []
    
    #iterating thru each word
    for word in text:
        if word.lower() not in stop_words:
            text_without_stopwords.append(word)
    
    return text_without_stopwords

# sample of function usage
sentence = remove_stopwords(sentence)
print("Words without Stopwords:", sentence)

Words without Stopwords: ['story', 'man', 'unnatural', 'feelings', 'pig', 'starts', 'opening', 'scene', 'terrific', 'example', 'absurd', 'comedy', 'formal', 'orchestra', 'audience', 'turned', 'insane', 'violent', 'mob', 'crazy', 'chantings', 'singers', 'unfortunately', 'stays', 'absurd', 'whole', 'time', 'general', 'narrative', 'eventually', 'making', 'putting', 'even', 'era', 'turned', 'cryptic', 'dialogue', 'would', 'make', 'shakespeare', 'seem', 'easy', 'third', 'grader', 'technical', 'level', 'better', 'might', 'think', 'good', 'cinematography', 'future', 'great', 'vilmos', 'zsigmond', 'future', 'stars', 'sally', 'kirkland', 'frederic', 'forrest', 'seen', 'briefly']


### (d -1 ) Stemming of the words (this is the tricky part) 
[ for the sake of accuracy, i avoid this to make sure our dataset stays as rich as possible ]

### (d -2 ) Lemmatization of the words
[lemmatization is better than stemming as it is dictionary based and this process saves the semantics of words]

In [7]:
from nltk.stem import WordNetLemmatizer

def apply_lemmatization(stemmed_words):
    # initialization for return 
    lemmatized_words = []
    
    # applying lemmatization
    for word in stemmed_words:
        lemmatized_word = WordNetLemmatizer().lemmatize(word)
        lemmatized_words.append(lemmatized_word)
    
    return lemmatized_words

# sample of function usage
sentence = apply_lemmatization(sentence)
print("Lemmatized Words:", sentence)

Lemmatized Words: ['story', 'man', 'unnatural', 'feeling', 'pig', 'start', 'opening', 'scene', 'terrific', 'example', 'absurd', 'comedy', 'formal', 'orchestra', 'audience', 'turned', 'insane', 'violent', 'mob', 'crazy', 'chanting', 'singer', 'unfortunately', 'stay', 'absurd', 'whole', 'time', 'general', 'narrative', 'eventually', 'making', 'putting', 'even', 'era', 'turned', 'cryptic', 'dialogue', 'would', 'make', 'shakespeare', 'seem', 'easy', 'third', 'grader', 'technical', 'level', 'better', 'might', 'think', 'good', 'cinematography', 'future', 'great', 'vilmos', 'zsigmond', 'future', 'star', 'sally', 'kirkland', 'frederic', 'forrest', 'seen', 'briefly']


### (E) Final Step to perform all on complete dataset

In [8]:
def preprocess_text(text):
    text = preprocess_clean(text)
    tokens = tokenize_text(text)
    cleaned_tokens = remove_stopwords(tokens)
    return cleaned_tokens

In [9]:
import os


# Define the new directories for preprocessed text
new_directories = [
    'aclImdb/preprocessed_train/neg',
    'aclImdb/preprocessed_train/unsup',
    'aclImdb/preprocessed_train/pos'
]

# Create new directories
for new_directory in new_directories:
    os.makedirs(new_directory, exist_ok=True)


In [10]:
# # Loop through each original directory
# for original_dir, new_dir in zip(original_directories, new_directories):
#     for filename in os.listdir(original_dir):
#         if filename.endswith('.txt'):
#             with open(os.path.join(original_dir, filename), 'r', encoding='utf-8') as f:
#                 text = f.read()
#                 processed_text = preprocess_text(text)
                
#                 # Save the processed text into the new directory
#                 with open(os.path.join(new_dir, filename), 'w', encoding='utf-8') as f_out:
#                     f_out.write(' '.join(processed_text))


## (3) Feature Extraction:

In [11]:
new_directories

['aclImdb/preprocessed_train/neg',
 'aclImdb/preprocessed_train/unsup',
 'aclImdb/preprocessed_train/pos']

In [12]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [13]:
import tensorflow as tf
import tensorflow_hub as hub
import os

ModuleNotFoundError: No module named 'tensorflow_hub'