In [1]:
import numpy as np
import pandas as pd


In [2]:
#load dataset 
df = pd.read_csv('D:\IMDB Dataset\IMDB Dataset.csv')
print("Dataset Loaded")

Dataset Loaded


# Initial Exploration

1. **Shape of the Dataset:**
   - We will start by examining the dimensions of the dataset to understand its size.

2. **Null Values:**
   - Check for any missing or null values in the dataset and decide how to handle them.

3. **Data Types:**
   - Review the data types of each column to ensure they are appropriate for analysis.

4. **Statistical Exploration:**
   - Performing statistical analysis on the dataset like mean, median, range, etc.


In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df.shape


(50000, 2)

In [5]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [6]:
df.dtypes

review       object
sentiment    object
dtype: object

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [8]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


# Preprocessing using NLP Techniques

    1. **Lowercasing:** -Lowercasing involves converting all letters in a text to lowercase. This is commonly used to ensure consistency in text processing and analysis.

    2. **Remove HTML Tags:** Removing HTML tags involves stripping out any HTML elements from a text string, leaving only the   plain text content.

    3. **Remove URLs:** Removing URLs involves eliminating any web addresses present in the text.

    4. **Chat Word Treatment:** Chat word treatment involves converting chat-style abbreviations or slang words into their formal equivalents.
    
    5. **Spelling Correction:** Spelling correction involves identifying and correcting misspelled words in a text.

    6. **Removing Stop Words:** Removing stop words involves eliminating common words (e.g., "the", "is", "and") that often don't add significant meaning to the text.


In [9]:
# Preprocessing Part-1
## Lowercasing
df['review'] = df['review'].str.lower()
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


In [11]:
# Removing HTML tags using regex
import re

def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

# Assuming df is your DataFrame and 'review' is the column containing text with HTML tags
df['review'] = df['review'].apply(remove_html_tags)
df['review'][3]

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

In [12]:
# Remove URLs using regex           regular expression -> https?://\S+|www\.\S+
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'',text)
df['review'] = df['review'].apply(remove_url)
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


In [13]:
# remove punctuation 
def remove_punctuation(text):
    punctuation_pattern = r'[^\w\s]'
    clean_text = re.sub(punctuation_pattern, '', text)
    return clean_text

df['review'] = df['review'].apply(remove_punctuation)
df['review'][3]

'basically theres a family where a little boy jake thinks theres a zombie in his closet  his parents are fighting all the timethis movie is slower than a soap opera and suddenly jake decides to become rambo and kill the zombieok first of all when youre going to make a film you must decide if its a thriller or a drama as a drama the movie is watchable parents are divorcing  arguing like in real life and then we have jake with his closet which totally ruins all the film i expected to see a boogeyman similar movie and instead i watched a drama with some meaningless thriller spots3 out of 10 just for the well playing parents  descent dialogs as for the shots with jake just ignore them'

In [14]:
#removing chat words from particular column
def remove_chat_words(text):
    chat_words = ['lmao', 'rofl', 'lmfao', 'lol', 'haha']
    pattern = r'\b(?:{})\b'.format('|'.join(chat_words))
    cleaned_text = re.sub(pattern, '', text)
    return cleaned_text

df['review'] = df['review'].apply(remove_chat_words)
df['review'][3]

'basically theres a family where a little boy jake thinks theres a zombie in his closet  his parents are fighting all the timethis movie is slower than a soap opera and suddenly jake decides to become rambo and kill the zombieok first of all when youre going to make a film you must decide if its a thriller or a drama as a drama the movie is watchable parents are divorcing  arguing like in real life and then we have jake with his closet which totally ruins all the film i expected to see a boogeyman similar movie and instead i watched a drama with some meaningless thriller spots3 out of 10 just for the well playing parents  descent dialogs as for the shots with jake just ignore them'

In [15]:
#Remove stopwords
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yuvra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
df['review'] = df['review'].apply(remove_stopwords)


In [17]:
df['review'][3]

'basically theres family little boy jake thinks theres zombie closet parents fighting timethis movie slower soap opera suddenly jake decides become rambo kill zombieok first youre going make film must decide thriller drama drama movie watchable parents divorcing arguing like real life jake closet totally ruins film expected see boogeyman similar movie instead watched drama meaningless thriller spots3 10 well playing parents descent dialogs shots jake ignore'

***Exploratory Data Analysis***
***Following techniques have been performened for EDA on IMDB dataset***


**Tokenization and Counting Word Frequency:** Tokenization is the process of splitting a text into words or tokens. We'll tokenize the reviews and count the frequency of each word to find the most common words.

**Vocabulary Size:** We'll calculate the size of the vocabulary, which is the total number of unique words in the dataset after tokenization.

**Removing Swearing Words:** We'll identify and remove swearing words from the reviews.

**Sentiment Analysis:** We'll perform sentiment analysis to understand the distribution of sentiments (positive, negative, neutral) in the dataset.

**Plotting Word Cloud:** We'll create a word cloud visualization to display the most frequent words in the reviews.

**Finding N-grams:** We'll find common n-grams (sequences of n words) in the reviews, which can provide more context than individual words.

In [18]:
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [19]:
#Tokenization of the dataset
all_words = ' '.join(df['review']).split()
word_freq = Counter(all_words)
df['review'][3]
df['review'][4]

'petter matteis love time money visually stunning film watch mr mattei offers us vivid portrait human relations movie seems telling us money power success people different situations encounter variation arthur schnitzlers play theme director transfers action present time new york different characters meet connect one connected one way another next person one seems know previous point contact stylishly film sophisticated luxurious look taken see people live world live habitatthe thing one gets souls picture different stages loneliness one inhabits big city exactly best place human relations find sincere fulfillment one discerns case people encounterthe acting good mr matteis direction steve buscemi rosario dawson carol kane michael imperioli adrian grenier rest talented cast make characters come alivewe wish mr mattei good luck await anxiously next work'

In [21]:
# Vocabulary Size
vocab_size = len(word_freq)
print(vocab_size)

221746


In [32]:
# Removing Swearing Words
swear_words = ['crap', 'bullshits', 'horehounds', 'fuck', 'shit', 'fucking']
def remove_swearing(text):
    words = text.split()
    cleaned_words = [word for word in words if word.lower() not in swear_words]
    return ' '.join(cleaned_words)

df['review'] = df['review'].apply(remove_swearing)
print(f"Curse words count before cleaning: {curse_word_count_before}")
print(f"Curse words count after cleaning: {curse_word_count_after}")

Curse words count before cleaning: 0        0
1        0
2        0
3        0
4        0
        ..
49995    0
49996    1
49997    0
49998    0
49999    0
Name: review, Length: 50000, dtype: int64
Curse words count after cleaning: 0        0
1        0
2        0
3        0
4        0
        ..
49995    0
49996    1
49997    0
49998    0
49999    0
Name: review, Length: 50000, dtype: int64


In [37]:
# Sentiment analysis using logistic regression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(df['review'])
y = df['sentiment_generatem']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


KeyError: 'sentiment_generated'

In [36]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

Accuracy: 0.8606
              precision    recall  f1-score   support

    negative       0.87      0.84      0.86      4961
    positive       0.85      0.88      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



In [None]:
df_check_accuracy = 