In [22]:
import os
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [31]:
# file_path = '../data/raw/IMDB Dataset.csv/IMDB Dataset.csv'
RAW_DATA_PATH = os.path.join('..', 'data', 'raw', 'IMDB Dataset.csv')
df = pd.read_csv(RAW_DATA_PATH)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [15]:
df.columns

Index(['review', 'sentiment'], dtype='object')

In [16]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [19]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [28]:
PROCESSED_DATA_PATH = os.path.join('..', 'data', 'experiment', 'IMDB_experiment.csv')

In [21]:
# Load the Data
def load_data(file_path):

    #Loads data from a CSV file into a pandas DataFrame

    print(f"Loading data from: {file_path}")
    try:
        df = pd.read_csv(file_path)
        print(f"Data loaded successfully. Shape: {df.shape}")
        return df
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    except Exception as e:
        print(f"An error occurred while loading data: {e}")
        return None

In [23]:
# Data cleaning
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [24]:
def tokenize_text(text):
    #Tokenize text into words
    return word_tokenize(text)

In [25]:
def remove_stopwords(words):
    """Removes common English stopwords from a list of words."""
    stop_words = set(stopwords.words('english'))
    return [word for word in words if word not in stop_words]

In [26]:
def lemmatize_text(words):
    """Lemmatizes words to their base form."""
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in words]

In [27]:
# --- Preprocessing Pipeline ---
def preprocess_pipeline(df, text_column='review'):
    """Applies a full preprocessing pipeline to the specified text column."""
    if df is None:
        return None

    print("Starting text preprocessing pipeline...")
    # Apply cleaning
    df['cleaned_review'] = df[text_column].apply(clean_text)
    # Apply tokenization
    df['tokens'] = df['cleaned_review'].apply(tokenize_text)
    # Apply stop word removal
    df['tokens_no_stopwords'] = df['tokens'].apply(remove_stopwords)
    # Apply lemmatization
    df['lemmas'] = df['tokens_no_stopwords'].apply(lemmatize_text)

    # Convert list of lemmas back to string for easier storage/analysis if needed
    df['processed_text'] = df['lemmas'].apply(lambda x: ' '.join(x))

    print("Preprocessing complete.")
    return df

In [30]:
# --- Main Execution Block ---
if __name__ == "__main__":
    # Load the data
    df = load_data(RAW_DATA_PATH)

    if df is not None:
        # Preprocess the data
        processed_df = preprocess_pipeline(df.copy()) # Use a copy to avoid modifying original df

        if processed_df is not None:
            # Display sample of original and processed text
            print("\n--- Sample Original vs. Processed Text ---")
            print(processed_df[['review', 'processed_text']].head())

            # Display counts of tokens before/after stopword removal and lemmatization
            print("\n--- Token Counts Sample ---")
            processed_df['original_token_count'] = processed_df['tokens'].apply(len)
            processed_df['processed_token_count'] = processed_df['lemmas'].apply(len)
            print(processed_df[['original_token_count', 'processed_token_count']].head())

            # Save the processed data
            # Ensure the processed directory exists
            os.makedirs(os.path.dirname(PROCESSED_DATA_PATH), exist_ok=True)
            processed_df.to_csv(PROCESSED_DATA_PATH, index=False)
            print(f"\nProcessed data saved to: {PROCESSED_DATA_PATH}")
            print("Columns in processed DataFrame:", processed_df.columns.tolist())

Loading data from: ..\data\raw\IMDB Dataset.csv\IMDB Dataset.csv
Data loaded successfully. Shape: (50000, 2)
Starting text preprocessing pipeline...
Preprocessing complete.

--- Sample Original vs. Processed Text ---
                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                      processed_text  
0  one reviewer mentioned watching oz episode you...  
1  wonderful little production filming technique ...  
2  thought wonderful way spend time hot summer we...  
3  basically there family little boy jake think t...  
4  petter matteis love time money visually stunni...  

--- Token Counts Sample ---
   original_token_count  processed_token_count
0                   300                 

In [32]:
df1 = pd.read_csv(PROCESSED_DATA_PATH)

In [34]:
df1.columns

Index(['review', 'sentiment', 'cleaned_review', 'tokens',
       'tokens_no_stopwords', 'lemmas', 'processed_text',
       'original_token_count', 'processed_token_count'],
      dtype='object')

In [35]:
df1.head()

Unnamed: 0,review,sentiment,cleaned_review,tokens,tokens_no_stopwords,lemmas,processed_text,original_token_count,processed_token_count
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...,"['one', 'of', 'the', 'other', 'reviewers', 'ha...","['one', 'reviewers', 'mentioned', 'watching', ...","['one', 'reviewer', 'mentioned', 'watching', '...",one reviewer mentioned watching oz episode you...,300,167
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production the filming tech...,"['a', 'wonderful', 'little', 'production', 'th...","['wonderful', 'little', 'production', 'filming...","['wonderful', 'little', 'production', 'filming...",wonderful little production filming technique ...,156,84
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...,"['i', 'thought', 'this', 'was', 'a', 'wonderfu...","['thought', 'wonderful', 'way', 'spend', 'time...","['thought', 'wonderful', 'way', 'spend', 'time...",thought wonderful way spend time hot summer we...,161,85
3,Basically there's a family where a little boy ...,negative,basically theres a family where a little boy j...,"['basically', 'theres', 'a', 'family', 'where'...","['basically', 'theres', 'family', 'little', 'b...","['basically', 'there', 'family', 'little', 'bo...",basically there family little boy jake think t...,128,66
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love in the time of money is a ...,"['petter', 'matteis', 'love', 'in', 'the', 'ti...","['petter', 'matteis', 'love', 'time', 'money',...","['petter', 'matteis', 'love', 'time', 'money',...",petter matteis love time money visually stunni...,222,125
