# Gathering Data and Preprocessing

### 1. Gathering Data

In [1]:
import pandas as pd
import random

# Number of neutral samples
num_neutral = 10000

# Pool of neutral phrases
neutral_phrases = [
    "It is okay.",
    "Average experience.",
    "Nothing special.",
    "Fine, nothing more.",
    "It was acceptable.",
    "Mediocre service.",
    "Not bad, not great.",
    "Could be better.",
    "Ordinary experience.",
    "Satisfactory."
]

# Randomly select 10k phrases
neutral_texts = [random.choice(neutral_phrases) for _ in range(num_neutral)]

# Create DataFrame
neutral_df = pd.DataFrame({
    'review': neutral_texts,
    'sentiment': ['neutral'] * num_neutral
})

# Lowercase for consistency
neutral_df['clean_text'] = neutral_df['review'].str.lower()


In [2]:
import pandas as pd

file_path = r'/home/ahmed-sameh/Opinion-Mining-Project/data/IMDB Dataset.csv'

df = pd.read_csv(file_path)

print(df)

                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]


In [3]:
# Combine original dataset (50k) with neutral comments (10k)
df_augmented = pd.concat([df, neutral_df], ignore_index=True)

# Shuffle the dataset so neutral samples are not all at the end
df_augmented = df_augmented.sample(frac=1, random_state=42).reset_index(drop=True)
df = df_augmented
print("Augmented dataset size:", len(df))

Augmented dataset size: 60000


### 2. Lowercasing

In [4]:
df['review'] 
print(type(df['review']))  
df['clean_text'] = df['review'].apply(lambda word:word.lower())
df[['review', 'clean_text']]


<class 'pandas.core.series.Series'>


Unnamed: 0,review,clean_text
0,"""Paula, I may be a bitch, but I'll never be a ...","""paula, i may be a bitch, but i'll never be a ..."
1,Many people here say that this show is for kid...,many people here say that this show is for kid...
2,This was a well written tale of the Making of ...,this was a well written tale of the making of ...
3,I think this movie is absolutely beautiful. An...,i think this movie is absolutely beautiful. an...
4,The film was very outstanding despite the NC-1...,the film was very outstanding despite the nc-1...
...,...,...
59995,Nothing special.,nothing special.
59996,Avoid this one! It is a terrible movie. So wha...,avoid this one! it is a terrible movie. so wha...
59997,This production was quite a surprise for me. I...,this production was quite a surprise for me. i...
59998,This is a decent movie. Although little bit sh...,this is a decent movie. although little bit sh...


###  3. Remove Punctuation and Symbols

In [5]:
import string
print(string.punctuation)
mytable = str.maketrans('','',string.punctuation)
df['clean_text'] = df['clean_text'].apply(lambda word : word.translate(mytable)) 
df['clean_text']


!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


0        paula i may be a bitch but ill never be a butc...
1        many people here say that this show is for kid...
2        this was a well written tale of the making of ...
3        i think this movie is absolutely beautiful and...
4        the film was very outstanding despite the nc17...
                               ...                        
59995                                      nothing special
59996    avoid this one it is a terrible movie so what ...
59997    this production was quite a surprise for me i ...
59998    this is a decent movie although little bit sho...
59999                                         satisfactory
Name: clean_text, Length: 60000, dtype: object

### 4. Remove Stopwords

In [6]:
import nltk
from nltk.corpus import stopwords

# Download stopwords if needed
nltk.download('stopwords')

# Get English stopwords
stop_words = set(stopwords.words('english'))

# Keep important negations
negations = {"not", "no", "never", "none"}
stop_words = stop_words - negations

# Remove stopwords but keep negations
df['no_stopwords'] = df['clean_text'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_words)
)

# Check the results
df['no_stopwords'].head()

[nltk_data] Downloading package stopwords to /home/ahmed-
[nltk_data]     sameh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0    paula may bitch ill never butch br br hilariou...
1    many people say show kids hm kid approximately...
2    well written tale making batman sitcom actuall...
3    think movie absolutely beautiful im not referr...
4    film outstanding despite nc17 rating disturbin...
Name: no_stopwords, dtype: object

### 5. Tokenization

In [7]:
import re, time

def tokenizer(text):
    return re.findall(r'\b\w+\b', str(text).lower())

start = time.time()
df['tokenized'] = df['no_stopwords'].fillna('').astype(str).apply(tokenizer)
end = time.time()

print(df[['no_stopwords', 'tokenized']].head())
print(f"Processed {len(df)} rows in {end - start:.2f} seconds")

output_path = "/home/ahmed-sameh/Opinion-Mining-Project/data/test_sample_processed.csv"
df.to_csv(output_path, index=False)
print(f"Saved processed file to {output_path}")
df

                                        no_stopwords  \
0  paula may bitch ill never butch br br hilariou...   
1  many people say show kids hm kid approximately...   
2  well written tale making batman sitcom actuall...   
3  think movie absolutely beautiful im not referr...   
4  film outstanding despite nc17 rating disturbin...   

                                           tokenized  
0  [paula, may, bitch, ill, never, butch, br, br,...  
1  [many, people, say, show, kids, hm, kid, appro...  
2  [well, written, tale, making, batman, sitcom, ...  
3  [think, movie, absolutely, beautiful, im, not,...  
4  [film, outstanding, despite, nc17, rating, dis...  
Processed 60000 rows in 2.89 seconds
Saved processed file to /home/ahmed-sameh/Opinion-Mining-Project/data/test_sample_processed.csv


Unnamed: 0,review,sentiment,clean_text,no_stopwords,tokenized
0,"""Paula, I may be a bitch, but I'll never be a ...",negative,paula i may be a bitch but ill never be a butc...,paula may bitch ill never butch br br hilariou...,"[paula, may, bitch, ill, never, butch, br, br,..."
1,Many people here say that this show is for kid...,negative,many people here say that this show is for kid...,many people say show kids hm kid approximately...,"[many, people, say, show, kids, hm, kid, appro..."
2,This was a well written tale of the Making of ...,positive,this was a well written tale of the making of ...,well written tale making batman sitcom actuall...,"[well, written, tale, making, batman, sitcom, ..."
3,I think this movie is absolutely beautiful. An...,positive,i think this movie is absolutely beautiful and...,think movie absolutely beautiful im not referr...,"[think, movie, absolutely, beautiful, im, not,..."
4,The film was very outstanding despite the NC-1...,positive,the film was very outstanding despite the nc17...,film outstanding despite nc17 rating disturbin...,"[film, outstanding, despite, nc17, rating, dis..."
...,...,...,...,...,...
59995,Nothing special.,neutral,nothing special,nothing special,"[nothing, special]"
59996,Avoid this one! It is a terrible movie. So wha...,negative,avoid this one it is a terrible movie so what ...,avoid one terrible movie exciting pointless mu...,"[avoid, one, terrible, movie, exciting, pointl..."
59997,This production was quite a surprise for me. I...,positive,this production was quite a surprise for me i ...,production quite surprise absolutely love obsc...,"[production, quite, surprise, absolutely, love..."
59998,This is a decent movie. Although little bit sh...,positive,this is a decent movie although little bit sho...,decent movie although little bit short time pa...,"[decent, movie, although, little, bit, short, ..."


### 6. Lemmatization Function

In [8]:
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
import nltk
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
nltk.download('punkt_tab')
def get_wordnet_pos(treebank_tag):
    # Map POS tag to WordNet POS tag for lemmatizer
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # default
def lemmatize_tokens(token_list):
    #Takes a list of words (tokens) from your sentence.
    #Assigns a Part-of-Speech (POS) tag to each word, like noun, verb, adjective, etc.
    #Returns a list of tuples: (word, POS_tag) for every token.
    pos_tags = pos_tag(token_list)
    # print("POS tag : ",pos_tags)
    lemmatizer = WordNetLemmatizer()
    # print(lemmatizer)
    lemmatized_sentence = []
    for token, tag in pos_tags:
        wn_tag = get_wordnet_pos(tag)
        lemma = lemmatizer.lemmatize(token, wn_tag)
        lemmatized_sentence.append(lemma)
    
    # print("Original:", token_list)
    # print("Lemmatized:", " ".join(lemmatized_sentence))
    # "bats" → "bat"

    # "are" → "be"

    # "hanging" → "hang"

    # "feet" → "foot"    
    return lemmatized_sentence

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/ahmed-sameh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/ahmed-
[nltk_data]     sameh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/ahmed-
[nltk_data]     sameh/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


### 7. Lemmatization

In [9]:
import time
from tqdm import tqdm
start = time.time()
tqdm.pandas()
df['lemmatized'] = df['tokenized'].progress_apply(lemmatize_tokens)
df['lemmatized'] = df['lemmatized'].progress_apply(lambda tokens: " ".join(tokens))

end = time.time()
print(f"Time for 50k rows: {end - start:.2f} sec")

  0%|          | 0/60000 [00:00<?, ?it/s]

100%|██████████| 60000/60000 [06:04<00:00, 164.42it/s]
100%|██████████| 60000/60000 [00:00<00:00, 293259.78it/s]


Time for 50k rows: 365.26 sec


### 8. Save to CSV file

In [10]:
df.to_csv("../../data/data_processed.csv")
df

Unnamed: 0,review,sentiment,clean_text,no_stopwords,tokenized,lemmatized
0,"""Paula, I may be a bitch, but I'll never be a ...",negative,paula i may be a bitch but ill never be a butc...,paula may bitch ill never butch br br hilariou...,"[paula, may, bitch, ill, never, butch, br, br,...",paula may bitch ill never butch br br hilariou...
1,Many people here say that this show is for kid...,negative,many people here say that this show is for kid...,many people say show kids hm kid approximately...,"[many, people, say, show, kids, hm, kid, appro...",many people say show kid hm kid approximately ...
2,This was a well written tale of the Making of ...,positive,this was a well written tale of the making of ...,well written tale making batman sitcom actuall...,"[well, written, tale, making, batman, sitcom, ...",well write tale make batman sitcom actually re...
3,I think this movie is absolutely beautiful. An...,positive,i think this movie is absolutely beautiful and...,think movie absolutely beautiful im not referr...,"[think, movie, absolutely, beautiful, im, not,...",think movie absolutely beautiful im not refer ...
4,The film was very outstanding despite the NC-1...,positive,the film was very outstanding despite the nc17...,film outstanding despite nc17 rating disturbin...,"[film, outstanding, despite, nc17, rating, dis...",film outstanding despite nc17 rating disturb s...
...,...,...,...,...,...,...
59995,Nothing special.,neutral,nothing special,nothing special,"[nothing, special]",nothing special
59996,Avoid this one! It is a terrible movie. So wha...,negative,avoid this one it is a terrible movie so what ...,avoid one terrible movie exciting pointless mu...,"[avoid, one, terrible, movie, exciting, pointl...",avoid one terrible movie excite pointless murd...
59997,This production was quite a surprise for me. I...,positive,this production was quite a surprise for me i ...,production quite surprise absolutely love obsc...,"[production, quite, surprise, absolutely, love...",production quite surprise absolutely love obsc...
59998,This is a decent movie. Although little bit sh...,positive,this is a decent movie although little bit sho...,decent movie although little bit short time pa...,"[decent, movie, although, little, bit, short, ...",decent movie although little bit short time pa...
