# Data Preprocessing and NLTK Sentiment Scoring

#### Import Libraries

In [2]:
import pandas as pd
import string
from nltk.corpus import stopwords
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer


#### Import Dataset

In [15]:
# Load the raw Amazon smartphone reviews CSV into a DataFrame.
df = pd.read_csv('Amazon_Smartphones_Reviews.csv')
# Disabled alternative input; presumably a shortened, already-scored export
# judging by its file name -- TODO confirm before re-enabling.
#df = pd.read_csv('short_Amazon_Smartphones_Reviews_Cleaned_NLTK_ScoresFFs.csv')

In [None]:
# Preview the first five rows (head() defaults to n=5).
df.head()


In [7]:
# (rows, columns) of the dataset as loaded, before any cleaning.
print(df.shape)

(413840, 6)


#### Remove duplicate and null values

In [8]:
# Boolean frame: True wherever a cell is missing.
null_mask = df.isna()
# Collapse the mask to a single count of missing cells across all columns.
null_values = null_mask.to_numpy().sum()
print("Number of null values:", null_values)

Number of null values: 83470


In [9]:
# Boolean Series: True for every row that repeats an earlier row exactly.
duplicates_mask = df.duplicated()
# Use the vectorized Series.sum(); the built-in sum() would iterate over the
# whole Series one element at a time in Python.
num_duplicates = duplicates_mask.sum()
print("Number of duplicate rows:", num_duplicates)

Number of duplicate rows: 64079


In [10]:
# Drop every row that contains a null value, then drop exact duplicate rows
# (the first occurrence of each duplicate is kept). Chaining returns a new
# frame instead of mutating one in place with inplace=True.
df = df.dropna().drop_duplicates()


In [11]:
# Sanity check after cleaning: both counts below are expected to be zero.
null_mask   = df.isnull()            # True wherever a cell is missing
null_values = null_mask.sum().sum()  # Total number of missing cells
print("Number of null values:", null_values)

duplicates_mask = df.duplicated()       # True for rows repeating an earlier row
# Vectorized Series.sum() instead of the built-in sum(), which would loop
# over the Series element by element in Python.
num_duplicates  = duplicates_mask.sum()
print("Number of duplicate rows:", num_duplicates)

Number of null values: 0
Number of duplicate rows: 0


#### Convert brand names to lowercase

In [12]:
# Normalize brand names to lowercase with the vectorized string accessor
# rather than a per-row Python lambda.
df['Brand Name'] = df['Brand Name'].str.lower()
df.head(5)

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


In [13]:
# (rows, columns) after null rows and duplicate rows have been removed.
print(df.shape)

(281249, 6)


### Text Preprocessing 

In [11]:
# Matches every character that is neither an ASCII letter nor whitespace.
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')

# Filled on first use so the NLTK corpora are only touched when needed.
_PREPROCESS_RESOURCES = {}


def _preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        # Built once instead of once per token; a frozenset also makes each
        # membership test constant-time instead of a scan over a list.
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Clean a single review string.

    Steps, in order: delete every character that is not an ASCII letter or
    whitespace, lower-case and tokenize the result, drop English stop words,
    lemmatize the remaining tokens, and join them with single spaces.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The space-separated lemmatized tokens; an empty string when no
        token survives the filtering.
    """
    stop_words, lemmatizer = _preprocess_resources()

    # Characters are deleted, not replaced by a space, so words separated
    # only by punctuation end up joined (e.g. "phone.I" becomes "phoneI").
    letters_only = _NON_LETTER_PATTERN.sub('', text)

    # Tokenize the lower-cased text
    tokens = word_tokenize(letters_only.lower())

    # Drop stop words and lemmatize what is left in a single pass
    lemmatized_tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]

    return ' '.join(lemmatized_tokens)

In [12]:
# Run preprocess_text over every review; the cleaned text replaces the
# original contents of the 'Reviews' column.
df['Reviews'] = df['Reviews'].map(preprocess_text)

In [13]:
# Print the first five cleaned reviews in full, one blank line after each.
# The label is the row's index value plus one.
for index, review in df['Reviews'].head(5).items():
    print(f"Review {index + 1}: {review}\n")

Review 1: feel lucky found used phone u used hard phone line someone upgraded sold one son liked old one finally fell apart year didnt want upgrade thank seller really appreciate honesty said used phonei recommend seller highly would

Review 2: nice phone nice grade pantach revue clean set easy set never android phone fantastic say least perfect size surfing social medium great phone samsung

Review 3: pleased

Review 4: work good go slow sometimes good phone love

Review 5: great phone replace lost phone thing volume button work still go setting adjust job eligible upgrade phone againthaanks



### NLTK Sentiment Score Assignment 

In [14]:
# Score every review with NLTK's SentimentIntensityAnalyzer; each call returns
# a dict with the keys 'neg', 'neu', 'pos' and 'compound'.
# NOTE(review): the input is the already cleaned 'Reviews' text (lower-cased,
# punctuation and stop words removed). Confirm this is intended rather than
# scoring the raw review text.
sia = SentimentIntensityAnalyzer()
score_dicts = df['Reviews'].apply(sia.polarity_scores)

# Build the score table in one step from the list of dicts, keeping df's
# index so the rows stay aligned. Expanding with .apply(pd.Series) would
# construct a separate Series object for every row.
scores = pd.DataFrame(score_dicts.tolist(), index=score_dicts.index)

In [15]:
# Preview the first five rows of the score table (head() defaults to n=5).
scores.head()

Unnamed: 0,neg,neu,pos,compound
0,0.1,0.592,0.308,0.8966
1,0.155,0.445,0.4,0.8548
2,0.0,0.0,1.0,0.4404
3,0.0,0.333,0.667,0.875
4,0.103,0.714,0.183,0.4215


In [16]:
# Binary label: 1 when the compound score is strictly positive, otherwise 0
# (a compound score of exactly 0 is therefore labelled 0).
df['Sentiments'] = (scores['compound'] > 0).astype('int64')

# Append the neg / neu / pos / compound columns; rows are matched on the index.
df = pd.concat([df, scores], axis=1)

# Drop any row that contains a null value
df = df.dropna()

In [17]:
# Preview the first five rows with the label and score columns attached.
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Sentiments,neg,neu,pos,compound
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",samsung,199.99,5,feel lucky found used phone u used hard phone ...,1.0,1,0.1,0.592,0.308,0.8966
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",samsung,199.99,4,nice phone nice grade pantach revue clean set ...,0.0,1,0.155,0.445,0.4,0.8548
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",samsung,199.99,5,pleased,0.0,1,0.0,0.0,1.0,0.4404
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",samsung,199.99,4,work good go slow sometimes good phone love,0.0,1,0.0,0.333,0.667,0.875
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",samsung,199.99,4,great phone replace lost phone thing volume bu...,0.0,1,0.103,0.714,0.183,0.4215


In [18]:
# Save the cleaned, scored reviews to CSV; index=False keeps the DataFrame's
# row index out of the file.
df.to_csv('Amazon_Smartphones_Reviews_Cleaned_NLTK_Score.csv', index = False)