# Data Preprocessing and Sentiment Labelling

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from nltk.sentiment import SentimentIntensityAnalyzer
import pickle

In [2]:
# Load the raw review data from a CSV file given by a relative path.
df = pd.read_csv('Amazon_Smartphones_Reviews.csv')

In [3]:
# Preview the first five rows to check that the columns loaded as expected.
df.head(5)

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


In [4]:
# (rows, columns) of the raw data, for comparison with the shape after cleaning.
print(df.shape)

(413840, 6)


## Removing Duplicates and Null Values

In [5]:
# Count missing cells across the whole frame (cells, not rows).
null_mask = df.isna()                      # boolean frame, True where a cell is missing
null_values = null_mask.to_numpy().sum()   # number of True cells in the mask
print("Number of null values:", null_values)

Number of null values: 83470


In [6]:
# Count fully duplicated rows (every occurrence after the first is flagged).
duplicates_mask = df.duplicated()
# Use the vectorised Series.sum() rather than the builtin sum(), which would
# iterate over the Series one element at a time in Python.
num_duplicates = duplicates_mask.sum()
print("Number of duplicate rows:", num_duplicates)

Number of duplicate rows: 64079


In [7]:
# Drop every row that has a missing value in any column, then drop exact
# duplicate rows (the first occurrence of each is kept).
df = df.dropna().drop_duplicates()


In [8]:
# Re-run both checks after cleaning; both counts are expected to be zero.
null_mask = df.isnull()
null_values = null_mask.sum().sum()   # column-wise counts, then the grand total
print("Number of null values:", null_values)

duplicates_mask = df.duplicated()
# Vectorised Series.sum() instead of the builtin sum(), which would loop over
# the Series element by element in Python.
num_duplicates = duplicates_mask.sum()
print("Number of duplicate rows:", num_duplicates)

Number of null values: 0
Number of duplicate rows: 0


## Convert Brand Names to Lowercase

In [9]:
# Lower-case every brand name using the vectorised string accessor.
df['Brand Name'] = df['Brand Name'].str.lower()
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


In [10]:
# (rows, columns) after dropping null and duplicate rows.
print(df.shape)

(281249, 6)


## Text Preprocessing

In [11]:
# Lazily filled cache for the objects preprocess_text needs on every call.
_PREPROCESS_CACHE = {}


def _get_preprocess_resources():
    """Return (stop word set, lemmatizer), building them only on first use."""
    if not _PREPROCESS_CACHE:
        # A frozenset gives constant-time membership tests; the word list is
        # fetched once here instead of once per token.
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Clean one raw review string for modelling.

    Steps, in order: remove every character that is not an ASCII letter or
    whitespace, lower-case, tokenize, drop English stop words, lemmatize each
    remaining token, and join the tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        Space-separated cleaned tokens; an empty string if nothing survives.
    """
    stop_words, lemmatizer = _get_preprocess_resources()

    # Characters are deleted, not replaced by a space, so words separated only
    # by punctuation or digits are fused into one token (e.g. "phone.I" ->
    # "phonei").
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)

    tokens = word_tokenize(letters_only.lower())

    # Filter and lemmatize in a single pass over the tokens.
    cleaned_tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]

    return ' '.join(cleaned_tokens)

In [12]:
# Replace each raw review with its cleaned form (the raw text is overwritten).
df['Reviews'] = df['Reviews'].map(preprocess_text)

In [13]:
# Display the first 5 cleaned reviews, separated by blank lines.
# Number them by position: the index labels are no longer guaranteed to be
# consecutive once null and duplicate rows have been dropped.
for position, review in enumerate(df['Reviews'].head(5), start=1):
    print(f"Review {position}: {review}\n")

Review 1: feel lucky found used phone u used hard phone line someone upgraded sold one son liked old one finally fell apart year didnt want upgrade thank seller really appreciate honesty said used phonei recommend seller highly would

Review 2: nice phone nice grade pantach revue clean set easy set never android phone fantastic say least perfect size surfing social medium great phone samsung

Review 3: pleased

Review 4: work good go slow sometimes good phone love

Review 5: great phone replace lost phone thing volume button work still go setting adjust job eligible upgrade phone againthaanks



In [14]:
# Step 2: Label Data (Mapping rating to sentiment)
def _rating_to_sentiment(rating):
    """Map a star rating to a label: above 3 positive, below 3 negative, else neutral."""
    if rating > 3:
        return 'positive'
    if rating < 3:
        return 'negative'
    return 'neutral'


df['sentiment'] = df['Rating'].apply(_rating_to_sentiment)

## NLTK Sentiment Score Assignment

In [15]:
# Score every review with NLTK's SentimentIntensityAnalyzer. Each call returns
# a dict of scores; the displayed frame has columns neg, neu, pos, compound.
# NOTE(review): df['Reviews'] has already been lower-cased and stripped of
# punctuation and stop words at this point; consider whether the analyzer
# should score the raw text instead. TODO confirm.
sia = SentimentIntensityAnalyzer()

# Build the frame once from the list of dicts, keeping df's index so the scores
# line up with the reviews. Expanding with `.apply(pd.Series)` would create one
# Series per row, which is very slow at this size.
scores = pd.DataFrame(
    [sia.polarity_scores(review) for review in df['Reviews']],
    index=df.index,
)

In [16]:
# Preview the score columns for the first five reviews.
scores.head(5)

Unnamed: 0,neg,neu,pos,compound
0,0.1,0.592,0.308,0.8966
1,0.155,0.445,0.4,0.8548
2,0.0,0.0,1.0,0.4404
3,0.0,0.333,0.667,0.875
4,0.103,0.714,0.183,0.4215


In [18]:
# Attach the score columns to the reviews; rows are aligned on the index label.
# Dropping any score columns left over from a previous run of this cell keeps
# it safe to re-run without duplicating columns.
df = pd.concat(
    [df.drop(columns=scores.columns, errors='ignore'), scores],
    axis=1,
)

# Drop rows with any null value. Rebind to the returned frame:
# dropna(inplace=True) returns None, so assigning its result would set df to None.
df = df.dropna()

In [19]:
# Display the combined frame (reviews, sentiment label and score columns).
df

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,sentiment,neg,neu,pos,compound
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",samsung,199.99,5,feel lucky found used phone u used hard phone ...,1.0,positive,0.100,0.592,0.308,0.8966
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",samsung,199.99,4,nice phone nice grade pantach revue clean set ...,0.0,positive,0.155,0.445,0.400,0.8548
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",samsung,199.99,5,pleased,0.0,positive,0.000,0.000,1.000,0.4404
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",samsung,199.99,4,work good go slow sometimes good phone love,0.0,positive,0.000,0.333,0.667,0.8750
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",samsung,199.99,4,great phone replace lost phone thing volume bu...,0.0,positive,0.103,0.714,0.183,0.4215
...,...,...,...,...,...,...,...,...,...,...,...
413825,Samsung Convoy U640 Phone for Verizon Wireless...,samsung,79.95,5,great phone large key best flip phone owned,0.0,positive,0.000,0.420,0.580,0.8519
413826,Samsung Convoy U640 Phone for Verizon Wireless...,samsung,79.95,5,prosworks great durable easy navigate speaker ...,0.0,positive,0.198,0.428,0.374,0.5106
413827,Samsung Convoy U640 Phone for Verizon Wireless...,samsung,79.95,5,described perfect price,0.0,positive,0.000,0.351,0.649,0.5719
413828,Samsung Convoy U640 Phone for Verizon Wireless...,samsung,79.95,1,would work,0.0,negative,0.000,1.000,0.000,0.0000


In [20]:
# Save the cleaned, labelled and scored reviews to CSV.
# index=False leaves the index out of the file.
df.to_csv('Preprocess_Reviews.csv', index = False)

In [31]:
# Reload the preprocessed reviews written by the save step above, so the
# modelling cells below start from the saved file.
df = pd.read_csv('Preprocess_Reviews.csv')

In [32]:
# Display the reloaded frame to check that it matches what was saved.
df

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,sentiment,neg,neu,pos,compound
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",samsung,199.99,5,feel lucky found used phone u used hard phone ...,1.0,positive,0.100,0.592,0.308,0.8966
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",samsung,199.99,4,nice phone nice grade pantach revue clean set ...,0.0,positive,0.155,0.445,0.400,0.8548
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",samsung,199.99,5,pleased,0.0,positive,0.000,0.000,1.000,0.4404
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",samsung,199.99,4,work good go slow sometimes good phone love,0.0,positive,0.000,0.333,0.667,0.8750
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",samsung,199.99,4,great phone replace lost phone thing volume bu...,0.0,positive,0.103,0.714,0.183,0.4215
...,...,...,...,...,...,...,...,...,...,...,...
281244,Samsung Convoy U640 Phone for Verizon Wireless...,samsung,79.95,5,great phone large key best flip phone owned,0.0,positive,0.000,0.420,0.580,0.8519
281245,Samsung Convoy U640 Phone for Verizon Wireless...,samsung,79.95,5,prosworks great durable easy navigate speaker ...,0.0,positive,0.198,0.428,0.374,0.5106
281246,Samsung Convoy U640 Phone for Verizon Wireless...,samsung,79.95,5,described perfect price,0.0,positive,0.000,0.351,0.649,0.5719
281247,Samsung Convoy U640 Phone for Verizon Wireless...,samsung,79.95,1,would work,0.0,negative,0.000,1.000,0.000,0.0000


In [33]:
# Drop rows containing NaN again after the CSV round trip.
# NOTE(review): presumably reviews that preprocessing reduced to an empty
# string are read back as NaN by read_csv -- confirm.
df = df.dropna()

In [34]:
# Keep up to the first 20,000 rows of each sentiment class. Rows are taken in
# file order, not sampled at random.
df_Positive = df.loc[df['sentiment'] == 'positive'].head(20000)
df_Neutral = df.loc[df['sentiment'] == 'neutral'].head(20000)
df_Negative = df.loc[df['sentiment'] == 'negative'].head(20000)

In [35]:
# Stack the three per-class subsets row-wise into the modelling frame
# (original index labels are kept).
df = pd.concat(
    [
        df_Positive,
        df_Neutral,
        df_Negative,
    ]
)

In [36]:
# Features are the cleaned review strings; targets are the sentiment labels.
X, y = df["Reviews"].values, df['sentiment'].values

# Hold out 30% of the rows for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [37]:
# Step 2: Feature Extraction
# Convert text data into numerical TF-IDF feature vectors.
# The vectorizer is fitted on the training reviews only and then reused to
# transform the test reviews, so test text does not influence the features.
vectorizer = TfidfVectorizer(max_features=5000)  # max_features=5000 is a tunable setting
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

In [38]:
# Step 3: Model Training
# Train a multinomial Naive Bayes classifier, with default settings, on the
# TF-IDF vectors of the training reviews.
clf = MultinomialNB()
clf.fit(X_train_vectors, y_train)

In [39]:
# Step 4: Model Evaluation
# Predict labels for the held-out test vectors, then report overall accuracy
# and the per-class precision / recall / F1 report.
y_pred = clf.predict(X_test_vectors)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.7712777777777777
Classification Report:
              precision    recall  f1-score   support

    negative       0.78      0.78      0.78      5999
     neutral       0.70      0.70      0.70      5987
    positive       0.83      0.83      0.83      6014

    accuracy                           0.77     18000
   macro avg       0.77      0.77      0.77     18000
weighted avg       0.77      0.77      0.77     18000



In [40]:
# Step 5: Fine-tuning
# Search over the Naive Bayes `alpha` parameter with cross-validation (cv=5)
# on the training vectors.
# NOTE(review): the TF-IDF features were fitted on the whole training set
# before this search, so every validation fold has already contributed to the
# vocabulary and weights. Wrapping the vectorizer and classifier in a Pipeline
# would avoid that -- confirm whether it matters here.
param_grid = {'alpha': [0.1, 0.5, 1.0]}  # Candidate values for alpha
grid_search = GridSearchCV(clf, param_grid, cv=5)
grid_search.fit(X_train_vectors, y_train)

In [41]:
# Pull the best estimator and the winning parameter setting from the finished search.
best_clf = grid_search.best_estimator_
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'alpha': 0.1}


In [42]:
# Re-evaluate the best model from the grid search on the same held-out test
# vectors, so the numbers are directly comparable with the Step 4 evaluation.
y_pred_best = best_clf.predict(X_test_vectors)
accuracy_best = accuracy_score(y_test, y_pred_best)
print("Accuracy (Best Model):", accuracy_best)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred_best))

Accuracy (Best Model): 0.771
Classification Report (Best Model):
              precision    recall  f1-score   support

    negative       0.78      0.78      0.78      5999
     neutral       0.70      0.70      0.70      5987
    positive       0.83      0.83      0.83      6014

    accuracy                           0.77     18000
   macro avg       0.77      0.77      0.77     18000
weighted avg       0.77      0.77      0.77     18000

