# Task 1 Data Preprocessing

*Note that only 10000 rows are used to reduce computational power needed.

In [1]:
import pandas as pd
from collections import Counter


In [2]:
data = pd.read_csv("C:/Users/Z4Teen/Desktop/Anaconda Jupiter/data/Amazon.csv", encoding="ISO-8859-1", nrows=10000)
data['Text'].astype(str).drop_duplicates().dropna()
data.head(5)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [3]:
num_duplicates = data.duplicated().sum()
print(f"Number of duplicate rows: {num_duplicates}")


Number of duplicate rows: 0


In [4]:
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


# Ensure you have the necessary NLTK data files
import nltk
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Z4Teen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Z4Teen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
def clean_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize the text
    words = word_tokenize(text)
    # Remove stop words and convert to lowercase
    stop_words = set(stopwords.words('english'))
    words = [word.lower() for word in words if word.lower() not in stop_words]
    # Join words back into a single string
    cleaned_text = ' '.join(words)
    return cleaned_text


In [6]:
data['Cleaned_Text'] = data['Text'].astype(str).apply(clean_text)
print(data.head(5))


   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   
3                     3                       3      2  1307923200   
4                     0                       0      5  1350777600   

                 Summary                                               Text  \
0  Good Quality Dog Food  I have bought several of the Vitality canned d...   


In [7]:
data['Tokenized_Text'] = data['Cleaned_Text'].apply(word_tokenize)


In [8]:
data.head(5)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Cleaned_Text,Tokenized_Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,bought several vitality canned dog food produc...,"[bought, several, vitality, canned, dog, food,..."
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled jumbo salted peanutsth...,"[product, arrived, labeled, jumbo, salted, pea..."
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,confection around centuries light pillowy citr...,"[confection, around, centuries, light, pillowy..."
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,looking secret ingredient robitussin believe f...,"[looking, secret, ingredient, robitussin, beli..."
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,great taffy great price wide assortment yummy ...,"[great, taffy, great, price, wide, assortment,..."


In [9]:
from nltk.stem import PorterStemmer

# Ensure you have the necessary NLTK data files
nltk.download('punkt')
nltk.download('stopwords')

# Initialize the Porter Stemmer
stemmer = PorterStemmer()

def stemming(text):
    stem_text = []
    for word in text:
        stemmed_word = stemmer.stem(word)
        stem_text.append(stemmed_word)
    return stem_text

data['Stemmed_Text'] = data['Tokenized_Text'].apply(stemming)
data.head(5)



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Z4Teen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Z4Teen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Cleaned_Text,Tokenized_Text,Stemmed_Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,bought several vitality canned dog food produc...,"[bought, several, vitality, canned, dog, food,...","[bought, sever, vital, can, dog, food, product..."
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled jumbo salted peanutsth...,"[product, arrived, labeled, jumbo, salted, pea...","[product, arriv, label, jumbo, salt, peanutsth..."
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,confection around centuries light pillowy citr...,"[confection, around, centuries, light, pillowy...","[confect, around, centuri, light, pillowi, cit..."
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,looking secret ingredient robitussin believe f...,"[looking, secret, ingredient, robitussin, beli...","[look, secret, ingredi, robitussin, believ, fo..."
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,great taffy great price wide assortment yummy ...,"[great, taffy, great, price, wide, assortment,...","[great, taffi, great, price, wide, assort, yum..."


In [10]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatization(text):
    lem_text = []
    for word in text:
        lem_word = lemmatizer.lemmatize(word)
        lem_text.append(lem_word)
    return lem_text

data['Lemmatized_Text'] = data['Stemmed_Text'].apply(lemmatization)
data.head(5)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Cleaned_Text,Tokenized_Text,Stemmed_Text,Lemmatized_Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,bought several vitality canned dog food produc...,"[bought, several, vitality, canned, dog, food,...","[bought, sever, vital, can, dog, food, product...","[bought, sever, vital, can, dog, food, product..."
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled jumbo salted peanutsth...,"[product, arrived, labeled, jumbo, salted, pea...","[product, arriv, label, jumbo, salt, peanutsth...","[product, arriv, label, jumbo, salt, peanutsth..."
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,confection around centuries light pillowy citr...,"[confection, around, centuries, light, pillowy...","[confect, around, centuri, light, pillowi, cit...","[confect, around, centuri, light, pillowi, cit..."
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,looking secret ingredient robitussin believe f...,"[looking, secret, ingredient, robitussin, beli...","[look, secret, ingredi, robitussin, believ, fo...","[look, secret, ingredi, robitussin, believ, fo..."
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,great taffy great price wide assortment yummy ...,"[great, taffy, great, price, wide, assortment,...","[great, taffi, great, price, wide, assort, yum...","[great, taffi, great, price, wide, assort, yum..."


# Task 2 Feature Extraction

In [11]:
import math
from collections import Counter

In [12]:
def compute_tf(document):
    word_count = Counter(document)
    tf = {word: count/len(document) for word, count in word_count.items()}
    return tf

def compute_idf(documents):
    N = len(documents)
    idf = {}
    all_words = set(word for doc in documents for word in doc)
    for word in all_words:
        count = sum(1 for doc in documents if word in doc)
        idf[word] = math.log(N/count)
    return idf

def compute_tfidf(document, idf):
    tfidf = {}
    tf = compute_tf(document)
    for word, tf_value in tf.items():
        tfidf[word] = tf_value * idf[word]
    return tfidf

In [13]:

# Compute TF for each document
tf_data = [compute_tf(doc) for doc in data['Lemmatized_Text']]

# Create DataFrame for TF
tf_df = pd.DataFrame(tf_data).fillna(0)
print("TF Scores:")
print(tf_df)

# Compute IDF
idf = compute_idf(data['Lemmatized_Text'])
idf_df = pd.DataFrame([idf]).fillna(0)
print("\nIDF Scores:")
print(idf_df)

# Compute TF-IDF for each document
tfidf_data = [compute_tfidf(doc, idf) for doc in data['Lemmatized_Text']]

# Create DataFrame for TF-IDF
tfidf_df = pd.DataFrame(tfidf_data).fillna(0)
print("\nTF-IDF Scores:")
print(tfidf_df)

TF Scores:
        bought     sever     vital       can       dog      food   product  \
0     0.043478  0.043478  0.043478  0.043478  0.043478  0.043478  0.130435   
1     0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.111111   
2     0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
3     0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
4     0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
...        ...       ...       ...       ...       ...       ...       ...   
9995  0.000000  0.000000  0.000000  0.000000  0.000000  0.024390  0.048780   
9996  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
9997  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.013158   
9998  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
9999  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   

         found      good   qualiti  ...  ulterior  s

In [14]:
data['Score'] = data['Score'].replace({1: 'negative', 2: 'negative', 3: 'neutral', 4: 'positive', 5: 'positive'})
data.head(5)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Cleaned_Text,Tokenized_Text,Stemmed_Text,Lemmatized_Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,bought several vitality canned dog food produc...,"[bought, several, vitality, canned, dog, food,...","[bought, sever, vital, can, dog, food, product...","[bought, sever, vital, can, dog, food, product..."
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled jumbo salted peanutsth...,"[product, arrived, labeled, jumbo, salted, pea...","[product, arriv, label, jumbo, salt, peanutsth...","[product, arriv, label, jumbo, salt, peanutsth..."
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,confection around centuries light pillowy citr...,"[confection, around, centuries, light, pillowy...","[confect, around, centuri, light, pillowi, cit...","[confect, around, centuri, light, pillowi, cit..."
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,looking secret ingredient robitussin believe f...,"[looking, secret, ingredient, robitussin, beli...","[look, secret, ingredi, robitussin, believ, fo...","[look, secret, ingredi, robitussin, believ, fo..."
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...,great taffy great price wide assortment yummy ...,"[great, taffy, great, price, wide, assortment,...","[great, taffi, great, price, wide, assort, yum...","[great, taffi, great, price, wide, assort, yum..."


# Task 3 Model Selection

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tabulate import tabulate


In [16]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import string

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize the WordNet Lemmatizer and Porter Stemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize the text
    words = word_tokenize(text)
    # Remove stop words
    words = [word for word in words if word not in stop_words]
    # Lemmatize the words
    words = [lemmatizer.lemmatize(word) for word in words]
    # Stem the words
    words = [stemmer.stem(word) for word in words]
    # Join words back into a single string
    cleaned_text = ' '.join(words)
    return cleaned_text


data['clean_text'] = data['Cleaned_Text'].astype(str).apply(preprocess)
data.head(5)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Z4Teen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Z4Teen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Z4Teen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Cleaned_Text,Tokenized_Text,Stemmed_Text,Lemmatized_Text,clean_text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,bought several vitality canned dog food produc...,"[bought, several, vitality, canned, dog, food,...","[bought, sever, vital, can, dog, food, product...","[bought, sever, vital, can, dog, food, product...",bought sever vital can dog food product found ...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled jumbo salted peanutsth...,"[product, arrived, labeled, jumbo, salted, pea...","[product, arriv, label, jumbo, salt, peanutsth...","[product, arriv, label, jumbo, salt, peanutsth...",product arriv label jumbo salt peanutsth peanu...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,confection around centuries light pillowy citr...,"[confection, around, centuries, light, pillowy...","[confect, around, centuri, light, pillowi, cit...","[confect, around, centuri, light, pillowi, cit...",confect around centuri light pillowi citru gel...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,looking secret ingredient robitussin believe f...,"[looking, secret, ingredient, robitussin, beli...","[look, secret, ingredi, robitussin, believ, fo...","[look, secret, ingredi, robitussin, believ, fo...",look secret ingredi robitussin believ found go...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...,great taffy great price wide assortment yummy ...,"[great, taffy, great, price, wide, assortment,...","[great, taffi, great, price, wide, assort, yum...","[great, taffi, great, price, wide, assort, yum...",great taffi great price wide assort yummi taff...


In [25]:
output_file_path = 'extracteddata.csv'
data.to_csv(output_file_path, index=False)

print("Extracted data is saved to:", output_file_path)

Extracted data is saved to: extracteddata.csv


In [17]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['clean_text'])
y = data['Score']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [19]:
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

In [20]:
y_pred = nb_classifier.predict(X_test)


# Task 4 Model Evaluation

In [21]:
# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(classification_report(y_test, y_pred))

Accuracy: 0.7606666666666667
Classification Report:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00       480
     neutral       0.00      0.00      0.00       238
    positive       0.76      1.00      0.86      2282

    accuracy                           0.76      3000
   macro avg       0.25      0.33      0.29      3000
weighted avg       0.58      0.76      0.66      3000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)


In [23]:
# Predict sentiment using classifiers
for text, actual_label in zip(X_test, y_test):
    # Predict sentiment using Naive Bayes
    nb_prediction = nb_classifier.predict(text)[0]

    # Predict sentiment using SVM
    svm_prediction = svm_classifier.predict(text)[0]


In [24]:
# Calculate classification report for Naive Bayes
nb_classification_report = classification_report(y_test, nb_classifier.predict(X_test), target_names=['negative', 'neutral', 'positive'])

# Calculate classification report for SVM
svm_classification_report = classification_report(y_test, svm_classifier.predict(X_test), target_names=['negative', 'neutral', 'positive'])

# Print classification report for Naive Bayes
print("\nClassification Report for Naive Bayes:")
print(nb_classification_report)

# Print classification report for SVM
print("\nClassification Report for SVM:")
print(svm_classification_report)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Classification Report for Naive Bayes:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00       480
     neutral       0.00      0.00      0.00       238
    positive       0.76      1.00      0.86      2282

    accuracy                           0.76      3000
   macro avg       0.25      0.33      0.29      3000
weighted avg       0.58      0.76      0.66      3000


Classification Report for SVM:
              precision    recall  f1-score   support

    negative       0.76      0.48      0.59       480
     neutral       0.65      0.09      0.16       238
    positive       0.84      0.98      0.90      2282

    accuracy                           0.83      3000
   macro avg       0.75      0.52      0.55      3000
weighted avg       0.81      0.83      0.79      3000



# Task 5 Discussion

Naive Bayes 

Strength
Naive Bayes is suitable to handle large dataset
Easy to implement and understand

Weakness
Naive Bayes assumes all feature are equally independent, might lead to lower performance.



SVM

Strength
Efficient with dimensional data
Less prone to overfit

Weakness
Very computationally intensive with large datasets