# Introduction
In this exercise, a Natual Language Processing algorithm will be created using the Amazon Reviews dataset: https://www.kaggle.com/datasets/bittlingmayer/amazonreviews?select=train.ft.txt.bz2

For this exercise, a sentiment analysis will be done.

In [1]:
# Install langdetect if you haven't already
#%pip install langdetect

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import the necessary modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import bz2
import re
from nltk.tokenize.regexp import regexp_tokenize
from langdetect import detect
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler

In [3]:
train_file = bz2.BZ2File("train.ft.txt.bz2")
test_file = bz2.BZ2File("test.ft.txt.bz2")

# Load and decode
lines_train = [x.decode('utf-8') for x in train_file.readlines()]
lines_test = [x.decode('utf-8') for x in test_file.readlines()]

# Split in two: sentiment and review
score_review_list_train = [re.split("\s+", l.strip('__label__'), 1) for l in lines_train]
score_review_list_test = [re.split("\s+", l.strip('__label__'), 1) for l in lines_test]

# Create dataframe and take random sample to speed up execution time
train = pd.DataFrame(score_review_list_train, columns = ['score', 'review'])\
.sample(n = 40000, random_state = 21).reset_index(drop = True)
test = pd.DataFrame(score_review_list_test, columns = ['score', 'review'] )\
.sample(n = 16000, random_state = 21).reset_index(drop = True)

train.head()

Unnamed: 0,score,review
0,2,Thigh hi compression stockinstockings: Excelle...
1,2,Worth the Journey: Few artists can capture a m...
2,2,In The Kitchen with Rosie has the best ever ch...
3,2,"Incredible suspense, strong values and a book ..."
4,2,"A source of jubilation: At last, In France, a ..."


In [4]:
# Create n_tokens feature
train['n_tokens'] = [len(regexp_tokenize(line, '\w+')) for line in train.review]
test['n_tokens'] = [len(regexp_tokenize(line, '\w+')) for line in test.review]

train.head()

Unnamed: 0,score,review,n_tokens
0,2,Thigh hi compression stockinstockings: Excelle...,24
1,2,Worth the Journey: Few artists can capture a m...,156
2,2,In The Kitchen with Rosie has the best ever ch...,70
3,2,"Incredible suspense, strong values and a book ...",31
4,2,"A source of jubilation: At last, In France, a ...",153


In [5]:
train.shape

(40000, 3)

In [6]:
languages = []

for i in train.review:
    try:
        lang = detect(i)
    except:
        lang = "not_detected"
    languages.append(lang)

train['language'] = languages

train.head()

Unnamed: 0,score,review,n_tokens,language
0,2,Thigh hi compression stockinstockings: Excelle...,24,en
1,2,Worth the Journey: Few artists can capture a m...,156,en
2,2,In The Kitchen with Rosie has the best ever ch...,70,en
3,2,"Incredible suspense, strong values and a book ...",31,en
4,2,"A source of jubilation: At last, In France, a ...",153,en


In [7]:
languages = []

for i in test.review:
    try:
        lang = detect(i)
    except:
        lang = "not_detected"
    languages.append(lang)

    

test['language'] = languages

test.head()

Unnamed: 0,score,review,n_tokens,language
0,1,BUTTONS STINK...he's right!!!: I got this as m...,105,en
1,1,the GOOD and the BAD: dont think that it is a ...,34,en
2,2,Uniden Clearer than Radio Shack Scanners: This...,63,en
3,2,Sara Ivanhoe Knows How to Help You Relax: This...,75,en
4,1,Broken on receipt: A strand was broken and two...,43,en


## Create tokenize using bag-of-words

In [8]:
print(train.language.unique())
print(test.language.unique())

['en' 'pt' 'es' 'so' 'fr' 'id' 'af' 'pl' 'de' 'it' 'sq' 'hu' 'da']
['en' 'sq' 'es' 'fr' 'de' 'pt' 'id' 'cy' 'so' 'da']


In [9]:
# Create list of stopwords based on languages
stop_words = ['english', 'german', 'french', 'spanish', 'portuguese', 'polish', 'italian']

count_vect = CountVectorizer(stop_words = stop_words, max_features = 1000, min_df = 50)

# Create the bag-of-words vector for the train and test set
count_train = count_vect.fit_transform(train.review)
count_test = count_vect.transform(test.review)

# Create X_train and X_test based on vectors 
X_train = pd.DataFrame(count_train.A, columns = count_vect.get_feature_names())
X_test = pd.DataFrame(count_test.A, columns = count_vect.get_feature_names())

# Add the n_tokens and language columns
# NOTE: language will be converted to boolean: 1 for English, 0 for non-English
X_train['n_tokens'], X_train['language'] = train.n_tokens, np.where(train['language'] == 'en', 1, 0)
X_test['n_tokens'], X_test['language'] = test.n_tokens, np.where(test['language'] == 'en', 1, 0)

# Set y_train and y_test based on the score column
y_train = train.score
y_test = test.score

X_train.head()

Unnamed: 0,10,100,12,15,20,30,50,able,about,absolutely,...,wrote,year,years,yes,yet,you,young,your,yourself,n_tokens
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,24
1,0,0,0,0,0,0,0,0,0,0,...,0,1,2,0,0,1,1,0,1,156
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,70
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,31
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,153


In [10]:
print(X_train.shape)
print(X_test.shape)

(40000, 1001)
(16000, 1001)


In [11]:
# Create an instance for MultinomialNB classifier
multi_nb = MultinomialNB(alpha = 0.5)

# Train the data based on the train set
multi_nb.fit(X_train, y_train)

# Create the predictions
pred = multi_nb.predict(X_test)

# Get the accuracy and print it
acc = accuracy_score(y_test, pred)

print(f"Accuracy score: {acc}")

# Get the confusion matrix and print it
conf_matrix = confusion_matrix(y_test, pred)
print(conf_matrix)

Accuracy score: 0.8368125
[[6606 1314]
 [1297 6783]]
