# ***Sentiment Analyses***
__________________

In [48]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [8]:
# A dataset on unlocked mobile phones sold on Amazon

In [9]:
amazon = pd.read_csv(r"D:/Applied-Text-Mining-in-Python/resources/Amazon_Unlocked_Mobile.csv")
amazon.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


In [10]:
amazon.columns

Index(['Product Name', 'Brand Name', 'Price', 'Rating', 'Reviews',
       'Review Votes'],
      dtype='object')

In [11]:
amazon.shape

(413840, 6)

In [12]:
# We'll focus only on the Rating & Reviews columns.

amazon.dropna(axis = 0, inplace = True)

# remove all records with a rating 3, assuming these are neutral on a rating system 0 - 5
amazon.drop(amazon.index[amazon.Rating == 3], axis = 0, inplace = True)

# A new column that binarily labels the data
# If review > 3, positively rated (1) else not-positively rated. (0)

amazon["positively_rated"] = amazon.Rating.apply(lambda rating: 1 if rating > 3 else 0)

In [13]:
amazon

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,positively_rated
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0,1
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0,1
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0,1
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0,1
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0,1
...,...,...,...,...,...,...,...
413832,Samsung Convoy U640 Phone for Verizon Wireless...,Samsung,79.95,4,good rugged phone that has a long-lasting batt...,0.0,1
413834,Samsung Convoy U640 Phone for Verizon Wireless...,Samsung,79.95,1,used hard,0.0,0
413835,Samsung Convoy U640 Phone for Verizon Wireless...,Samsung,79.95,5,another great deal great price,0.0,1
413837,Samsung Convoy U640 Phone for Verizon Wireless...,Samsung,79.95,5,Passes every drop test onto porcelain tile!,0.0,1


In [14]:
amazon.shape

(308277, 7)

In [15]:
amazon.positively_rated.mean()

0.7482686025879323

In [17]:
# mean of positively_rated shows that we have imbalanced classes
# That is, the distribution is skewed
# There are more data points in the positively rated class than in the non-positively rated class.

In [18]:
train_x, test_x, train_labs, test_labs = train_test_split(amazon.Reviews, amazon.positively_rated, random_state = 0)

In [21]:
train_x[0]

"I feel so LUCKY to have found this used (phone to us & not used hard at all), phone on line from someone who upgraded and sold this one. My Son liked his old one that finally fell apart after 2.5+ years and didn't want an upgrade!! Thank you Seller, we really appreciate it & your honesty re: said used phone.I recommend this seller very highly & would but from them again!!"

In [23]:
train_x.shape

(231207,)

In [24]:
# We need to convert this text data into a numeric representation that Sklearn can use.
# Bag of words approach is a commonly used basic technique, that ignores structure and just counts the frequency of words.

# count vectorizer converts a collection of text documents into a matrix of token counts, using the bag-of-words approach

In [26]:
# CountVectorizer tokenizes each document by finding all sequences of characters (?words)
# separated by word boundaries
# converts everything to lowercase
# and builds a vocabulary using these tokens.

In [27]:
vector = CountVectorizer().fit(train_x)

In [31]:
# This vocabulary was built using all tokens from the training data.

vector.get_feature_names_out()[::1000]

array(['00', '2048those', '4less', '99303', 'adr6275', 'andentering',
       'assignment', 'bandwidth', 'blazingly', 'bullets', 'cassettes',
       'cleary', 'condishion', 'cpl', 'debi', 'deғιnιтely',
       'dollarsshipping', 'ele', 'esteem', 'eyeglasses', 'flashy',
       'fusion2', 'gorila', 'hasbro', 'human', 'inefficiencies', 'irullu',
       'kinds', 'like', 'makeup', 'microsaudered', 'msgi', 'nightmarish',
       'oldy', 'p770', 'phalet', 'poori', 'productsaid', 'quirky',
       'rediculoius', 'responseive', 'sadness', 'send', 'sir', 'sos',
       'storecons', 'synch', 'them', 'trace', 'unconditional', 'utiles',
       'waiste', 'withstanding', 'zeis'], dtype=object)

In [32]:
len(vector.get_feature_names_out())

53216

In [33]:
# We will be working with a whopping 53,216 features.

In [34]:
# Next, transform this vocabulary vector into a matrix (SciPy sparse matrix)
# where, each row corresponds to a document and each column corresponds to a word from our vocabulary

In [41]:
print(f"So, expect the dimension of the matrix to be {train_x.shape[0]} x {len(vector.get_feature_names_out())} shaped")

So, expect the dimension of the matrix to be 231207 x 53216 shaped


In [36]:
train_x_vectorized = vector.transform(train_x)

In [38]:
train_x_vectorized

<231207x53216 sparse matrix of type '<class 'numpy.int64'>'
	with 6117776 stored elements in Compressed Sparse Row format>

In [43]:
# Entries in this matrix are the frequency of each word in each document.
# Since the number of words in most reviews are far too smaller than the number of words in the vocabulary, most entries in this 
# matrix are 0.

In [46]:
lreg = LogisticRegression(max_iter = 100_000)

In [47]:
lreg.fit(train_x_vectorized, train_labs)

In [49]:
# Transform the test data using the vectorizer that was fitted to the training data.
# Any words in test_x that weren't in train_x will just be ignored.

predictions = lreg.predict(vector.transform(test_x))

print(f"AOC score is {roc_auc_score(test_labs, predictions)}")

AOC score is 0.9305195681511472
