# Text classification (sentiment analysis)
Task: Predict sentiment of Amazon reviews
Dataset: Beans from TFDS

## 1. Loading dataset & basic preprocessing
- removal of reviews shorter than 5 characters
- mapping from 1-5 -> 0,1,2
- subsampling - without replacement, random state 42, 80 000 rows

In [201]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from IPython.display import display
import re
import matplotlib.pyplot as plt
import nltk

In [189]:
df = pd.read_csv('datasets/amazon_reviews_us_Major_Appliances_v1_00.tsv', sep='\t', on_bad_lines='skip')

In [190]:
df.shape

(96834, 15)

In [191]:
# remove nas and duplicate reviews
df.dropna(axis=0, subset=['review_body'], inplace=True)
df.drop_duplicates(subset=['review_body'], inplace=True)

In [192]:
df.shape

(93446, 15)

In [193]:
stopword_list = nltk.corpus.stopwords.words("english")

In [194]:
def remove_tags(review):
    return re.sub(pattern='<.*?>', string=review , repl=' ') 

def keep_alnum(review):
    return re.sub(pattern='[^A-Za-z\d\s:]', string=review, repl=' ')

def strip_spaces(review):
    return re.sub(pattern='[\s]{2,}', string=review, repl=' ')

def lowercase(review):
    return review.lower()

def remove_stopwords(review):
    review_list = review.split()
    return " ".join([word for word in review_list if word not in stopword_list])

In [195]:
df['review_body'] = df['review_body'].apply(remove_tags) # removes html tags
df['review_body'] = df['review_body'].apply(keep_alnum) # removes sub unicode char
df['review_body'] = df['review_body'].apply(strip_spaces) # strip all unnecessary whitespaces
df['review_body'] = df['review_body'].apply(lowercase) # put everything into lowercase
df['review_body'] = df['review_body'].apply(remove_stopwords) # put everything into lowercase
df = df[df['review_body'].str.len() > 5]               # keep only reviews longer than 5 characters


In [196]:
df

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,16199106,R203HPW78Z7N4K,B0067WNSZY,633038551,"FGGF3032MW Gallery Series 30"" Wide Freestandin...",Major Appliances,5,0,0,N,Y,"If you need a new stove, this is a winner.",great stove wonderful replacement sort antique...,2015-08-31
1,US,16374060,R2EAIGVLEALSP3,B002QSXK60,811766671,Best Hand Clothes Wringer,Major Appliances,5,1,1,N,Y,Five Stars,worked great,2015-08-31
2,US,15322085,R1K1CD73HHLILA,B00EC452R6,345562728,Supco SET184 Thermal Cutoff Kit,Major Appliances,5,0,0,N,Y,Fast Shipping,part exactly needed saved purchasing,2015-08-31
3,US,32004835,R2KZBMOFRMYOPO,B00MVVIF2G,563052763,Midea WHS-160RB1 Compact Single Reversible Doo...,Major Appliances,5,1,1,N,Y,Five Stars,love refrigerator keeps everything cold recommend,2015-08-31
4,US,25414497,R6BIZOZY6UD01,B00IY7BNUW,874236579,Avalon Bay Portable Ice Maker,Major Appliances,5,0,0,N,Y,Five Stars,running store ice works perfectly,2015-08-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96829,US,37431087,R3CYIDM3UEY5PA,B00005O64S,222987122,Haier HDT18PA Space Saver Compact Countertop D...,Major Appliances,4,37,43,N,N,Pretty good dishwasher for small apartment,pretty good dishwasher price good job cleaning...,2002-07-14
96830,US,44686434,R1PLFLGSA6N9WU,B00005O64T,802734810,Haier America HSE02-WNAWW 1.8-Cubic-Foot Capac...,Major Appliances,1,33,39,N,N,Does not last long,bought office extremely dissatisfied stopped w...,2002-06-03
96831,US,36739731,RBPARLMOY6ZU5,B00005O64S,222987122,Haier HDT18PA Space Saver Compact Countertop D...,Major Appliances,5,6,45,N,N,Rave review for space saver,saw small dishwasher thought wonderful idea sm...,2002-05-05
96832,US,50744080,RSS5TDZOGUEB6,B00004SACT,344802997,Sanyo Two-Door 2.9 Cubic Foot Refrigerator,Major Appliances,4,71,71,N,N,Sanyo compact refrigerator,probably best small refrigerator market true f...,2000-09-29


In [197]:
df = df[['review_body', 'star_rating']]
df.loc[df['star_rating'] < 3, 'sentiment'] = 0
df.loc[df['star_rating'] == 3, 'sentiment'] = 1
df.loc[df['star_rating'] > 3, 'sentiment'] = 2
df.drop('star_rating', axis=1, inplace=True)
df = resample(df, n_samples=80000, random_state=42, replace=False)
print(df.shape)

(80000, 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df['star_rating'] < 3, 'sentiment'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop('star_rating', axis=1, inplace=True)


In [198]:
df

Unnamed: 0,review_body,sentiment
71963,usually rate things hate exactly feel small lg...,0.0
15508,hard open must securely mounted provisions,1.0
3110,15 months stopped working threw groceries call...,0.0
88804,silver version tiny cooler fridge company chri...,2.0
18295,exactly needed get dryer back,2.0
...,...,...
62973,purchased replace space 12 34 garbage compacto...,2.0
45682,bought item professionally installed middle ap...,0.0
84940,purchased nice looking supposedly good name br...,0.0
1873,videoid:8829556f67d2453e377e6459465db27e first...,2.0


## 2. Final dataset
- 80 000 instances
- NEGATIVE 21 334
- NEUTRAL 5 674
- POSITIVE 52 992 
- 90:10 train:test split

In [199]:
df['sentiment'].value_counts()

sentiment
2.0    52992
0.0    21334
1.0     5674
Name: count, dtype: int64

In [202]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df['review_body'], df['sentiment'], random_state=42, test_size=0.1, stratify=df['sentiment']
)

In [204]:
y_train.value_counts()

sentiment
2.0    47693
0.0    19200
1.0     5107
Name: count, dtype: int64

In [205]:
y_test.value_counts()

sentiment
2.0    5299
0.0    2134
1.0     567
Name: count, dtype: int64