In [1]:
import pandas as pd
import numpy as np
from textstat.textstat import textstat

In [2]:
data = pd.read_csv('amazon_reviews.txt', delimiter='\t')
data['LABEL'] = data['LABEL'].map({'__label1__': 'fake', '__label2__': 'real'})
data['LABEL_INT'] = data['LABEL'].map({'fake': 1, 'real': 0})
data['VERIFIED_PURCHASE_INT'] = data['VERIFIED_PURCHASE'].map({'Y': 1, 'N': 0})

data

Unnamed: 0,DOC_ID,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT,LABEL_INT,VERIFIED_PURCHASE_INT
0,1,fake,4,N,PC,B00008NG7N,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"When least you think so, this product will sav...",1,0
1,2,fake,4,Y,Wireless,B00LH0Y3NM,Note 3 Battery : Stalion Strength Replacement ...,New era for batteries,Lithium batteries are something new introduced...,1,1
2,3,fake,3,N,Baby,B000I5UZ1Q,"Fisher-Price Papasan Cradle Swing, Starlight",doesn't swing very well.,I purchased this swing for my baby. She is 6 m...,1,0
3,4,fake,4,N,Office Products,B003822IRA,Casio MS-80B Standard Function Desktop Calculator,Great computing!,I was looking for an inexpensive desk calcolat...,1,0
4,5,fake,4,N,Beauty,B00PWSAXAM,Shine Whitening - Zero Peroxide Teeth Whitenin...,Only use twice a week,I only use it twice a week and the results are...,1,0
5,6,fake,3,N,Health & Personal Care,B00686HNUK,Tobacco Pipe Stand - Fold-away Portable - Ligh...,not sure,I'm not sure what this is supposed to be but I...,1,0
6,7,fake,4,N,Toys,B00NUG865W,ESPN 2-Piece Table Tennis,PING PONG TABLE GREAT FOR YOUTHS AND FAMILY,Pleased with ping pong table. 11 year old and ...,1,0
7,8,fake,4,Y,Beauty,B00QUL8VX6,Abundant Health 25% Vitamin C Serum with Vitam...,Great vitamin C serum,Great vitamin C serum... I really like the oil...,1,1
8,9,fake,4,N,Health & Personal Care,B004YHKVCM,PODS Spring Meadow HE Turbo Laundry Detergent ...,wonderful detergent.,I've used tide pods laundry detergent for many...,1,0
9,10,fake,1,N,Health & Personal Care,B00H4IBD0M,"Sheer TEST, Best Testosterone Booster Suppleme...",WARNING: do not waste your money on this,Everybody wants to fall for their promises. Bu...,1,0


### Question 1 - Correlation

In [3]:
data[['LABEL_INT', 'PRODUCT_CATEGORY']].groupby('PRODUCT_CATEGORY').agg(['sum', 'count', 'mean'])

Unnamed: 0_level_0,LABEL_INT,LABEL_INT,LABEL_INT
Unnamed: 0_level_1,sum,count,mean
PRODUCT_CATEGORY,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Apparel,350,700,0.5
Automotive,350,700,0.5
Baby,350,700,0.5
Beauty,350,700,0.5
Books,350,700,0.5
Camera,350,700,0.5
Electronics,350,700,0.5
Furniture,350,700,0.5
Grocery,350,700,0.5
Health & Personal Care,350,700,0.5


In [4]:
data['LABEL_INT'].corr(data['VERIFIED_PURCHASE_INT'])

-0.56981624262119279

In [5]:
data['LABEL_INT'].corr(data['RATING'])

-0.0097972205512207866

In [6]:
data[['LABEL_INT', 'RATING']].groupby('RATING').agg(['sum', 'count', 'mean'])

Unnamed: 0_level_0,LABEL_INT,LABEL_INT,LABEL_INT
Unnamed: 0_level_1,sum,count,mean
RATING,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,889,1757,0.505976
2,627,1192,0.526007
3,926,1868,0.495717
4,1999,3973,0.503146
5,6059,12210,0.496233


### Question 2 - 

In [57]:
data['REVIEW_COUNT'] = data['REVIEW_TEXT'].str.count(r'\w+')

In [59]:
data[['LABEL', 'REVIEW_COUNT']].groupby('LABEL').agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,REVIEW_COUNT,REVIEW_COUNT,REVIEW_COUNT
Unnamed: 0_level_1,count,mean,std
LABEL,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
fake,10500,61.050476,60.870686
real,10500,81.65381,109.870801


In [76]:
data['READABILITY'] = data['REVIEW_TEXT'].apply(textstat.flesch_reading_ease)

In [77]:
data[['LABEL', 'READABILITY']].groupby('LABEL').agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,READABILITY,READABILITY,READABILITY
Unnamed: 0_level_1,count,mean,std
LABEL,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
fake,10500,79.759028,13.044638
real,10500,79.029707,13.18713


In [99]:
from nltk.corpus import stopwords

stopWords = set(stopwords.words('english'))

def count_stopwords(text):
    c = 0
    for word in text.split():
        if word in stopWords:
            c += 1
    return c

In [100]:
data['STOPWORDS_COUNT'] = data['REVIEW_TEXT'].apply(count_stopwords)

In [101]:
data[['LABEL', 'STOPWORDS_COUNT']].groupby('LABEL').agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,STOPWORDS_COUNT,STOPWORDS_COUNT,STOPWORDS_COUNT
Unnamed: 0_level_1,count,mean,std
LABEL,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
fake,10500,24.69619,24.325351
real,10500,32.519048,43.813539


In [102]:
data['STOPWORDS_RATIO'] = data['STOPWORDS_COUNT'] / data['REVIEW_COUNT']

In [103]:
data[['LABEL', 'STOPWORDS_RATIO']].groupby('LABEL').agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,STOPWORDS_RATIO,STOPWORDS_RATIO,STOPWORDS_RATIO
Unnamed: 0_level_1,count,mean,std
LABEL,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
fake,10500,0.40198,0.07637
real,10500,0.393652,0.077381


In [92]:
from string import punctuation

def count_punctuation(text):
    c = 0
    for word in text:
        if word in punctuation:
            c += 1
    return c

In [93]:
data['PUNCTUATION_COUNT'] = data['REVIEW_TEXT'].apply(count_punctuation)

In [94]:
data[['LABEL', 'PUNCTUATION_COUNT']].groupby('LABEL').agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,PUNCTUATION_COUNT,PUNCTUATION_COUNT,PUNCTUATION_COUNT
Unnamed: 0_level_1,count,mean,std
LABEL,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
fake,10500,10.182571,15.482145
real,10500,15.571524,25.888301


In [95]:
data['PUNCTUATION_RATIO'] = data['PUNCTUATION_COUNT'] / data['REVIEW_COUNT']

In [96]:
data[['LABEL', 'PUNCTUATION_RATIO']].groupby('LABEL').agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,PUNCTUATION_RATIO,PUNCTUATION_RATIO,PUNCTUATION_RATIO
Unnamed: 0_level_1,count,mean,std
LABEL,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
fake,10500,0.157543,0.091244
real,10500,0.178093,0.144681


In [109]:
def count_upper(text):
    return sum(1 for char in text if char.isupper())

In [110]:
data['UPPER_COUNT'] = data['REVIEW_TEXT'].apply(count_upper)
data[['LABEL', 'UPPER_COUNT']].groupby('LABEL').agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,UPPER_COUNT,UPPER_COUNT,UPPER_COUNT
Unnamed: 0_level_1,count,mean,std
LABEL,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
fake,10500,8.712667,24.175636
real,10500,12.099905,27.639396


In [111]:
data['UPPER_RATIO'] = data['UPPER_COUNT'] / data['REVIEW_COUNT']
data[['LABEL', 'UPPER_RATIO']].groupby('LABEL').agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,UPPER_RATIO,UPPER_RATIO,UPPER_RATIO
Unnamed: 0_level_1,count,mean,std
LABEL,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
fake,10500,0.136546,0.222843
real,10500,0.150311,0.262332


In [121]:
def name_in_text(row):
    text, title = row
    name_list = title.split()
    for name in name_list:
        if name in text:
            return 1
    return 0

In [124]:
data['IS_NAME_IN_TEXT'] = data[['REVIEW_TEXT', 'PRODUCT_TITLE']].apply(name_in_text, axis=1)
data[['LABEL', 'IS_NAME_IN_TEXT']].groupby('LABEL').agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,IS_NAME_IN_TEXT,IS_NAME_IN_TEXT,IS_NAME_IN_TEXT
Unnamed: 0_level_1,count,mean,std
LABEL,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
fake,10500,0.456,0.498084
real,10500,0.448381,0.497352


### Question 3 - Sentiment analysis

In [126]:
from textblob import TextBlob

In [127]:
def sentiment_score(text):
    tb = TextBlob(text)
    return tb.sentiment.polarity

In [129]:
data['SENTIMENT_SCORE'] = data['REVIEW_TEXT'].apply(sentiment_score)
data[['LABEL', 'SENTIMENT_SCORE']].groupby('LABEL').agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,SENTIMENT_SCORE,SENTIMENT_SCORE,SENTIMENT_SCORE
Unnamed: 0_level_1,count,mean,std
LABEL,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
fake,10500,0.261757,0.234817
real,10500,0.233616,0.228647


#### my sentiment classifier is in ex1A_5+B_3