# Part A

In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.classify import SklearnClassifier, accuracy
from random import shuffle
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import KFold, train_test_split
from sklearn.svm import LinearSVC
import math
import numpy as np
import string

In [2]:
data = pd.read_csv('amazon_reviews.txt', delimiter='\t')
data['LABEL'] = data['LABEL'].map({'__label1__': 'fake', '__label2__': 'real'})
data

Unnamed: 0,DOC_ID,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT
0,1,fake,4,N,PC,B00008NG7N,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"When least you think so, this product will sav..."
1,2,fake,4,Y,Wireless,B00LH0Y3NM,Note 3 Battery : Stalion Strength Replacement ...,New era for batteries,Lithium batteries are something new introduced...
2,3,fake,3,N,Baby,B000I5UZ1Q,"Fisher-Price Papasan Cradle Swing, Starlight",doesn't swing very well.,I purchased this swing for my baby. She is 6 m...
3,4,fake,4,N,Office Products,B003822IRA,Casio MS-80B Standard Function Desktop Calculator,Great computing!,I was looking for an inexpensive desk calcolat...
4,5,fake,4,N,Beauty,B00PWSAXAM,Shine Whitening - Zero Peroxide Teeth Whitenin...,Only use twice a week,I only use it twice a week and the results are...
5,6,fake,3,N,Health & Personal Care,B00686HNUK,Tobacco Pipe Stand - Fold-away Portable - Ligh...,not sure,I'm not sure what this is supposed to be but I...
6,7,fake,4,N,Toys,B00NUG865W,ESPN 2-Piece Table Tennis,PING PONG TABLE GREAT FOR YOUTHS AND FAMILY,Pleased with ping pong table. 11 year old and ...
7,8,fake,4,Y,Beauty,B00QUL8VX6,Abundant Health 25% Vitamin C Serum with Vitam...,Great vitamin C serum,Great vitamin C serum... I really like the oil...
8,9,fake,4,N,Health & Personal Care,B004YHKVCM,PODS Spring Meadow HE Turbo Laundry Detergent ...,wonderful detergent.,I've used tide pods laundry detergent for many...
9,10,fake,1,N,Health & Personal Care,B00H4IBD0M,"Sheer TEST, Best Testosterone Booster Suppleme...",WARNING: do not waste your money on this,Everybody wants to fall for their promises. Bu...


In [3]:
stemmer = SnowballStemmer('english')
stopWords = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')
lmtzr = WordNetLemmatizer()

def preProcess(text):
    tokens = tokenizer.tokenize(text)
#     result = [stemmer.stem(t) for t in tokens if t not in stopWords]
    result = [lmtzr.lemmatize(t) for t in tokens if t not in stopWords]
    
    return result

# vectorizer = CountVectorizer()

featureDict = {} # A global dictionary of features

# def toFeatureVector(row):
#     # Should return a dictionary containing features as keys, and weights as values
    
#     tokens = row[0]

#     feature_vector = {}
#     for token in tokens:
#         if token in feature_vector:
#             feature_vector[token] += 1
#         else:
#             feature_vector[token] = 1
        
#         if token in featureDict:
#             featureDict[token] += 1
#         else:
#             featureDict[token] = 1
            
#     for i in range(len(tokens)-1):
#         token = (tokens[i] + ' ' + tokens[i+1])
#         if token in feature_vector:
#             feature_vector[token] += 1
#         else:
#             feature_vector[token] = 1
        
#         if token in featureDict:
#             featureDict[token] += 1
#         else:
#             featureDict[token] = 1
    
#     if(len(row)>1):
#         feature_vector['RATING'] = row[1]*2
#         feature_vector['VERIFIED_PURCHASE'] = row[2]
#         feature_vector['PRODUCT_CATEGORY'] = row[3]
            
#     return feature_vector

def toFeatureVector(row):
    # Should return a dictionary containing features as keys, and weights as values
    
    tokens = row[0]

    feature_vector = {}
    for token in tokens:
        if token in feature_vector:
            feature_vector[token] += 1
        else:
            feature_vector[token] = 1
        
        # implement TFIDF
        if feature_vector[token] == 1:
            if token in featureDict:
                featureDict[token] += 1
            else:
                featureDict[token] = 1
    
    for i in range(len(tokens)-1):
        token = (tokens[i] + ' ' + tokens[i+1])
        if token in feature_vector:
            feature_vector[token] += 1
        else:
            feature_vector[token] = 1
        
        if feature_vector[token] == 1:
            if token in featureDict:
                featureDict[token] += 1
            else:
                featureDict[token] = 1
            
    return feature_vector

review_count = data.shape[0]

def tfidf(row):
    result = {}
    
    featureVector = row[0]
    for token, count in featureVector.items():
        result[token] = (1 + math.log10(count))*math.log10(review_count/featureDict[token])
    
    
    if(len(row)>1):
        result['RATING'] = row[1]*2
        result['VERIFIED_PURCHASE'] = row[2]
        result['PRODUCT_CATEGORY'] = row[3]
    
    return result

In [5]:
data['TOKEN'] = data['REVIEW_TEXT'].apply(preProcess)
# review_vector = data[['TOKEN', 'RATING', 'VERIFIED_PURCHASE', 'PRODUCT_CATEGORY']].apply(toFeatureVector, axis=1)

data['FEATURE_VECTOR'] = data[['TOKEN']].apply(toFeatureVector, axis=1)
review_vector = data[['FEATURE_VECTOR', 'RATING', 'VERIFIED_PURCHASE', 'PRODUCT_CATEGORY']].apply(tfidf, axis=1)

In [16]:
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC(loss='hinge', max_iter=3000, C=1))])
    return SklearnClassifier(pipeline).train(trainData)

def predictLabels_cv(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: t[0], reviewSamples))

# def predictLabels(reviewSamples, classifier):
#     return classifier.classify_many(map(lambda t: toFeatureVector(preProcess(t[1])), reviewSamples))

# def predictLabel(reviewSample, classifier):
#     return classifier.classify(toFeatureVector(preProcess(reviewSample)))


def crossValidate(dataset, folds):
#     shuffle(dataset)
#     foldSize = math.ceil(len(dataset)/folds)

    cv_results = []
    kf = KFold(n_splits=folds, shuffle=True)
    
    scores = np.array([0,0,0,0])
    for train_index, test_index in kf.split(dataset):
        X_train, X_test = dataset[train_index], dataset[test_index]
        classifier = trainClassifier(X_train)
        y_pred = predictLabels_cv(X_test, classifier)
        y_true = X_test[:, 1]
        
        acc = accuracy(classifier, X_test)
        prfs = precision_recall_fscore_support(y_true, y_pred, average='weighted')
        scores = scores + np.array([prfs[0], prfs[1], prfs[2], acc])

    scores = scores / folds
    cv_results = {'precision': scores[0], 'recall': scores[1],
                  'f1': scores[2], 'accuracy': scores[3]}
    return cv_results

In [17]:
data['REVIEW_VOCTOR'] = review_vector

selected_data = data[['REVIEW_VOCTOR', 'LABEL']]
selected_data

Unnamed: 0,REVIEW_VOCTOR,LABEL
0,"{'When': 1.5039934011199638, 'least': 1.734508...",fake
1,"{'Lithium': 3.720159303405957, 'battery': 1.94...",fake
2,"{'I': 0.19942450850459625, 'purchased': 1.3827...",fake
3,"{'I': 0.19942450850459625, 'looking': 1.233374...",fake
4,"{'I': 0.19942450850459625, 'use': 1.0408708038...",fake
5,"{'I': 0.17565062218556182, 'sure': 1.369426851...",fake
6,"{'Pleased': 3.322219294733919, 'ping': 3.54406...",fake
7,"{'Great': 1.2912150133703824, 'vitamin': 2.581...",fake
8,"{'I': 0.13500889508386657, 'used': 1.084424301...",fake
9,"{'Everybody': 3.720159303405957, 'want': 1.194...",fake


In [18]:
train_data, testData = train_test_split(selected_data, test_size=0.2)

# train_data = selected_data.values
crossValidate(train_data.values, 10)

Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...


{'accuracy': 0.8102976190476191,
 'f1': 0.81013717605128144,
 'precision': 0.81157881825272116,
 'recall': 0.8102976190476191}

### Testing on test dataset

In [19]:
classifier = trainClassifier(train_data.values)
y_pred = predictLabels_cv(testData.values, classifier)
y_true = testData.values[:, 1]

acc = accuracy(classifier, testData.values)
prfs = precision_recall_fscore_support(y_true, y_pred, average='weighted')

{'precision': prfs[0], 'recall': prfs[1], 'f1': prfs[2], 'accuracy': acc}

Training Classifier...


{'accuracy': 0.8135714285714286,
 'f1': 0.81343903542773155,
 'precision': 0.81457235729021504,
 'recall': 0.81357142857142861}

# Part B 

In [20]:
from textstat.textstat import textstat

In [21]:
# data = pd.read_csv('amazon_reviews.txt', delimiter='\t')
# data['LABEL'] = data['LABEL'].map({'__label1__': 'fake', '__label2__': 'real'})
data['LABEL_INT'] = data['LABEL'].map({'fake': 1, 'real': 0})
data['VERIFIED_PURCHASE_INT'] = data['VERIFIED_PURCHASE'].map({'Y': 1, 'N': 0})

data

Unnamed: 0,DOC_ID,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT,TOKEN,FEATURE_VECTOR,REVIEW_VOCTOR,LABEL_INT,VERIFIED_PURCHASE_INT
0,1,fake,4,N,PC,B00008NG7N,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"When least you think so, this product will sav...","[When, least, think, product, save, day, Just,...","{'When': 1, 'least': 1, 'think': 1, 'product':...","{'When': 1.5039934011199638, 'least': 1.734508...",1,0
1,2,fake,4,Y,Wireless,B00LH0Y3NM,Note 3 Battery : Stalion Strength Replacement ...,New era for batteries,Lithium batteries are something new introduced...,"[Lithium, battery, something, new, introduced,...","{'Lithium': 1, 'battery': 2, 'something': 1, '...","{'Lithium': 3.720159303405957, 'battery': 1.94...",1,1
2,3,fake,3,N,Baby,B000I5UZ1Q,"Fisher-Price Papasan Cradle Swing, Starlight",doesn't swing very well.,I purchased this swing for my baby. She is 6 m...,"[I, purchased, swing, baby, She, 6, month, pre...","{'I': 3, 'purchased': 1, 'swing': 2, 'baby': 1...","{'I': 0.19942450850459625, 'purchased': 1.3827...",1,0
3,4,fake,4,N,Office Products,B003822IRA,Casio MS-80B Standard Function Desktop Calculator,Great computing!,I was looking for an inexpensive desk calcolat...,"[I, looking, inexpensive, desk, calcolatur, It...","{'I': 3, 'looking': 1, 'inexpensive': 1, 'desk...","{'I': 0.19942450850459625, 'looking': 1.233374...",1,0
4,5,fake,4,N,Beauty,B00PWSAXAM,Shine Whitening - Zero Peroxide Teeth Whitenin...,Only use twice a week,I only use it twice a week and the results are...,"[I, use, twice, week, result, great, I, used, ...","{'I': 3, 'use': 2, 'twice': 1, 'week': 2, 'res...","{'I': 0.19942450850459625, 'use': 1.0408708038...",1,0
5,6,fake,3,N,Health & Personal Care,B00686HNUK,Tobacco Pipe Stand - Fold-away Portable - Ligh...,not sure,I'm not sure what this is supposed to be but I...,"[I, sure, supposed, I, would, recommend, littl...","{'I': 2, 'sure': 1, 'supposed': 1, 'would': 1,...","{'I': 0.17565062218556182, 'sure': 1.369426851...",1,0
6,7,fake,4,N,Toys,B00NUG865W,ESPN 2-Piece Table Tennis,PING PONG TABLE GREAT FOR YOUTHS AND FAMILY,Pleased with ping pong table. 11 year old and ...,"[Pleased, ping, pong, table, 11, year, old, 13...","{'Pleased': 1, 'ping': 1, 'pong': 1, 'table': ...","{'Pleased': 3.322219294733919, 'ping': 3.54406...",1,0
7,8,fake,4,Y,Beauty,B00QUL8VX6,Abundant Health 25% Vitamin C Serum with Vitam...,Great vitamin C serum,Great vitamin C serum... I really like the oil...,"[Great, vitamin, C, serum, I, really, like, oi...","{'Great': 1, 'vitamin': 1, 'C': 1, 'serum': 1,...","{'Great': 1.2912150133703824, 'vitamin': 2.581...",1,1
8,9,fake,4,N,Health & Personal Care,B004YHKVCM,PODS Spring Meadow HE Turbo Laundry Detergent ...,wonderful detergent.,I've used tide pods laundry detergent for many...,"[I, used, tide, pod, laundry, detergent, many,...","{'I': 1, 'used': 1, 'tide': 1, 'pod': 1, 'laun...","{'I': 0.13500889508386657, 'used': 1.084424301...",1,0
9,10,fake,1,N,Health & Personal Care,B00H4IBD0M,"Sheer TEST, Best Testosterone Booster Suppleme...",WARNING: do not waste your money on this,Everybody wants to fall for their promises. Bu...,"[Everybody, want, fall, promise, But, relative...","{'Everybody': 1, 'want': 1, 'fall': 1, 'promis...","{'Everybody': 3.720159303405957, 'want': 1.194...",1,0


### Question 1 - Correlation

In [22]:
data[['LABEL_INT', 'PRODUCT_CATEGORY']].groupby('PRODUCT_CATEGORY').agg(['sum', 'count', 'mean'])

Unnamed: 0_level_0,LABEL_INT,LABEL_INT,LABEL_INT
Unnamed: 0_level_1,sum,count,mean
PRODUCT_CATEGORY,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Apparel,350,700,0.5
Automotive,350,700,0.5
Baby,350,700,0.5
Beauty,350,700,0.5
Books,350,700,0.5
Camera,350,700,0.5
Electronics,350,700,0.5
Furniture,350,700,0.5
Grocery,350,700,0.5
Health & Personal Care,350,700,0.5


In [23]:
data['LABEL_INT'].corr(data['VERIFIED_PURCHASE_INT'])

-0.56981624262119279

In [24]:
data['LABEL_INT'].corr(data['RATING'])

-0.0097972205512207866

In [25]:
data[['LABEL_INT', 'RATING']].groupby('RATING').agg(['sum', 'count', 'mean'])

Unnamed: 0_level_0,LABEL_INT,LABEL_INT,LABEL_INT
Unnamed: 0_level_1,sum,count,mean
RATING,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,889,1757,0.505976
2,627,1192,0.526007
3,926,1868,0.495717
4,1999,3973,0.503146
5,6059,12210,0.496233


### Question 2

In [26]:
data['REVIEW_COUNT'] = data['REVIEW_TEXT'].str.count(r'\w+')
data[['LABEL', 'REVIEW_COUNT']].groupby('LABEL').agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,REVIEW_COUNT,REVIEW_COUNT,REVIEW_COUNT
Unnamed: 0_level_1,count,mean,std
LABEL,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
fake,10500,61.050476,60.870686
real,10500,81.65381,109.870801


In [27]:
data['READABILITY'] = data['REVIEW_TEXT'].apply(textstat.flesch_reading_ease)
data[['LABEL', 'READABILITY']].groupby('LABEL').agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,READABILITY,READABILITY,READABILITY
Unnamed: 0_level_1,count,mean,std
LABEL,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
fake,10500,79.759028,13.044638
real,10500,79.029707,13.18713


In [28]:
from nltk.corpus import stopwords

stopWords = set(stopwords.words('english'))

def count_stopwords(text):
    c = 0
    for word in text.split():
        if word in stopWords:
            c += 1
    return c

In [29]:
data['STOPWORDS_COUNT'] = data['REVIEW_TEXT'].apply(count_stopwords)
data[['LABEL', 'STOPWORDS_COUNT']].groupby('LABEL').agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,STOPWORDS_COUNT,STOPWORDS_COUNT,STOPWORDS_COUNT
Unnamed: 0_level_1,count,mean,std
LABEL,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
fake,10500,24.69619,24.325351
real,10500,32.519048,43.813539


In [30]:
data['STOPWORDS_RATIO'] = data['STOPWORDS_COUNT'] / data['REVIEW_COUNT']
data[['LABEL', 'STOPWORDS_RATIO']].groupby('LABEL').agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,STOPWORDS_RATIO,STOPWORDS_RATIO,STOPWORDS_RATIO
Unnamed: 0_level_1,count,mean,std
LABEL,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
fake,10500,0.40198,0.07637
real,10500,0.393652,0.077381


In [31]:
from string import punctuation

def count_punctuation(text):
    c = 0
    for word in text:
        if word in punctuation:
            c += 1
    return c

In [32]:
data['PUNCTUATION_COUNT'] = data['REVIEW_TEXT'].apply(count_punctuation)
data[['LABEL', 'PUNCTUATION_COUNT']].groupby('LABEL').agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,PUNCTUATION_COUNT,PUNCTUATION_COUNT,PUNCTUATION_COUNT
Unnamed: 0_level_1,count,mean,std
LABEL,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
fake,10500,10.182571,15.482145
real,10500,15.571524,25.888301


In [33]:
data['PUNCTUATION_RATIO'] = data['PUNCTUATION_COUNT'] / data['REVIEW_COUNT']
data[['LABEL', 'PUNCTUATION_RATIO']].groupby('LABEL').agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,PUNCTUATION_RATIO,PUNCTUATION_RATIO,PUNCTUATION_RATIO
Unnamed: 0_level_1,count,mean,std
LABEL,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
fake,10500,0.157543,0.091244
real,10500,0.178093,0.144681


In [34]:
def count_upper(text):
    return sum(1 for char in text if char.isupper())

In [35]:
data['UPPER_COUNT'] = data['REVIEW_TEXT'].apply(count_upper)
data[['LABEL', 'UPPER_COUNT']].groupby('LABEL').agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,UPPER_COUNT,UPPER_COUNT,UPPER_COUNT
Unnamed: 0_level_1,count,mean,std
LABEL,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
fake,10500,8.712667,24.175636
real,10500,12.099905,27.639396


In [36]:
data['UPPER_RATIO'] = data['UPPER_COUNT'] / data['REVIEW_COUNT']
data[['LABEL', 'UPPER_RATIO']].groupby('LABEL').agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,UPPER_RATIO,UPPER_RATIO,UPPER_RATIO
Unnamed: 0_level_1,count,mean,std
LABEL,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
fake,10500,0.136546,0.222843
real,10500,0.150311,0.262332


In [37]:
def name_in_text(row):
    text, title = row
    name_list = title.split()
    for name in name_list:
        if name in text:
            return 1
    return 0

In [38]:
data['IS_NAME_IN_TEXT'] = data[['REVIEW_TEXT', 'PRODUCT_TITLE']].apply(name_in_text, axis=1)
data[['LABEL', 'IS_NAME_IN_TEXT']].groupby('LABEL').agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,IS_NAME_IN_TEXT,IS_NAME_IN_TEXT,IS_NAME_IN_TEXT
Unnamed: 0_level_1,count,mean,std
LABEL,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
fake,10500,0.456,0.498084
real,10500,0.448381,0.497352


### Question 3 - Sentiment Analysis

In [39]:
data['SENTIMENT_FROM_RATING'] = data['RATING'].map({5: 'positive', 4: 'positive', 3: 'neutral', 2: 'negative', 1: 'negative'}) 

In [40]:
print('positive:', (data['SENTIMENT_FROM_RATING'] == 'positive').sum())
print('negative:', (data['SENTIMENT_FROM_RATING'] == 'negative').sum())
print('neutral:', (data['SENTIMENT_FROM_RATING'] == 'neutral').sum())

positive: 16183
negative: 2949
neutral: 1868


In [41]:
selected_data2 = data[['REVIEW_VOCTOR', 'SENTIMENT_FROM_RATING']]
selected_data2

Unnamed: 0,REVIEW_VOCTOR,SENTIMENT_FROM_RATING
0,"{'When': 1.5039934011199638, 'least': 1.734508...",positive
1,"{'Lithium': 3.720159303405957, 'battery': 1.94...",positive
2,"{'I': 0.19942450850459625, 'purchased': 1.3827...",neutral
3,"{'I': 0.19942450850459625, 'looking': 1.233374...",positive
4,"{'I': 0.19942450850459625, 'use': 1.0408708038...",positive
5,"{'I': 0.17565062218556182, 'sure': 1.369426851...",neutral
6,"{'Pleased': 3.322219294733919, 'ping': 3.54406...",positive
7,"{'Great': 1.2912150133703824, 'vitamin': 2.581...",positive
8,"{'I': 0.13500889508386657, 'used': 1.084424301...",positive
9,"{'Everybody': 3.720159303405957, 'want': 1.194...",negative


In [42]:
train_data, testData = train_test_split(selected_data2, test_size=0.2)
crossValidate(train_data.values, 10)

# crossValidate(selected_data2.values, 10)

Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...


{'accuracy': 0.9180357142857144,
 'f1': 0.8908175525753077,
 'precision': 0.90809010831628767,
 'recall': 0.9180357142857144}

In [43]:
classifier = trainClassifier(train_data.values)
y_pred = predictLabels_cv(testData.values, classifier)
y_true = testData.values[:, 1]

acc = accuracy(classifier, testData.values)
prfs = precision_recall_fscore_support(y_true, y_pred, average='weighted')

{'precision': prfs[0], 'recall': prfs[1], 'f1': prfs[2], 'accuracy': acc}

Training Classifier...


{'accuracy': 0.9154761904761904,
 'f1': 0.88869568851928615,
 'precision': 0.90283824662183854,
 'recall': 0.91547619047619044}

#### Testing with TextBlob sentiment analysis

In [44]:
from textblob import TextBlob

def sentiment_score(text):
    tb = TextBlob(text)
    return tb.sentiment.polarity

In [45]:
data['SENTIMENT_SCORE'] = data['REVIEW_TEXT'].apply(sentiment_score)
data[['LABEL', 'SENTIMENT_SCORE']].groupby('LABEL').agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,SENTIMENT_SCORE,SENTIMENT_SCORE,SENTIMENT_SCORE
Unnamed: 0_level_1,count,mean,std
LABEL,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
fake,10500,0.261757,0.234817
real,10500,0.233616,0.228647
