In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix

In [2]:
def get_data(bigdf,f_sentiment,f_product,f_reviewc,labels=None):
    """Select feature groups from ``bigdf`` and attach the spam label.

    Parameters
    ----------
    bigdf : pandas.DataFrame
        Feature table; must contain the columns of every enabled group.
    f_sentiment, f_product, f_reviewc : bool
        Switches for the 'sentiment', 'product' and 'review_centric'
        feature groups respectively.
    labels : pandas.DataFrame, optional
        Label table to append column-wise. When omitted, the notebook-level
        ``label_spam`` frame is used, which keeps existing three-flag calls
        working unchanged.

    Returns
    -------
    (data, fname) : tuple
        ``data`` holds the selected feature columns followed by the label
        column(s), with rows aligned to ``labels.index``. ``fname`` is a
        display name built from the enabled groups, e.g. ``'Sentiment RC '``
        (each tag keeps its trailing space).
    """
    if labels is None:
        # Fall back to the global label table so older call sites still work.
        labels = label_spam

    map_features = {
        'sentiment': ['sentiment_bert_raw','negative_word','positive_word'],
        'product'  : ['product_avg_rating','product_std_rating','rating_diff'],
        'review_centric' : ['review_rating','question','links','word_count','product_name_mention','num_mentioned_brand','brand_mention'],
    }
    # (flag, feature-group key, tag appended to the display name)
    groups = (
        (f_sentiment, 'sentiment', 'Sentiment '),
        (f_product, 'product', 'Product '),
        (f_reviewc, 'review_centric', 'RC '),
    )

    features = []
    fname = ""
    for enabled, group, tag in groups:
        if enabled:
            features += map_features[group]
            fname += tag

    df = bigdf[features]
    # Column-wise concat aligns on the index; reindex then keeps exactly the
    # labelled rows, in the order of the label table.
    data = pd.concat([df, labels], axis=1).reindex(labels.index)
    return data,fname

In [3]:
# Shared MinMaxScaler instance; preprocess_data re-fits it on each continuous column.
scaler = MinMaxScaler()
def preprocess_data(data):
    """Turn a feature+label frame into a model-ready ``(X, y)`` pair.

    Columns listed as continuous are min-max scaled in place with the
    module-level ``scaler``. Every other feature column is one-hot encoded,
    its last dummy level is dropped, and the remaining dummies are named
    ``'<level>_<column>'`` and appended at the end of the frame.

    NOTE(review): the scaler is fit on all rows of ``data``; if the result is
    cross-validated afterwards, held-out rows take part in the scaling.

    Returns
    -------
    (X, y) : the transformed feature frame and a copy of ``data['is_spam']``.
    """
    continuous_cols = (
        'word_count','rating_diff','product_avg_rating','num_mentioned_brand',
        'review_text_length','negative_word','positive_word','product_std_rating',
    )
    features = data.drop(columns=['is_spam']).copy()

    # Snapshot the original column names: the frame is rebuilt inside the loop.
    for name in list(features.columns):
        series = features[name]
        if name in continuous_cols:
            features[name] = scaler.fit_transform(series.values.reshape(-1, 1))
            continue

        # Categorical column: keep all but the last dummy level to avoid the
        # dummy variable trap, then swap the raw column for its indicators.
        indicators = pd.get_dummies(series).iloc[:, :-1]
        indicators.columns = ['_'.join((str(level), name)) for level in indicators.columns]
        features = pd.concat([features, indicators], axis=1).drop(columns=[name])

    target = data['is_spam'].copy()
    return features, target

In [4]:
def view_results(X,y,fname,clf):
    """Score ``clf`` on ``(X, y)`` with 5-fold cross-validated predictions.

    Parameters
    ----------
    X, y : features and labels passed to ``cross_val_predict``.
    fname : unused; kept so existing call sites keep working.
    clf : estimator to evaluate.

    Returns
    -------
    list
        ``[F1 of class '0', F1 of class '1', accuracy, macro-averaged F1]``.
        The per-class lookups use the report keys ``'0'`` and ``'1'``, so the
        labels in ``y`` are assumed to render as those strings.
    """
    pred = cross_val_predict(clf, X, y, cv=5)
    # zero_division=0 reports the same 0.0 score as the default when a class
    # is never predicted, but without emitting an UndefinedMetricWarning.
    cr = classification_report(y, pred, output_dict=True, zero_division=0)
    return [
        cr['0']['f1-score'],
        cr['1']['f1-score'],
        cr['accuracy'],
        cr['macro avg']['f1-score'],
    ]

In [5]:
# Load the feature table and the spam labels; both are tab-separated and
# indexed by their `idx` column.
bigdf = pd.read_csv('../Dataset/reviews_features.tsv', sep='\t', index_col='idx')
label_spam = pd.read_csv('../Dataset/review_spam_labeled.tsv', sep='\t', index_col='idx')
# Give the single label column the name the rest of the notebook expects.
label_spam.columns = ['is_spam']

In [6]:
# Display the loaded feature table as this cell's output for a visual check.
bigdf

Unnamed: 0_level_0,sentiment_bert_raw,negative_word,positive_word,product_avg_rating,product_std_rating,rating_diff,review_rating,question,links,word_count,product_name_mention,num_mentioned_brand,brand_mention,is_spam
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
9722,positive,0,3,3.64,0.771389,-0.64,3,no,no,33,yes,0,no,0.0
10212,positive,1,1,3.82,0.922611,-2.82,1,no,no,52,yes,0,no,1.0
10231,positive,7,12,3.82,0.922611,0.18,4,no,no,42,no,3,yes,1.0
10548,positive,6,7,3.30,1.100000,-1.30,2,no,no,26,no,0,no,1.0
10630,negative,4,3,3.98,1.108482,0.02,4,no,no,41,no,1,no,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4976,positive,4,4,4.11,0.566558,-0.11,4,no,no,35,no,3,no,0.0
7475,positive,5,3,3.86,0.914732,0.14,4,no,no,29,yes,2,no,0.0
11038,positive,4,4,4.28,0.825228,-0.28,4,no,no,72,yes,8,yes,0.0
3275,negative,2,6,3.14,1.186661,-0.14,3,no,no,52,yes,20,no,0.0


In [7]:
# Every on/off assignment of the three feature groups, in the order
# [sentiment, product, RC], skipping the one where all groups are off.
combination = [
    [use_sentiment, use_product, use_rc]
    for use_sentiment in (False, True)
    for use_product in (False, True)
    for use_rc in (False, True)
    if use_sentiment or use_product or use_rc
]
combination #sentiment // product // RC

[[False, False, True],
 [False, True, False],
 [False, True, True],
 [True, False, False],
 [True, False, True],
 [True, True, False],
 [True, True, True]]

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report
# from sklearn.svm import LinearSVC
# from sklearn.naive_bayes import GaussianNB
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.ensemble import RandomForestClassifier
# # from sklearn.tree import DecisionTreeClassifier
# from sklearn.neural_network import MLPClassifier
# from sklearn.neighbors import KNeighborsClassifier

In [9]:
# Classifiers to compare, keyed by the name used in the printout and in the
# CSV file name. Decision Tree, Naive Bayes and Random Forest are left disabled.
# NOTE(review): probability=True looks unnecessary here, since only
# cross_val_predict's default predictions are consumed; confirm before removing.
models = {
    'Logistic Regression' : LogisticRegression(random_state=11),
    'SVC rbf': SVC( kernel='rbf',C=2, probability=True, random_state=42),
}

# For each model, score every feature-group combination and collect one row
# per combination, then save and show the table.
for key, clf in models.items():
    print(f'Model : {key}')
    scores = {}
    for c in combination:
        data, fname = get_data(bigdf, *c)
        X, y = preprocess_data(data)
        scores[fname] = view_results(X, y, fname, clf)
    results = pd.DataFrame.from_dict(
        scores,
        orient='index',
        columns=['F1-Score 0','F1-Score 1','Accuracy','macro-F1'],
    )
    results.to_csv(f'{key}_results.csv')
    display(results)

Model : Logistic Regression


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,F1-Score 0,F1-Score 1,Accuracy,macro-F1
RC,0.858537,0.325581,0.766129,0.592059
Product,0.864662,0.181818,0.767742,0.52324
Product RC,0.859127,0.387931,0.770968,0.623529
Sentiment,0.858195,0.0,0.751613,0.429098
Sentiment RC,0.841285,0.234742,0.737097,0.538014
Sentiment Product,0.865728,0.182857,0.769355,0.524292
Sentiment Product RC,0.856578,0.366812,0.766129,0.611695


Model : SVC rbf


Unnamed: 0,F1-Score 0,F1-Score 1,Accuracy,macro-F1
RC,0.846377,0.22439,0.743548,0.535384
Product,0.864491,0.401747,0.779032,0.633119
Product RC,0.857988,0.362832,0.767742,0.61041
Sentiment,0.856879,0.012739,0.75,0.434809
Sentiment RC,0.885496,0.674923,0.830645,0.780209
Sentiment Product,0.872763,0.452991,0.793548,0.662877
Sentiment Product RC,0.885496,0.674923,0.830645,0.780209
