In [212]:
#libraries
import pandas as pd
import sqlite3
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

In [180]:
#connecting sqlite database
# NOTE(review): absolute, machine-specific path — consider moving it to a configurable constant.
conn = sqlite3.connect("C:/Users/Afzal Sufiya/Documents/MasterThesis/AmazonReviews1.db")
try:
    # Load the complete "reviews" table into a DataFrame
    query = "SELECT * FROM reviews"
    data = pd.read_sql_query(query, conn)
finally:
    # Release the connection even if the query raises
    conn.close()

In [None]:
# New feature "review_length": number of characters in each review description
description_lengths = data['description'].str.len()
data['review_length'] = description_lengths
print(data['review_length'])

In [None]:
#SMOTE
# Single oversampler instance reused by every later cell that calls
# smote.fit_resample(...); random_state=42 fixes the seed passed to SMOTE.
smote = SMOTE(random_state=42)

## Descriptive Analysis 

In [None]:
# Summary statistics of review length and rating, split by prediction label
da_an = data  # alias of `data` (same object, not a copy)
grouped_by_prediction = data.groupby('prediction')

stats1 = grouped_by_prediction['review_length'].describe()
print(stats1)

stats2 = grouped_by_prediction['rating'].describe()
print(stats2)


In [None]:
# Number of reviews for every (rating, prediction) pair; one column per prediction label
rat_pred = da_an.groupby(['rating', 'prediction']).size().unstack(fill_value=0)

# Plotting: two bars per rating, shifted half a bar width left/right of the rating value
fig, ax = plt.subplots()
bar_width = 0.35
index = rat_pred.index
half_width = bar_width / 2

bars_fake = ax.bar(index - half_width, rat_pred[0], bar_width, label='Fake', color='dimgrey', edgecolor='black')
bars_real = ax.bar(index + half_width, rat_pred[1], bar_width, label='Real', color='lightgrey', edgecolor='black')

# Adding values on top of bars (fake bars first, then real bars)
for bar in (*bars_fake, *bars_real):
    height = bar.get_height()
    ax.annotate(
        f'{height}',
        xy=(bar.get_x() + bar.get_width() / 2, height),
        xytext=(0, 3),  # 3 points vertical offset
        textcoords="offset points",
        ha='center',
        va='bottom',
    )

ax.set(
    xlabel='Star Ratings',
    ylabel='Number of Reviews',
    title='Distribution of Fake and Real Reviews by Star Rating',
)
ax.legend()

plt.show()


In [None]:
# Number of reviews for every (category, prediction) pair; one column per prediction label
cat_pred = da_an.groupby(['category', 'prediction']).size().unstack(fill_value=0)

# Plotting: one pie per prediction label, side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))

# Colour coding matches the star-rating bar chart: fake (column 0) is dimgrey,
# real (column 1) is lightgrey. Previously the two colours were swapped here.
cat_pred[0].plot(kind='pie', ax=ax1, autopct='%1.1f%%', colors=['dimgrey'], wedgeprops={'edgecolor': 'black'})
ax1.set_title('Fake Reviews by Category')
ax1.set_ylabel('')

cat_pred[1].plot(kind='pie', ax=ax2, autopct='%1.1f%%', colors=['lightgrey'], wedgeprops={'edgecolor': 'black'})
ax2.set_title('Real Reviews by Category')
ax2.set_ylabel('')

plt.show()

In [None]:
# category, subcategory, and preds plotting

# Creates a 5x2 grid of axes sharing the y-axis and flattens it into a 1-D array.
# NOTE(review): nothing is drawn on these axes in this cell — the plotting code appears to be missing.
fig, axes = plt.subplots(5, 2, figsize=(14, 20), sharey=True)
axes = axes.flatten()


## Feature Analysis

In [None]:
#function for regression table

def RegressionSummary(xs, ys, add_intercept=False):
    """Fit a statsmodels Logit of ``ys`` on ``xs`` and return its coefficient table.

    Parameters
    ----------
    xs : array-like or DataFrame
        Explanatory variables, passed to ``sm.Logit`` as ``exog``.
    ys : array-like or Series
        Target, passed to ``sm.Logit`` as ``endog``.
    add_intercept : bool, default False
        ``sm.Logit`` does not add an intercept on its own. When True, a
        constant column is added with ``sm.add_constant`` before fitting.
        The default (False) keeps the original no-intercept behaviour; leave
        it False when ``xs`` already holds a complete set of one-hot dummies,
        because those columns sum to a constant.

    Returns
    -------
    DataFrame
        Second table of ``result.summary2()``, sorted by the ``'Coef.'``
        column in descending order.
    """
    if add_intercept:
        # 'skip' leaves xs untouched if it already contains a constant column
        xs = sm.add_constant(xs, has_constant='skip')

    # fitting data into logit function
    logit_model = sm.Logit(ys, xs)
    result = logit_model.fit()

    # summary table, largest coefficient first
    summary_table = result.summary2().tables[1]
    summary_table = summary_table.sort_values(by='Coef.', ascending=False)

    return summary_table


In [191]:
#function to fit the data into models and extract feature importance

def FitModels_FeatureImportance(xs, ys, feature_names, random_state=42):
    """Fit four classifiers on ``(xs, ys)`` and tabulate per-feature importance.

    Parameters
    ----------
    xs : feature matrix used to fit every classifier.
    ys : target labels.
    feature_names : list
        Names placed in the ``'Feature'`` column; must have one entry per
        column of ``xs``.
    random_state : int, default 42
        Seed passed to the random forest, extra-trees and XGBoost models so
        that repeated runs give the same importances. Previously only the
        random forest was seeded.

    Returns
    -------
    DataFrame
        Columns ``Feature``, ``LogReg_Importance``, ``RandomForest_Importance``,
        ``ExtraTrees_Importance``, ``XGBoost_Importance`` and
        ``Average_Importance`` (row-wise mean of the four importance columns),
        sorted by ``Average_Importance`` in descending order.
    """
    # Initialize classifiers
    classifiers = {
        'logistic_regression': LogisticRegression(max_iter=10000),
        'random_forest': RandomForestClassifier(random_state=random_state),
        'extra_trees': ExtraTreesClassifier(random_state=random_state),
        'xgboost': XGBClassifier(random_state=random_state),
    }

    # Fit classifiers
    for clf in classifiers.values():
        clf.fit(xs, ys)

    # Logistic regression has no feature_importances_; use the absolute
    # coefficients, averaged over the rows of coef_.
    # NOTE(review): these magnitudes are not normalised, whereas the tree-based
    # importances presumably sum to 1 — the plain average below may therefore be
    # dominated by the logistic-regression column. Confirm this is intended.
    coefficients = classifiers['logistic_regression'].coef_
    fp_lr = np.mean(np.abs(coefficients), axis=0)

    fp_rf = classifiers['random_forest'].feature_importances_
    fp_et = classifiers['extra_trees'].feature_importances_
    fp_xg = classifiers['xgboost'].feature_importances_

    # Create a DataFrame for feature importances
    fp = pd.DataFrame({
        'Feature': feature_names,
        'LogReg_Importance': fp_lr,
        'RandomForest_Importance': fp_rf,
        'ExtraTrees_Importance': fp_et,
        'XGBoost_Importance': fp_xg
    })

    # Calculating average feature importance across the four models
    importance_columns = ['LogReg_Importance', 'RandomForest_Importance', 'ExtraTrees_Importance', 'XGBoost_Importance']
    fp['Average_Importance'] = fp[importance_columns].mean(axis=1)

    # Sorting by average importance
    return fp.sort_values(by='Average_Importance', ascending=False)

### Individual Features

#### "Category" feature

In [4]:
#preparing dataset for "Category" feature
cat_non_feature_cols = ['review_length', 'subcategory', 'rating', 'result', 'product', 'title', 'description', 'id', 'prediction']

# one-hot encode 'category' (all levels kept)
data_cat = pd.get_dummies(data, columns=['category'], drop_first=False)

# target as int64; features are the dummy columns left after dropping the rest, as float64
y_cat = data_cat['prediction'].astype(np.int64)
x_cat = data_cat.drop(columns=cat_non_feature_cols).astype(np.float64)

#implementation of SMOTE
xs_cat, ys_cat = smote.fit_resample(x_cat, y_cat)


In [None]:
# Logit coefficient table for the resampled category dummies
reg_cat = RegressionSummary(xs=xs_cat, ys=ys_cat)
display(reg_cat)

In [None]:
# Feature importance scores for the category dummies
cat_feature_names = xs_cat.columns.tolist()
fp_cat = FitModels_FeatureImportance(xs_cat, ys_cat, cat_feature_names)
display(fp_cat)

In [None]:
# Bar chart of the average importance of each category dummy
plt.figure(figsize=(10, 8))
ax_cat = sns.barplot(data=fp_cat, x='Feature', y='Average_Importance', palette='flare')
ax_cat.set(
    title='Average Feature Importance Across Different Categories',
    xlabel='Feature',
    ylabel='Average Importance',
)
plt.xticks(rotation=90)
plt.show()

#### "Subcategory" feature

In [188]:
#preparing dataset for "Subcategory" feature
sub_non_feature_cols = ['review_length', 'category', 'rating', 'result', 'product', 'title', 'description', 'id', 'prediction']

# one-hot encode 'subcategory' (all levels kept)
data_sub = pd.get_dummies(data, columns=['subcategory'], drop_first=False)

# target as int64; features are the dummy columns left after dropping the rest, as float64
y_sub = data_sub['prediction'].astype(np.int64)
x_sub = data_sub.drop(columns=sub_non_feature_cols).astype(np.float64)

#implementation of SMOTE
xs_sub, ys_sub = smote.fit_resample(x_sub, y_sub)


In [None]:
# Logit coefficient table for the resampled subcategory dummies
reg_sub = RegressionSummary(xs=xs_sub, ys=ys_sub)
display(reg_sub)

In [None]:
# Feature importance scores for the subcategory dummies
sub_feature_names = xs_sub.columns.tolist()
fp_sub = FitModels_FeatureImportance(xs_sub, ys_sub, sub_feature_names)
display(fp_sub)

In [None]:
# Bar chart of the average importance of each subcategory dummy
plt.figure(figsize=(10, 8))
ax_sub = sns.barplot(data=fp_sub, x='Feature', y='Average_Importance', palette='flare')
ax_sub.set(
    title='Average Feature Importance Across Different Sub-Categories',
    xlabel='Feature',
    ylabel='Average Importance',
)
plt.xticks(rotation=90)
plt.show()


### Feature Interactions

#### 1. "review length" & "ratings" feature interaction

In [194]:
#preparing dataset for "Review Length" & "Ratings" feature
data_rr=data

# Select the two numeric features explicitly and in a fixed order. The interaction
# cell indexes the scaled array positionally (column 0, column 1) and labels the
# columns 'ratings', 'review_length', so the order must not depend on how `data`
# happens to be laid out or on which other columns it contains.
x_rr = data_rr[['rating', 'review_length']]

#scaling the numerical features
scaler = StandardScaler()
x_rr_scaled = scaler.fit_transform(x_rr)


In [195]:
#interaction term

# Product of the first two scaled columns
# (assumes column 0 is rating and column 1 is review_length — depends on the column order of x_rr)
first_scaled = x_rr_scaled[:, 0]
second_scaled = x_rr_scaled[:, 1]
interaction_rr = first_scaled * second_scaled

# Append the interaction as a third column and label all three
x_rr_interaction_scaled = np.column_stack((x_rr_scaled, interaction_rr))
rr_column_names = ['ratings', 'review_length', 'rating_review_length_interaction']
x_rr_interaction_df = pd.DataFrame(x_rr_interaction_scaled, columns=rr_column_names)

# target
y_rr_interaction = data['prediction']

#SMOTE
xs_rr, ys_rr = smote.fit_resample(x_rr_interaction_df, y_rr_interaction)


In [None]:
# Logit coefficient table for rating, review length and their interaction
reg_rr = RegressionSummary(xs=xs_rr, ys=ys_rr)
display(reg_rr)

In [None]:
# Feature importance scores for rating, review length and their interaction
rr_feature_names = xs_rr.columns.tolist()
fp_rr = FitModels_FeatureImportance(xs_rr, ys_rr, rr_feature_names)
display(fp_rr)

In [None]:
# Bar chart of the average importance of rating, review length and their interaction
plt.figure(figsize=(8, 5))
ax_rr = sns.barplot(data=fp_rr, x='Feature', y='Average_Importance', palette='flare', width=0.3)
ax_rr.set(
    title='Average Feature Importance Across Ratings*Review_Length',
    xlabel='Feature',
    ylabel='Average Importance',
)
plt.xticks(rotation=40)
plt.show()



#### 2. "category" & "rating"

In [None]:
# One-hot encode 'category' (all levels kept)
data_cat = pd.get_dummies(data, columns=['category'], drop_first=False)

# Rating column plus the category dummies (everything left after dropping the other columns)
rating = data_cat['rating']
categories = data_cat.drop(columns=['review_length', 'subcategory', 'rating', 'result', 'product', 'title', 'description', 'id', 'prediction'])

# Interaction terms: rating multiplied by each category dummy, one column per dummy
interaction_features = pd.DataFrame({
    f'rating_{col}': rating * categories[col]
    for col in categories.columns
})

# Features (X) and target (y)
X_comb_cr = interaction_features
y_comb_cr = data['prediction']

print(f"Original feature count: {categories.shape[1] + 1}")  # rating + number of category dummies
print(f"Interaction feature count: {interaction_features.shape[1]}")  # one interaction per category dummy


In [201]:
# Standardise the rating*category interaction features, keeping the column names
scaler = StandardScaler()
X_comb_cr_scaled = pd.DataFrame(
    scaler.fit_transform(X_comb_cr),
    columns=X_comb_cr.columns,
)

#SMOTE
xs_cr, ys_cr = smote.fit_resample(X_comb_cr_scaled, y_comb_cr)


In [None]:
# Logit coefficient table for the rating*category interactions
reg_cr = RegressionSummary(xs=xs_cr, ys=ys_cr)
display(reg_cr)

In [None]:
# Feature importance scores for the rating*category interactions
cr_feature_names = xs_cr.columns.tolist()
fp_cr = FitModels_FeatureImportance(xs_cr, ys_cr, cr_feature_names)
display(fp_cr)

In [None]:
# Bar chart of the average importance of each rating*category interaction
plt.figure(figsize=(10, 8))
ax_cr = sns.barplot(data=fp_cr, x='Feature', y='Average_Importance', palette='flare')
ax_cr.set(
    title='Average Feature Importance Across "ratings * category"',
    xlabel='Feature',
    ylabel='Average Importance',
)
plt.xticks(rotation=90)
plt.show()

#### 3.  "category" & "review_length"

In [None]:
# One-hot encode 'category' (all levels kept)
data_cat = pd.get_dummies(data, columns=['category'], drop_first=False)

# Review-length column plus the category dummies (everything left after dropping the other columns)
review_length = data_cat['review_length']
categories = data_cat.drop(columns=['review_length', 'subcategory', 'rating', 'result', 'product', 'title', 'description', 'id', 'prediction'])

# Interaction terms: review_length multiplied by each category dummy, one column per dummy
interaction_features = pd.DataFrame({
    f'review_length_{col}': review_length * categories[col]
    for col in categories.columns
})

# Features (X) and target (y)
X_comb_crl = interaction_features
y_comb_crl = data['prediction']

print(f"Original feature count: {categories.shape[1] + 1}")  # review_length + number of category dummies
print(f"Interaction feature count: {interaction_features.shape[1]}")  # one interaction per category dummy


In [206]:
# Standardise the review_length*category interaction features, keeping the column names
scaler = StandardScaler()
X_comb_crl_scaled = pd.DataFrame(
    scaler.fit_transform(X_comb_crl),
    columns=X_comb_crl.columns,
)

#SMOTE
xs_crl, ys_crl = smote.fit_resample(X_comb_crl_scaled, y_comb_crl)

In [None]:
# Logit coefficient table for the review_length*category interactions
reg_crl = RegressionSummary(xs=xs_crl, ys=ys_crl)
display(reg_crl)

In [None]:
# Feature importance scores for the review_length*category interactions
crl_feature_names = xs_crl.columns.tolist()
fp_crl = FitModels_FeatureImportance(xs_crl, ys_crl, crl_feature_names)
display(fp_crl)

In [283]:
# Bar chart of the average importance of each review_length*category interaction
plt.figure(figsize=(10, 8))
ax_crl = sns.barplot(data=fp_crl, x='Feature', y='Average_Importance', palette='flare')
ax_crl.set(
    title='Average Feature Importance Across "review_length * category"',
    xlabel='Feature',
    ylabel='Average Importance',
)
plt.xticks(rotation=90)
plt.show()

#### 4. "subcategory" & "rating"

In [None]:
# One-hot encode 'subcategory' (all levels kept)
data_sub = pd.get_dummies(data, columns=['subcategory'], drop_first=False)

# Rating column plus the subcategory dummies (everything left after dropping the other columns)
rating = data_sub['rating']
subcategories = data_sub.drop(columns=['review_length', 'category', 'rating', 'result', 'product', 'title', 'description', 'id', 'prediction'])

# Interaction terms: rating multiplied by each subcategory dummy, one column per dummy
interaction_features = pd.DataFrame({
    f'rating*{col}': rating * subcategories[col]
    for col in subcategories.columns
})

# Features (X) and target (y)
X_comb_sr = interaction_features
y_comb_sr = data['prediction']

print(f"Original feature count: {subcategories.shape[1] + 1}")  # rating + number of subcategory dummies
print(f"Interaction feature count: {interaction_features.shape[1]}")  # one interaction per subcategory dummy



In [211]:
# Standardise the rating*subcategory interaction features, keeping the column names
scaler = StandardScaler()
X_comb_sr_scaled = pd.DataFrame(
    scaler.fit_transform(X_comb_sr),
    columns=X_comb_sr.columns,
)

#SMOTE
xs_sr, ys_sr = smote.fit_resample(X_comb_sr_scaled, y_comb_sr)

In [None]:
# Logit coefficient table for the rating*subcategory interactions
reg_sr = RegressionSummary(xs=xs_sr, ys=ys_sr)
display(reg_sr)

In [None]:
# Feature importance scores for the rating*subcategory interactions
sr_feature_names = xs_sr.columns.tolist()
fp_sr = FitModels_FeatureImportance(xs_sr, ys_sr, sr_feature_names)
display(fp_sr)

In [None]:
# Bar chart of the average importance of each rating*subcategory interaction
plt.figure(figsize=(10, 8))
ax_sr = sns.barplot(data=fp_sr, x='Feature', y='Average_Importance', palette='flare')
ax_sr.set(
    title='Average Feature Importance Across "rating * subcategory"',
    xlabel='Feature',
    ylabel='Average Importance',
)
plt.xticks(rotation=90)
plt.show()

#### 5. "subcategory" & "review_length" 

In [None]:
# One-hot encode 'subcategory' (all levels kept)
data_sub = pd.get_dummies(data, columns=['subcategory'], drop_first=False)

# Review-length column plus the subcategory dummies (everything left after dropping the other columns)
review_length = data_sub['review_length']
subcategories = data_sub.drop(columns=['review_length', 'category', 'rating', 'result', 'product', 'title', 'description', 'id', 'prediction'])

# Interaction terms: review_length multiplied by each subcategory dummy, one column per dummy
interaction_features = pd.DataFrame({
    f'review_length*{col}': review_length * subcategories[col]
    for col in subcategories.columns
})

# Features (X) and target (y)
X_comb_srl = interaction_features
y_comb_srl = data['prediction']

print(f"Original feature count: {subcategories.shape[1] + 1}")  # review_length + number of subcategory dummies
print(f"Interaction feature count: {interaction_features.shape[1]}")  # one interaction per subcategory dummy


In [None]:
# Standardise the review_length*subcategory interaction features, keeping the column names
scaler = StandardScaler()
X_comb_srl_scaled = pd.DataFrame(
    scaler.fit_transform(X_comb_srl),
    columns=X_comb_srl.columns,
)

#SMOTE
xs_srl, ys_srl = smote.fit_resample(X_comb_srl_scaled, y_comb_srl)


In [None]:
# Logit coefficient table for the review_length*subcategory interactions
reg_srl = RegressionSummary(xs=xs_srl, ys=ys_srl)
display(reg_srl)

In [None]:
# Feature importance scores for the review_length*subcategory interactions
srl_feature_names = xs_srl.columns.tolist()
fp_srl = FitModels_FeatureImportance(xs_srl, ys_srl, srl_feature_names)
display(fp_srl)

In [None]:
# Bar chart of the average importance of each review_length*subcategory interaction
plt.figure(figsize=(10, 8))
ax_srl = sns.barplot(data=fp_srl, x='Feature', y='Average_Importance', palette='flare')
ax_srl.set(
    title='Average Feature Importance Across "review_length * subcategory"',
    xlabel='Feature',
    ylabel='Average Importance',
)
plt.xticks(rotation=90)
plt.show()

### All independent variables combined

In [213]:
# Features (rating, review length, category) and target (prediction)
all_feature_cols = ['rating', 'review_length', 'category']
x_all = data[all_feature_cols]
y_all = data['prediction']

In [None]:
#scaling numerical features
numeric_cols = ['rating', 'review_length']
numerical_data = x_all[numeric_cols]
scaler = StandardScaler()
scaled_numerical_data = scaler.fit_transform(numerical_data)
scaled_numerical_data_df = pd.DataFrame(scaled_numerical_data, columns=['rating', 'review_length'])

#encoding categorical features
# No `drop` option is set, so every category level keeps its own dummy column.
encoder = OneHotEncoder(sparse_output=False)
encoded_cat = encoder.fit_transform(x_all[['category']])
encoded_cat_df = pd.DataFrame(
    encoded_cat,
    columns=encoder.get_feature_names_out(['category']),
)

#combining the features: category dummies first, then the scaled numeric columns
x_transformed = pd.concat([encoded_cat_df, scaled_numerical_data_df], axis=1)

#SMOTE
xs_all, ys_all = smote.fit_resample(x_transformed, y_all)


In [None]:
# Logit coefficient table for all independent variables combined
reg_all = RegressionSummary(xs=xs_all, ys=ys_all)
display(reg_all)

In [None]:
# Feature importance scores for all independent variables combined
all_feature_names = xs_all.columns.tolist()
fp_all = FitModels_FeatureImportance(xs_all, ys_all, all_feature_names)
display(fp_all)

In [None]:
# Bar chart of the average importance of every independent variable
plt.figure(figsize=(10, 8))
ax_all = sns.barplot(data=fp_all, x='Feature', y='Average_Importance', palette='flare')
ax_all.set(
    title='Average Feature Importance Across all independent variables',
    xlabel='Feature',
    ylabel='Average Importance',
)
plt.xticks(rotation=90)
plt.show()