In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import tensorflow as tf
from tensorflow import keras

import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv("train_cleaned.csv")
df.head()

Unnamed: 0,review_rating,number_of_photos,helpful_vote,reviewer_ID,fake_asin,fake_review,product_ID,review_ID,review,year,month,day,is_weekday,polarity,subjectivity,labeled_product_id
0,5,0,0,380263,1,1,389,34510,super product husband hard time sleep first ni...,2020,4,13,1,0.315278,0.565278,389
1,5,0,0,845137,0,0,99,381688,sturdi rving,2020,8,29,0,0.0,0.0,0
2,5,0,0,659759,0,0,107,417933,five star dramat improv toe nail appear,2018,2,27,1,0.0,0.0,0
3,5,0,1,73493,0,0,66,166465,great portabl irrig recent got brace need work...,2012,5,16,1,0.269388,0.631293,0
4,4,0,0,714841,0,0,14,289006,four star salad shooter work great save time,2015,1,5,1,0.8,0.75,0


In [4]:
X = df.drop(['reviewer_ID', 'fake_asin', 'product_ID', 'review_ID', 'review', 'fake_review'], axis = 1)

# get dummies
X = pd.get_dummies(X, columns=['labeled_product_id'])
X = X.drop(['labeled_product_id_0'], axis = 1)

# scaling
X = X.apply(pd.to_numeric)
stander = StandardScaler()

X = stander.fit_transform(X)

In [5]:
y = df['fake_review']

In [6]:
model = keras.models.Sequential()


model.add(keras.layers.Dense(2048, activation='relu', input_dim=X.shape[1]))

model.add(keras.layers.Dense(1024, activation='relu'))

model.add(keras.layers.Dense(512, activation='relu'))

model.add(keras.layers.Dense(256, activation='relu'))

model.add(keras.layers.Dense(128, activation='relu'))

model.add(keras.layers.Dense(1, activation='sigmoid')) 

In [7]:
model.compile(loss=keras.losses.BinaryFocalCrossentropy(), optimizer='adam', metrics=['accuracy'])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 12343)

In [9]:
# compute the class weights
n_samples = len(y_train)
n_classes = len(np.unique(y_train))
class_counts = np.bincount(y_train)
class_weights = {i: n_samples / (n_classes * class_counts[i]) for i in range(n_classes)}

# fit the model with sample weights
sample_weights = np.array([class_weights[label] for label in y_train])

from keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='min')

model.fit(X_train, y_train, sample_weight=sample_weights, epochs = 50, validation_split=0.2, batch_size=64)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x2418940ebb0>

In [10]:
from sklearn.metrics import confusion_matrix
# Get the predicted labels and true labels
y_pred = model.predict(X_test)

def my_fuc(x):
    if x < 0.65:
        return 0
    else:
        return 1
    
y_pred = np.array([my_fuc(x) for x in y_pred])

# Compute the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print(cm)

from sklearn.metrics import f1_score
print(f1_score(y_test, y_pred))

from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

[[107957   3568]
 [  3644   2394]]
0.399
              precision    recall  f1-score   support

           0       0.97      0.97      0.97    111525
           1       0.40      0.40      0.40      6038

    accuracy                           0.94    117563
   macro avg       0.68      0.68      0.68    117563
weighted avg       0.94      0.94      0.94    117563



In [11]:
df_test = pd.read_csv("test_cleaned.csv")
df_test_sample = df_test.drop(['reviewer_ID', 'product_ID', 'review_ID', 'review', 'fake_asin'], axis = 1)

df_test_sample = pd.get_dummies(df_test_sample, columns=['labeled_product_id'])
df_test_sample = df_test_sample.drop(['labeled_product_id_0'], axis = 1)

stander = StandardScaler()
df_test_sample = stander.fit_transform(df_test_sample)
df_test_sample

array([[-0.16259809,  2.13497981, -0.08194928, ..., -0.11757016,
        -0.11736967, -0.01172636],
       [ 0.58756373, -0.17814909, -0.08194928, ..., -0.11757016,
        -0.11736967, -0.01172636],
       [ 0.58756373, -0.17814909, -0.08194928, ..., -0.11757016,
        -0.11736967, -0.01172636],
       ...,
       [ 0.58756373, -0.17814909, -0.08194928, ..., -0.11757016,
        -0.11736967, -0.01172636],
       [ 0.58756373, -0.17814909, -0.08194928, ..., -0.11757016,
        -0.11736967, -0.01172636],
       [ 0.58756373, -0.17814909, -0.08194928, ..., -0.11757016,
        -0.11736967, -0.01172636]])

In [12]:
predictions = model.predict(df_test_sample)
predictions



array([[0.46258885],
       [0.48655397],
       [0.7712257 ],
       ...,
       [0.45819482],
       [0.41771412],
       [0.5029743 ]], dtype=float32)

In [13]:
df_test["fake_review"] = predictions

In [14]:
output = df_test[["review_ID", "fake_review"]]

In [15]:
output['fake_review'] = output['fake_review'].apply(lambda x: 0 if x < 0.65 else 1)

In [16]:
output.to_csv('result.csv', index=False)