In [2]:
import numpy as np
import pandas as pd
import csv
import string
import emoji
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from afinn import Afinn
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
#imports above not used?
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
import joblib
import pickle

#import data
df_updated = pd.read_csv("processed_data.csv")
print(df_updated.shape[1])

#-----------MODELS AND 10-FOLD CROSS VALIDATION---------------
#-----------HYPERPARAMETER TUNING VIA GRID SEARCH ------------
#-----------SAVING OF EACH RESULT IN JOBLIB FILE--------------

#splitting of data
X = df_updated
scaler = MinMaxScaler(feature_range=(0, 1))
X['sentiment'] = scaler.fit_transform(X[['sentiment_score']])
X_scaled = scaler.fit_transform(X.iloc[:, 21:])
X.iloc[:, 21:] = X_scaled

X = X.drop(['is_fake','TOTAL_TEXT','text_cleaned','sentiment_score'], axis=1)
y = df_updated['is_fake']
print(X.dtypes)
X_train, X_valid, y_train, y_valid = train_test_split(X,y,train_size=0.8, test_size=0.2, random_state=None)
kf = KFold(n_splits=10, shuffle=True, random_state=42)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\longb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\longb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\longb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


70
RATING                 int64
VERIFIED_PURCHASE      int64
word_count             int64
caps_count             int64
punct_count            int64
                      ...   
46                   float64
47                   float64
48                   float64
49                   float64
sentiment            float64
Length: 67, dtype: object


In [2]:
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [1],
}

regressor = LogisticRegression(max_iter=10000, solver='saga')

grid_search = GridSearchCV(regressor, param_grid, cv=kf)
grid_search.fit(X_train, y_train)
best_logreg = grid_search.best_estimator_


y_pred = best_logreg.predict(X_valid)

print("Logistic Regression")
print(classification_report(y_valid, y_pred))
print(accuracy_score(y_valid, y_pred))

scores = cross_val_score(best_logreg, X, y, cv=kf)
mean_score = np.mean(scores)
std_score = np.std(scores)

print(f"Mean accuracy score: {mean_score:.2f}")
print(f"Standard deviation: {std_score:.2f}")
print("************************")

with open('results.csv', 'a', newline='') as csvfile:
    fieldnames = ['Model', 'Parameters', 'Accuracy', 'Mean Score', 'Std Score']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writerow({
        'Model': 'Logistic Regression',
        'Parameters': str(best_logreg),
        'Accuracy': accuracy_score(y_valid, y_pred),
        'Mean Score': mean_score,
        'Std Score': std_score
    })


Logistic Regression
              precision    recall  f1-score   support

           0       0.77      0.86      0.81      2201
           1       0.83      0.71      0.77      1999

    accuracy                           0.79      4200
   macro avg       0.80      0.79      0.79      4200
weighted avg       0.80      0.79      0.79      4200

0.7923809523809524
Mean accuracy score: 0.79
Standard deviation: 0.01
************************


In [3]:
param_grid = {
    'hidden_layer_sizes': [(50,)],
    'activation': ['logistic'],
    'alpha': [0.001],
    'solver': ['adam', 'sgd'],
    'learning_rate': ['adaptive'],
    'batch_size': [128],
    'max_iter': [500]
}

mlp = MLPClassifier()
grid_search = GridSearchCV(mlp, param_grid, cv=kf)
grid_search.fit(X_train, y_train)
best_mlp = grid_search.best_estimator_
y_pred = best_mlp.predict(X_valid)
print(grid_search.best_score_)

print("Multi Layer Perceptron")
print(classification_report(y_valid, y_pred))
print("Accuracy: ", accuracy_score(y_valid, y_pred))

scores = cross_val_score(best_mlp, X, y, cv=kf)
mean_score = np.mean(scores)
std_score = np.std(scores)

print(f"Mean accuracy score: {mean_score:.2f}")
print(f"Standard deviation: {std_score:.2f}")
print("************************")

with open('results.csv', 'a', newline='') as csvfile:
    fieldnames = ['Model', 'Parameters', 'Accuracy', 'Mean Score', 'Std Score']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writerow({
        'Model': 'MLP1',
        'Parameters': str(best_mlp),
        'Accuracy': accuracy_score(y_valid, y_pred),
        'Mean Score': mean_score,
        'Std Score': std_score
    })

0.8055357142857142
Multi Layer Perceptron
              precision    recall  f1-score   support

           0       0.79      0.88      0.83      2201
           1       0.84      0.75      0.79      1999

    accuracy                           0.81      4200
   macro avg       0.82      0.81      0.81      4200
weighted avg       0.82      0.81      0.81      4200

Accuracy:  0.8145238095238095
Mean accuracy score: 0.81
Standard deviation: 0.01
************************


In [4]:
param_grid = {
    'hidden_layer_sizes': [(50,)],
    'activation': ['logistic'],
    'alpha': [0.0001, 0.001, 0.01],
    'solver': ['adam', 'sgd'],
    'learning_rate': ['adaptive'],
    'batch_size': [32],
    'max_iter': [2000]
}

mlp = MLPClassifier()
grid_search = GridSearchCV(mlp, param_grid, cv=kf)
grid_search.fit(X_train, y_train)
best_mlp = grid_search.best_estimator_
y_pred = best_mlp.predict(X_valid)
print(grid_search.best_score_)

print("Multi Layer Perceptron")
print(classification_report(y_valid, y_pred))
print("Accuracy: ", accuracy_score(y_valid, y_pred))

scores = cross_val_score(best_mlp, X, y, cv=kf)
mean_score = np.mean(scores)
std_score = np.std(scores)

print(f"Mean accuracy score: {mean_score:.2f}")
print(f"Standard deviation: {std_score:.2f}")
print("************************")

with open('results.csv', 'a', newline='') as csvfile:
    fieldnames = ['Model', 'Parameters', 'Accuracy', 'Mean Score', 'Std Score']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writerow({
        'Model': 'MLP2',
        'Parameters': str(best_mlp),
        'Accuracy': accuracy_score(y_valid, y_pred),
        'Mean Score': mean_score,
        'Std Score': std_score
    })

0.8078571428571429
Multi Layer Perceptron
              precision    recall  f1-score   support

           0       0.80      0.84      0.82      2201
           1       0.81      0.77      0.79      1999

    accuracy                           0.81      4200
   macro avg       0.81      0.81      0.81      4200
weighted avg       0.81      0.81      0.81      4200

Accuracy:  0.8073809523809524
Mean accuracy score: 0.81
Standard deviation: 0.01
************************


In [5]:
param_grid = {
    'hidden_layer_sizes': [(100,)],
    'activation': ['logistic'],
    'alpha': [0.0001, 0.001, 0.01],
    'solver': ['adam'],
    'learning_rate': ['adaptive'],
    'batch_size': [32, 64, 128],
    'max_iter': [500, 1000, 2000]
}

mlp = MLPClassifier()
grid_search = GridSearchCV(mlp, param_grid, cv=kf)
grid_search.fit(X_train, y_train)
best_mlp = grid_search.best_estimator_
y_pred = best_mlp.predict(X_valid)
print(grid_search.best_score_)

print("Multi Layer Perceptron")
print(classification_report(y_valid, y_pred))
print("Accuracy: ", accuracy_score(y_valid, y_pred))

scores = cross_val_score(best_mlp, X, y, cv=kf)
mean_score = np.mean(scores)
std_score = np.std(scores)

print(f"Mean accuracy score: {mean_score:.2f}")
print(f"Standard deviation: {std_score:.2f}")
print("************************")

with open('results.csv', 'a', newline='') as csvfile:
    fieldnames = ['Model', 'Parameters', 'Accuracy', 'Mean Score', 'Std Score']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writerow({
        'Model': 'MLP3',
        'Parameters': str(best_mlp),
        'Accuracy': accuracy_score(y_valid, y_pred),
        'Mean Score': mean_score,
        'Std Score': std_score
    })

joblib.dump(best_mlp, 'best_params/mlp3_best_params.pkl')

0.8082738095238096
Multi Layer Perceptron
              precision    recall  f1-score   support

           0       0.80      0.83      0.82      2201
           1       0.80      0.78      0.79      1999

    accuracy                           0.80      4200
   macro avg       0.80      0.80      0.80      4200
weighted avg       0.80      0.80      0.80      4200

Accuracy:  0.8030952380952381
Mean accuracy score: 0.81
Standard deviation: 0.01
************************


['best_params/mlp3_best_params.pkl']

In [7]:
from sklearn.svm import LinearSVC

svm = LinearSVC()
svm.fit(X_train, y_train)
y_pred = svm.predict(X_valid)
print("SVM")
print(classification_report(y_valid, y_pred))
print("Accuracy:", accuracy_score(y_valid, y_pred))


SVM
              precision    recall  f1-score   support

           0       0.77      0.88      0.82      2201
           1       0.84      0.71      0.77      1999

    accuracy                           0.80      4200
   macro avg       0.81      0.80      0.80      4200
weighted avg       0.80      0.80      0.80      4200

Accuracy: 0.7997619047619048




In [8]:
svm = SVC()
svm.fit(X_train, y_train)
y_pred = svm.predict(X_valid)
print("SVM")
print(classification_report(y_valid, y_pred))
print("Accuracy:", accuracy_score(y_valid, y_pred))


SVM
              precision    recall  f1-score   support

           0       0.75      0.86      0.80      2201
           1       0.82      0.68      0.74      1999

    accuracy                           0.78      4200
   macro avg       0.78      0.77      0.77      4200
weighted avg       0.78      0.78      0.77      4200

Accuracy: 0.775952380952381


In [4]:
svm = LinearSVC()


param_grid = {'penalty': ['l1','l2'],
              'loss': ['hinge', 'squared_hinge'],
              'dual':[False],
              'C': [0.1, 1, 10, 100],
              'max_iter':[10000]
              }

grid_search = GridSearchCV(svm, param_grid, cv=kf)
grid_search.fit(X_train, y_train)
best_svm = grid_search.best_estimator_
y_pred = best_svm.predict(X_valid)


print("SVM")
print(classification_report(y_valid, y_pred))
print("Accuracy:", accuracy_score(y_valid, y_pred))

scores = cross_val_score(best_svm, X, y, cv=kf)
mean_score = np.mean(scores)
std_score = np.std(scores)

print(f"Mean accuracy score: {mean_score:.2f}")
print(f"Standard deviation: {std_score:.2f}")
print("************************")

with open('results.csv', 'a', newline='') as csvfile:
    fieldnames = ['Model', 'Parameters', 'Accuracy', 'Mean Score', 'Std Score']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writerow({
        'Model': 'SVM',
        'Parameters': str(best_svm),
        'Accuracy': accuracy_score(y_valid, y_pred),
        'Mean Score': mean_score,
        'Std Score': std_score
    })

joblib.dump(best_svm, 'best_params/svm_best_params.pkl')

80 fits failed out of a total of 160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\longb\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\longb\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\svm\_classes.py", line 274, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
                                           ^^^^^^^^^^^^^^^
  File "c:\Users\longb\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\svm\_base.py", line 1223, in _fit_liblinear
    solver

SVM
              precision    recall  f1-score   support

           0       0.80      0.86      0.83      2142
           1       0.84      0.77      0.81      2058

    accuracy                           0.82      4200
   macro avg       0.82      0.82      0.82      4200
weighted avg       0.82      0.82      0.82      4200

Accuracy: 0.8173809523809524
Mean accuracy score: 0.80
Standard deviation: 0.01
************************


['best_params/svm_best_params.pkl']

In [5]:
svm = SVC()


param_grid = {'C': [0.1, 1, 10, 100],'gamma': [0.1, 1, 10, 100]}

grid_search = GridSearchCV(svm, param_grid, cv=kf)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_estimator_
y_pred = grid_search.best_estimator_.predict(X_valid)


print("SVM")
print(classification_report(y_valid, y_pred))
print("Accuracy:", accuracy_score(y_valid, y_pred))

scores = cross_val_score(best_params, X, y, cv=kf)
mean_score = np.mean(scores)
std_score = np.std(scores)

print(f"Mean accuracy score: {mean_score:.2f}")
print(f"Standard deviation: {std_score:.2f}")
print("************************")

with open('results.csv', 'a', newline='') as csvfile:
    fieldnames = ['Model', 'Parameters', 'Accuracy', 'Mean Score', 'Std Score']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writerow({
        'Model': 'Gaussian SVM',
        'Parameters': str(best_params),
        'Accuracy': accuracy_score(y_valid, y_pred),
        'Mean Score': mean_score,
        'Std Score': std_score
    })

joblib.dump(best_params, 'best_params/gaussiansvm_best_params.pkl')

SVM
              precision    recall  f1-score   support

           0       0.76      0.91      0.83      2142
           1       0.88      0.70      0.78      2058

    accuracy                           0.81      4200
   macro avg       0.82      0.80      0.80      4200
weighted avg       0.82      0.81      0.80      4200

Accuracy: 0.8069047619047619
Mean accuracy score: 0.79
Standard deviation: 0.01
************************


['best_params/gaussiansvm_best_params.pkl']