# 4. Baseline classification models

To select a model to fine-tune later, I will first compare baseline scikit-learn models trained on the dataset with different feature sets.

<!-- Prefiltering -->

In [54]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np 

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_fscore_support


import pickle

from json import JSONEncoder
import json
import numpy

class NumpyArrayEncoder(JSONEncoder):
    '''JSON encoder that also serialises NumPy arrays and NumPy scalars.

    Arrays are written as (nested) lists and NumPy scalars as the equivalent
    built-in Python value. Any other object is delegated to ``JSONEncoder``,
    which raises ``TypeError`` for types it cannot serialise.
    '''

    def default(self, obj):
        if isinstance(obj, numpy.ndarray):
            return obj.tolist()
        # NumPy scalars such as int64 or float32 are not subclasses of the
        # built-in int/float, so the stock encoder would reject them.
        if isinstance(obj, numpy.generic):
            return obj.item()
        return super().default(obj)

## Functions

To speed up the process, I will define some functions that fit different models on different feature sets and output dictionaries of results we can analyze.

In [55]:

# first I will define a couple of functions to speed up the process and make it reusable

def data_preprocess(data):

  '''Encode the "Polarity" column of the data as integers.

  The labels "True" and "TRUE" become 1 and "Fake" becomes 0; any other value
  is left untouched. A column that already holds only 0/1 is not modified.
  The column is overwritten on the DataFrame that was passed in.

  It returns that same DataFrame and the list of class names ["Fake", "Real"].
  '''

  label_codes = {"True": 1, "TRUE": 1, "Fake": 0}
  polarity = data["Polarity"]

  # Inspect the whole column instead of the row labelled 0, so the function
  # also works on empty, filtered or re-indexed frames.
  if not polarity.isin([0, 1]).all():
    # element-wise lookup keeps unknown labels as they are
    data["Polarity"] = polarity.map(lambda label: label_codes.get(label, label))

  outcomes = ["Fake", "Real"]

  return data, outcomes

def test_train(data, features):

  '''Build the predictors and the target from the data and split them 75/25.

  `features` is the list of column names used as predictors; the target is the
  "Polarity" column. The number of features and their names are printed.

  It returns X, y, X_train, X_test, y_train, y_test and the features list.
  '''

  predictors = data[features]
  target = data["Polarity"]  # class labels (0 or 1 once data_preprocess has run)

  print("Info: {} features were passed at the fit step\n:".format(predictors.shape[1]))
  for column_name in features:
    print(column_name)

  # stratify on the target so both splits keep the same fake/true proportion
  split = train_test_split(
    predictors,
    target,
    test_size=0.25,
    stratify=target,
    shuffle=True,
    random_state=16,
  )
  train_X, test_X, train_y, test_y = split

  return predictors, target, train_X, test_X, train_y, test_y, features

def class_report(y_test, y_pred):

  '''Print and return the sklearn classification report for the predictions,
  with the two classes named "Fake" and "Real".
  '''

  report_text = classification_report(y_test, y_pred, target_names=["Fake", "Real"])
  print(report_text)

  return report_text

def compare_models_cross_val(model, X, y):

  '''Score the model with 5-fold stratified cross-validation on X and y.

  The F1 of every fold and the mean F1 are printed as percentages.

  It returns the per-fold F1 scores and their mean.
  '''

  print("CROSS VALIDATION\n")

  # Use the estimator's class name: slicing str(model) cut the last digit from
  # names such as "RandomForestClassifier(random_state=42)".
  model_name = type(model).__name__

  kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
  # change `scoring` to the metric we want to measure the models by
  cv_score = cross_val_score(model, X, y, cv=kfold, scoring="f1")
  print("Cross validation f1 {} for each of the 5 iterations:".format(model_name))
  for score in cv_score:
    print(round(score*100, 2), "%")

  mean_f1 = np.mean(cv_score)

  print("\nCross validation mean f1 for {}:".format(model_name), round(mean_f1*100, 2),"%")
  print("\n-------------------------------------------------------------------------\n")

  return cv_score, mean_f1

# def conf_matrix(model_name, features, y_pred, y_test, cmap="magma"):

#   '''This function takes the predicted and test labels, generates the confusion matrix
#   and displays it
#   '''

#   n_features = len(features)

#   confussion_matrix = confusion_matrix(y_pred, y_test)

#   outcomes = ["Fake","Real"]
#   ticks = np.arange(len(outcomes))

#   fig, ax = plt.subplots()
#   plt.xticks(ticks, outcomes)
#   plt.yticks(ticks, outcomes)
#   sns.heatmap(pd.DataFrame(confussion_matrix), annot=True, cmap=cmap, fmt="g", xticklabels=outcomes, yticklabels=outcomes)
#   ax.xaxis.set_label_position("top")
#   plt.tight_layout()
#   if n_features == 1:

#     plt.title("{} Confusion Matrix: {}".format(model_name, features[0]), y = 1.1)

#   elif n_features == 2:

#     plt.title("{} Confusion Matrix: {} and {}".format(model_name, features[0],features[1]), y = 1.1)

#   elif n_features == 3:

#     plt.title("{} Confusion Matrix: {}, {} and {}".format(model_name, features[0],features[1], features[2]), y = 1.1)

#   plt.ylabel("Actual label")
#   plt.xlabel("Predicted label")

#   return confussion_matrix

def compare_models_train_test_split(models, X, y, X_train, y_train, X_test, y_test, features):

  '''This function fits every model in `models` on the train split and evaluates it.

  For each model it prints the hold-out accuracy and the classification report,
  runs compare_models_cross_val on the full X and y, pickles the fitted model
  (path prefix "Baseline_Models", extension .sav) and writes its metrics as JSON
  (path prefix "Baseline_Models_Performance", extension .json).

  The feature-set label is inferred from the number of features:
  14 -> "rdfc", 8 -> "corr", 28 -> "kolmogorov_features", anything else -> "all_features".

  It returns a list with one dictionary per model holding the model name, the
  feature-set label, the number of features, the hold-out accuracy, F1, precision
  and recall (overall and per class) and the cross-validation scores.
  '''

  # The label depends only on `features`, so it is computed once for all models.
  # NOTE(review): the paths below use Windows-style separators.
  set_labels = {14: "rdfc", 8: "corr", 28: "kolmogorov_features"}
  ft_str = set_labels.get(len(features), "all_features")

  model_list_dicts = []

  for model in models:

    # Use the estimator's class name: slicing str(model) cut the last digit from
    # names such as "RandomForestClassifier(random_state=42)".
    model_name = type(model).__name__

    model_dict = {}

    model_dict["name"] = ft_str + "__" + model_name

    model_dict["features_set"] = ft_str

    model_dict["n_features"] = len(features)

    model_dict["model"] = model_name

    print("\n",model_name, "\n", "\n")

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    model_dict["accuracy"] = accuracy

    model_dict["f1"] = f1_score(y_test, y_pred)

    model_dict["precision"] = precision_score(y_test, y_pred)

    model_dict["recall"] = recall_score(y_test, y_pred)

    # the value printed here is the accuracy, so it is labelled as such
    print("OVERALL ACCURACY", model_name, ":", round(accuracy*100, 2),"%""\n")

    class_report(y_test, y_pred)

    # per-class metrics: label 1 is the "true" class and label 0 the "fake" class
    precision_true, recall_true, f1_score_true = precision_recall_fscore_support(y_test, y_pred, labels=[1])[:3]
    precision_false, recall_false, f1_score_false = precision_recall_fscore_support(y_test, y_pred, labels=[0])[:3]

    model_dict["precision_true"], model_dict["recall_true"], model_dict["f1_score_true"] = precision_true, recall_true, f1_score_true
    model_dict["precision_false"], model_dict["recall_false"], model_dict["f1_score_false"] = precision_false, recall_false, f1_score_false

    cv_score, mean_f1 = compare_models_cross_val(model, X, y)

    model_dict["cv_scores"] = cv_score

    model_dict["mean_f1"] = mean_f1

    model_dict["development"] = "baseline"

    model_list_dicts.append(model_dict)

    # context managers make sure both files are closed once they are written
    with open(r"Baseline_Models\{}.sav".format(model_dict["name"]), "wb") as model_file:
      pickle.dump(model, model_file)

    with open(r"Baseline_Models_Performance\{}.json".format(model_dict["name"]), "w") as f:
      json.dump(model_dict, f,  cls=NumpyArrayEncoder)

  return model_list_dicts

def magic(data, features):

  '''Run the whole baseline pipeline for one feature set.

  It chains data_preprocess, test_train and compare_models_train_test_split,
  using the module-level `models` list as the estimators to fit.

  It returns the list of result dictionaries produced by compare_models_train_test_split.
  '''

  # encode the Polarity labels; the class names returned alongside are not needed here
  prepared, _ = data_preprocess(data)

  # split the selected features into train and test sets
  X, y, X_train, X_test, y_train, y_test, used_features = test_train(prepared, features)

  # fit and evaluate every model on that split
  return compare_models_train_test_split(
    models, X, y, X_train, y_train, X_test, y_test, used_features
  )


## Creating the models

<!-- array([['DESPC', '0.21438229320956675'],
       ['DESPLw', '0.10208405348019169'],
       ['WORD_SET_INCIDENCE_C4_COMMON_WORDS', '0.09965943628298403'],
       ['DESWC', '0.07109169917686431'],
       ['DESSL', '0.05266669162099124'],
       ['TOKEN_ATTRIBUTE_RATIO_ALHPA', '0.044887257624143466'],
       ['DESPL', '0.04316248532779844'],
       ['DESSLd', '0.02820966410727794'],
       ['SYNNP', '0.027405633313671348'],
       ['WORD_PROPERTY_WRDVERB', '0.0261010048275167'],
       ['WORD_SET_INCIDENCE_WRDPRP3p', '0.025068322443046843'],
       ['WORD_PROPERTY_WRDFRQa', '0.02254653008839199'],
       ['SYNLE', '0.02108048270836009'],
       ['WORD_PROPERTY_WRDHYPnv', '0.019784971882906673'],
       ['READFKGL', '0.018508796386778235'],
       ['WORD_PROPERTY_CONCRETENESS', '0.017746184163603552'],
       ['SYNMEDlem', '0.017576571201848724'],
       ['WORD_SET_INCIDENCE_CNCCaus', '0.016597094842558495'],
       ['TOKEN_ATTRIBUTE_RATIO_DIGIT', '0.016312327694308282'],
       ['SYNMEDwrd', '0.01586403979613771'],
       ['WORD_PROPERTY_WRDFRQc', '0.01478517755295334'],
       ['DESPLd', '0.013884163425386713'],
       ['DESWLltd', '0.01311359523902935'],
       ['Positive_Sentiment', '0.01210902813506827'],
       ['RDFRE', '0.011349995152157417'],
       ['SYNSTRUTa', '0.011137636817407863'],
       ['WORD_PROPERTY_WRDNOUN', '0.007141541942563552']], dtype='<U34') -->

### Loading the training dataset and the feature sets

In [56]:
# Reload the transformed data that was saved earlier.

transformed_data = pd.read_csv(r"Datasets\Celebrity Dataset\Celebrity_dataset_transformed.csv")
# NOTE(review): "Unnamed: 0" looks like the index column written when the CSV was saved;
# it is kept here, uncomment the next line to drop it.
# transformed_data = transformed_data.drop(labels="Unnamed: 0", axis=1)
print(transformed_data.shape)
transformed_data

(450, 65)


Unnamed: 0,DESPC,DESSC,DESWC,DESPL,DESPLd,DESPLw,DESSL,DESSLd,DESWLsy,DESWLsyd,...,WORD_PROPERTY_AOA_MAX,WORD_PROPERTY_CONCRETENESS,WORD_PROPERTY_PREVALENCE,WORD_SET_INCIDENCE_C4_COMMON_WORDS,Polarity,Overall_Sentiment,Positive_Sentiment,Negative_Sentiment,Neutral_Sentiment,Subjectivity
0,0.444444,0.000000,0.146154,0.027027,0.000000,0.151143,0.682139,0.000000,1.000000,0.563762,...,0.396186,0.393986,0.674648,0.685198,Fake,0.7783,0.033,0.109,0.858,0.332143
1,0.222222,0.176471,0.253846,0.148649,0.176471,0.201948,0.249849,0.248429,0.109532,0.116088,...,0.309322,0.612986,0.682585,0.252618,Fake,0.7763,0.045,0.106,0.849,0.513112
2,0.222222,0.235294,0.276923,0.189189,0.235294,0.220999,0.191726,0.215594,0.199330,0.134906,...,0.297608,0.685996,0.654967,0.546360,Fake,-0.2944,0.150,0.123,0.726,0.397129
3,0.222222,0.176471,0.307692,0.148649,0.176471,0.240051,0.300706,0.189370,0.191279,0.119235,...,0.147246,0.814876,0.426085,0.367094,Fake,0.6486,0.000,0.032,0.968,0.497222
4,0.222222,0.294118,0.296154,0.229730,0.294118,0.220999,0.147614,0.110956,0.139367,0.084885,...,0.617585,0.218604,0.515320,0.550734,Fake,0.9531,0.000,0.145,0.855,0.454167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445,0.222222,0.176471,0.296154,0.148649,0.176471,0.227350,0.289808,0.164698,0.264674,0.203276,...,0.306847,0.621988,0.530017,0.516772,True,0.6705,0.017,0.069,0.914,0.428052
446,0.888889,0.117647,0.273077,0.010811,0.037203,0.053345,0.386983,0.384460,0.050249,0.047174,...,0.323093,0.499129,0.691053,0.339440,True,0.9590,0.024,0.180,0.796,0.395543
447,0.444444,0.000000,0.130769,0.000000,0.048029,0.075642,0.645812,0.000000,0.370429,0.251662,...,0.606992,0.835890,0.640981,0.230619,True,0.7964,0.000,0.090,0.910,0.620455
448,0.222222,0.470588,0.534615,0.351351,0.352941,0.337426,0.153582,0.189306,0.225581,0.206593,...,0.423729,0.532790,0.651265,0.156495,True,0.9532,0.089,0.175,0.736,0.473939


In [57]:
import json

# Load the named feature sets (feature-set name -> list of column names).
with open(r"Feature_Sets\features.json", "r") as f:
    feature_sets = json.load(f)


In [58]:
# Display the loaded feature sets to check their contents.
feature_sets

{'corr': ['DESPC',
  'DESPL',
  'DESPLw',
  'DESSL',
  'DESSLd',
  'READFKGL',
  'TOKEN_ATTRIBUTE_RATIO_ALHPA',
  'WORD_SET_INCIDENCE_C4_COMMON_WORDS'],
 'rfecv': ['DESPC',
  'DESWC',
  'DESPL',
  'DESPLd',
  'DESPLw',
  'DESSL',
  'DESSLd',
  'DESWLlt',
  'SYNNP',
  'SYNMEDlem',
  'READFKGL',
  'TOKEN_ATTRIBUTE_RATIO_ALHPA',
  'TOKEN_ATTRIBUTE_RATIO_PUNCT',
  'WORD_SET_INCIDENCE_C4_COMMON_WORDS'],
 'kolmogorov': ['DESPC',
  'DESWC',
  'DESPL',
  'DESPLd',
  'DESPLw',
  'DESSL',
  'DESSLd',
  'DESWLltd',
  'SYNLE',
  'SYNNP',
  'SYNMEDpos',
  'SYNMEDwrd',
  'SYNMEDlem',
  'SYNSTRUTa',
  'RDFRE',
  'READFKGL',
  'TOKEN_ATTRIBUTE_RATIO_ALHPA',
  'TOKEN_ATTRIBUTE_RATIO_DIGIT',
  'WORD_SET_INCIDENCE_WRDPRP3p',
  'WORD_SET_INCIDENCE_CNCCaus',
  'WORD_PROPERTY_WRDNOUN',
  'WORD_PROPERTY_WRDVERB',
  'WORD_PROPERTY_WRDFRQc',
  'WORD_PROPERTY_WRDFRQa',
  'WORD_PROPERTY_WRDHYPnv',
  'WORD_PROPERTY_CONCRETENESS',
  'WORD_SET_INCIDENCE_C4_COMMON_WORDS',
  'Positive_Sentiment'],
 'all_features': ['

In [59]:
# keep only the lists of column names, one list per feature set
list_features = list(feature_sets.values())


In [60]:
# Individual baseline estimators, seeded where a random_state is passed.
# NOTE(review): the cells visible here fit the `models` list instead of these four objects.
lg = LogisticRegression(random_state=42)
svc = SVC(kernel ="linear", random_state=42)
knn = KNeighborsClassifier()
rf = RandomForestClassifier(random_state=42)

In [61]:
# estimators that magic() fits for every feature set
models = [
    LogisticRegression(),
    SVC(kernel="linear"),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=42),
]

### Mini-report of the creation of the models

In [62]:
# Run the full baseline pipeline once per feature set.
# Each `feature` is the whole list of column names of one feature set.
for feature in list_features:
    magic(transformed_data, feature)

Info: 8 features were passed at the fit step
:
DESPC
DESPL
DESPLw
DESSL
DESSLd
READFKGL
TOKEN_ATTRIBUTE_RATIO_ALHPA
WORD_SET_INCIDENCE_C4_COMMON_WORDS

 LogisticRegression 
 

OVERALL F1 LogisticRegression : 73.45 %

              precision    recall  f1-score   support

        Fake       0.71      0.74      0.72        53
        Real       0.76      0.73      0.75        60

    accuracy                           0.73       113
   macro avg       0.73      0.73      0.73       113
weighted avg       0.74      0.73      0.73       113

CROSS VALIDATION

Cross validation f1 LogisticRegression for each of the 5 iterations:
71.29 %
72.34 %
69.57 %
75.79 %
80.0 %

Cross validation mean f1 for LogisticRegression: 73.8 %

-------------------------------------------------------------------------


 SVC 
 

OVERALL F1 SVC : 75.22 %

              precision    recall  f1-score   support

        Fake       0.72      0.77      0.75        53
        Real       0.79      0.73      0.76        6

  data["Polarity"] = data["Polarity"].replace({"True":1, "Fake":0, "TRUE":1})


OVERALL F1 RandomForestClassifier(random_state=4 : 78.76 %

              precision    recall  f1-score   support

        Fake       0.78      0.75      0.77        53
        Real       0.79      0.82      0.80        60

    accuracy                           0.79       113
   macro avg       0.79      0.79      0.79       113
weighted avg       0.79      0.79      0.79       113

CROSS VALIDATION

Cross validation f1 RandomForestClassifier(random_state=4 for each of the 5 iterations:
81.25 %
75.51 %
77.78 %
85.11 %
92.93 %

Cross validation mean f1 for RandomForestClassifier(random_state=4: 82.51 %

-------------------------------------------------------------------------

Info: 14 features were passed at the fit step
:
DESPC
DESWC
DESPL
DESPLd
DESPLw
DESSL
DESSLd
DESWLlt
SYNNP
SYNMEDlem
READFKGL
TOKEN_ATTRIBUTE_RATIO_ALHPA
TOKEN_ATTRIBUTE_RATIO_PUNCT
WORD_SET_INCIDENCE_C4_COMMON_WORDS

 LogisticRegression 
 

OVERALL F1 LogisticRegression : 82.3 %

              precision    recal

<!-- All features input -->