In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json

In [61]:
# adding the json encoder for np arrays
from json import JSONEncoder
import json
import numpy

class NumpyArrayEncoder(JSONEncoder):
    def default(self, obj):
        if isinstance(obj, numpy.ndarray):
            return obj.tolist()
        return JSONEncoder.default(self, obj)

In [62]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from sklearn import model_selection

import pickle
import sys
import joblib


def data_preprocess(data):
  '''We input the data with the linguistic features
  And it returns the data with the polarity columns in 0 for fake and 1 for true
  as well as the outcomes
  '''

  if data.Polarity[0] != 0 or 1:
    data.Polarity[data.Polarity == 'Fake'] = 0
    data.Polarity[data.Polarity == 'True'] = 1

  outcomes = ["Fake","Real"]
  # print("unique Polairty labels:", data.Polarity.unique())

  return data, outcomes


def test_train(data, features):

  '''We input the data with the features and a list of the features we want to pass to the models.
  it returns the data split in test/train
  '''

  data = data.dropna()
  feature_cols = features



  X = data[feature_cols]
  y = data.Polarity #outcomes 0 or 1
  y_ref= "Polarity"

  print("Info: {} features were passed at the fit step\n:".format(X.shape[1]))
  for feature in feature_cols:
    print(feature)

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=16)

  return X, y, X_train, X_test, y_train, y_test, features, y_ref

  #stratify so that the proportion of the train data is the same for fake and true


def class_report(y_test, y_pred):

  outcomes = ["Fake","Real"]

  scores = classification_report(y_test, y_pred, target_names=outcomes)

  print(scores)

  return scores

def compare_models_cross_val(model, X, y):
  '''for each model, it calculates the cross validation accuracy for each iteration
  as well as the mean cross validation score on the given data

  '''

  print("CROSS VALIDATION\n")

  model_name = str(model)[:-2]

  if model_name == "SVC(kernel=\'linear":

    model_name = "SVC"


  # for model in models:
  cv_score = cross_val_score(model, X, y, cv = 5)
  print("Accuracy {} for each of the 5 iterations:".format(model_name))
  for score in cv_score:
    print(round(score*100, 2), "%")

  mean_accuracy = sum(cv_score)/len(cv_score)

  print("\nCross validation mean accuracy for {}:".format(model_name), round(mean_accuracy*100, 2),"%")
  print("\n-------------------------------------------------------------------------\n")

  return cv_score, mean_accuracy

def conf_matrix(model_name, features, y_pred, y_test, cmap="magma"):

  '''This function takes the predicted and test labels, generates the confusion matrix
  and displays it
  '''

  n_features = len(features)

  confussion_matrix = confusion_matrix(y_pred, y_test)

  outcomes = ["Fake","Real"]
  ticks = np.arange(len(outcomes))

  fig, ax = plt.subplots()
  plt.xticks(ticks, outcomes)
  plt.yticks(ticks, outcomes)
  sns.heatmap(pd.DataFrame(confussion_matrix), annot=True, cmap=cmap, fmt="g", xticklabels=outcomes, yticklabels=outcomes)
  ax.xaxis.set_label_position("top")
  plt.tight_layout()
  if n_features == 1:

    plt.title("{} Confusion Matrix: {}".format(model_name, features[0]), y = 1.1)

  elif n_features == 2:

    plt.title("{} Confusion Matrix: {} and {}".format(model_name, features[0],features[1]), y = 1.1)

  elif n_features == 3:

    plt.title("{} Confusion Matrix: {}, {} and {}".format(model_name, features[0],features[1], features[2]), y = 1.1)

  plt.ylabel("Actual label")
  plt.xlabel("Predicted label")

  return confussion_matrix

def compare_models_train_test_split(models, X, y, X_train, y_train, X_test, y_test, features, y_ref):

  '''this fx trains the data on the four types of models,
  generates a report with the overall accuracy of the model, the cross validation evaluation
  and prints the confussion matrix of each model on the particular test_train split.

  It returns a dictionary with the name of the model, the features that were passed during the fit,
  the model used, the accuracy score, the classification report, the crossvalidation evaluation and the confusion matrix
  '''

  model_list_dicts =[]

  for model in models:

    model_name = str(model)[:-2]

    if model_name == "SVC(kernel=\'linear":

      model_name = "SVC"

    model_dict = {}

    ft_str = str(features)[2:-2].replace("'","_").replace(",", "_").replace(" ", "_").replace("___", "_")

    model_dict["name"] = ft_str + "__" + model_name

    model_dict["features"] = features

    model_dict["model"] = model_name

    print("\n",model_name, "\n", "\n")

    model = model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    model_dict["accuracy"] = accuracy

    print("OVERALL ACCURACY", model_name, ":", round(accuracy*100, 2),"%""\n")


    scores = class_report(y_test, y_pred)

    model_dict["report"] = scores

    # print(scores)

    print("\n")

    cv_score, mean_accuracy = compare_models_cross_val(model, X, y)

    model_dict["cv_scores"] = cv_score

    model_dict["mean_cv_accuracy"] = mean_accuracy

    print("\n")

    confussion_matrix = conf_matrix(model_name, features, y_pred, y_test, cmap="magma")

    model_dict["confussion_matrix"] = confussion_matrix

    model_list_dicts.append(model_dict)
    
    joblib.dump(value=[model, features, y_ref], filename=r"C:\Users\alber\Desktop\Make Believe Diciembre\Models\{}.pkl".format(ft_str + "__" + model_name))
  
  with open(r"C:\Users\alber\Desktop\Make Believe Diciembre\Models\Info Dictionaries\{}.json".format(ft_str), "w") as f:
    json.dump(model_list_dicts, f, cls=NumpyArrayEncoder)
  
  
  return model_list_dicts

def magic(data, features):

  '''this function takes all the previous functions and integrates them into a single function to run the data processing, training and testing in all the models.

  It returns a dictionary with the name of the model, the features that were passed during the fit,
  the model used, the accuracy score, the classification report, the crossvalidation evaluation and the confusion matrix

  '''

  data, outcomes = data_preprocess(data) # data in the correct format for the y outcomes

  X, y, X_train, X_test, y_train, y_test, features, y_ref = test_train(data, features) # here we have the data split for all the models with the desired features

  model_list_dicts = compare_models_train_test_split(models, X, y, X_train, y_train, X_test, y_test, features, y_ref) # for each model it will compute

  return model_list_dicts


In [63]:
all_fts = pd.read_csv(r"C:\Users\alber\Desktop\Make Believe Diciembre\all_ling_features.csv")

In [64]:
all_fts

Unnamed: 0,id,DESPC,DESSC,DESWC,DESPL,DESPLd,DESPLw,DESSL,DESSLd,DESWLsy,...,WORD_PROPERTY_WRDHYPn,WORD_PROPERTY_WRDHYPv,WORD_PROPERTY_WRDHYPnv,WORD_PROPERTY_AOA,WORD_PROPERTY_AOA_MAX,WORD_PROPERTY_CONCRETENESS,WORD_PROPERTY_PREVALENCE,WORD_PROPERTY_PREVALENCE_MIN,WORD_SET_INCIDENCE_C4_COMMON_WORDS,Polarity
0,0.000000,0.666667,0.000000,0.187192,0.000000,0.000000,0.103911,0.903743,0.000000,1.000000,...,0.244583,0.283489,0.388215,0.687937,0.396186,0.434696,0.637852,0.637852,0.685198,Fake
1,0.004184,0.333333,0.272727,0.325123,0.125000,0.272727,0.157542,0.331016,0.509944,0.109532,...,0.267995,0.338908,0.076927,0.051748,0.309322,0.676325,0.646687,0.646687,0.252618,Fake
2,0.008368,0.333333,0.363636,0.354680,0.166667,0.363636,0.177654,0.254011,0.442546,0.199330,...,0.471208,0.183434,0.469044,0.238065,0.297608,0.756879,0.615946,0.615946,0.546360,Fake
3,0.012552,0.333333,0.272727,0.394089,0.125000,0.272727,0.197765,0.398396,0.388716,0.191279,...,0.467174,0.177570,0.512643,0.110266,0.147246,0.899076,0.361177,0.361177,0.367094,Fake
4,0.016736,0.333333,0.454545,0.379310,0.208333,0.454545,0.177654,0.195569,0.227758,0.139367,...,0.622575,0.380685,0.427168,0.267677,0.617585,0.241192,0.460505,0.460505,0.550734,Fake
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,0.983264,0.000000,0.176471,0.273810,0.207547,0.176471,0.389978,0.282569,0.164698,0.237409,...,0.506349,0.419364,0.515360,0.293080,0.263963,0.566979,0.568377,0.568377,0.623859,True
476,0.987448,0.857143,0.117647,0.250000,0.015094,0.037203,0.091503,0.380734,0.384460,0.002353,...,0.455246,0.066528,0.234069,0.097540,0.281215,0.426243,0.741069,0.741069,0.387643,True
477,0.991632,0.285714,0.000000,0.103175,0.000000,0.048029,0.129751,0.642202,0.000000,0.353339,...,0.592211,0.281358,0.521021,0.352350,0.582677,0.812008,0.687372,0.687372,0.242686,True
478,0.995816,0.000000,0.470588,0.519841,0.490566,0.352941,0.578794,0.144954,0.189306,0.194554,...,0.492107,0.000000,0.305626,0.095886,0.388076,0.464801,0.698401,0.698401,0.143949,True


In [65]:
all_fts = all_fts.drop(labels="id", axis=1)

In [66]:
all_fts.shape

(480, 63)

In [67]:
features = list(all_fts.columns)
print(features)

['DESPC', 'DESSC', 'DESWC', 'DESPL', 'DESPLd', 'DESPLw', 'DESSL', 'DESSLd', 'DESWLsy', 'DESWLsyd', 'DESWLlt', 'DESWLltd', 'LDTTRc', 'LDTTRa', 'LDMTLD', 'LDHDD', 'SYNLE', 'SYNNP', 'SYNMEDpos', 'SYNMEDwrd', 'SYNMEDlem', 'SYNSTRUTa', 'SYNSTRUTt', 'RDFRE', 'READFKGL', 'TOKEN_ATTRIBUTE_RATIO_ALHPA', 'TOKEN_ATTRIBUTE_RATIO_DIGIT', 'TOKEN_ATTRIBUTE_RATIO_PUNCT', 'TOKEN_ATTRIBUTE_RATIO_URL', 'TOKEN_ATTRIBUTE_RATIO_EMAIL', 'WORD_SET_INCIDENCE_WRDPRP1s', 'WORD_SET_INCIDENCE_WRDPRP1p', 'WORD_SET_INCIDENCE_WRDPRP2', 'WORD_SET_INCIDENCE_WRDPRP3s', 'WORD_SET_INCIDENCE_WRDPRP3p', 'WORD_SET_INCIDENCE_CNCCaus', 'WORD_SET_INCIDENCE_CNCLogic', 'WORD_SET_INCIDENCE_CNCTemp', 'WORD_SET_INCIDENCE_CNCAdd', 'WORD_SET_INCIDENCE_CNCPos', 'WORD_SET_INCIDENCE_CNCNeg', 'WORD_PROPERTY_WRDNOUN', 'WORD_PROPERTY_WRDVERB', 'WORD_PROPERTY_WRDADJ', 'WORD_PROPERTY_WRDADV', 'WORD_PROPERTY_WRDFRQc', 'WORD_PROPERTY_WRDFRQa', 'WORD_PROPERTY_WRDFRQmc', 'WORD_PROPERTY_WRDFAMc', 'WORD_PROPERTY_WRDCNCc', 'WORD_PROPERTY_WRDIMGc', '

In [68]:
nan_dict = {}
for col in all_fts:
    nan_dict[col] = all_fts[col].isna().sum()
print(nan_dict)


{'DESPC': 0, 'DESSC': 0, 'DESWC': 0, 'DESPL': 0, 'DESPLd': 30, 'DESPLw': 0, 'DESSL': 0, 'DESSLd': 0, 'DESWLsy': 0, 'DESWLsyd': 0, 'DESWLlt': 0, 'DESWLltd': 0, 'LDTTRc': 0, 'LDTTRa': 0, 'LDMTLD': 0, 'LDHDD': 0, 'SYNLE': 0, 'SYNNP': 0, 'SYNMEDpos': 0, 'SYNMEDwrd': 0, 'SYNMEDlem': 0, 'SYNSTRUTa': 0, 'SYNSTRUTt': 480, 'RDFRE': 0, 'READFKGL': 0, 'TOKEN_ATTRIBUTE_RATIO_ALHPA': 0, 'TOKEN_ATTRIBUTE_RATIO_DIGIT': 0, 'TOKEN_ATTRIBUTE_RATIO_PUNCT': 0, 'TOKEN_ATTRIBUTE_RATIO_URL': 0, 'TOKEN_ATTRIBUTE_RATIO_EMAIL': 480, 'WORD_SET_INCIDENCE_WRDPRP1s': 0, 'WORD_SET_INCIDENCE_WRDPRP1p': 0, 'WORD_SET_INCIDENCE_WRDPRP2': 0, 'WORD_SET_INCIDENCE_WRDPRP3s': 0, 'WORD_SET_INCIDENCE_WRDPRP3p': 0, 'WORD_SET_INCIDENCE_CNCCaus': 0, 'WORD_SET_INCIDENCE_CNCLogic': 0, 'WORD_SET_INCIDENCE_CNCTemp': 0, 'WORD_SET_INCIDENCE_CNCAdd': 0, 'WORD_SET_INCIDENCE_CNCPos': 0, 'WORD_SET_INCIDENCE_CNCNeg': 0, 'WORD_PROPERTY_WRDNOUN': 0, 'WORD_PROPERTY_WRDVERB': 0, 'WORD_PROPERTY_WRDADJ': 0, 'WORD_PROPERTY_WRDADV': 0, 'WORD_PROPER

In [69]:
for k in nan_dict:
    if nan_dict[k] > 0:
        all_fts = all_fts.drop(labels=k, axis = 1)

data = all_fts

In [70]:
features = list(all_fts.columns)
print(features)

['DESPC', 'DESSC', 'DESWC', 'DESPL', 'DESPLw', 'DESSL', 'DESSLd', 'DESWLsy', 'DESWLsyd', 'DESWLlt', 'DESWLltd', 'LDTTRc', 'LDTTRa', 'LDMTLD', 'LDHDD', 'SYNLE', 'SYNNP', 'SYNMEDpos', 'SYNMEDwrd', 'SYNMEDlem', 'SYNSTRUTa', 'RDFRE', 'READFKGL', 'TOKEN_ATTRIBUTE_RATIO_ALHPA', 'TOKEN_ATTRIBUTE_RATIO_DIGIT', 'TOKEN_ATTRIBUTE_RATIO_PUNCT', 'TOKEN_ATTRIBUTE_RATIO_URL', 'WORD_SET_INCIDENCE_WRDPRP1s', 'WORD_SET_INCIDENCE_WRDPRP1p', 'WORD_SET_INCIDENCE_WRDPRP2', 'WORD_SET_INCIDENCE_WRDPRP3s', 'WORD_SET_INCIDENCE_WRDPRP3p', 'WORD_SET_INCIDENCE_CNCCaus', 'WORD_SET_INCIDENCE_CNCLogic', 'WORD_SET_INCIDENCE_CNCTemp', 'WORD_SET_INCIDENCE_CNCAdd', 'WORD_SET_INCIDENCE_CNCPos', 'WORD_SET_INCIDENCE_CNCNeg', 'WORD_PROPERTY_WRDNOUN', 'WORD_PROPERTY_WRDVERB', 'WORD_PROPERTY_WRDADJ', 'WORD_PROPERTY_WRDADV', 'WORD_PROPERTY_WRDFRQc', 'WORD_PROPERTY_WRDFRQa', 'WORD_PROPERTY_WRDFRQmc', 'WORD_PROPERTY_WRDFAMc', 'WORD_PROPERTY_WRDCNCc', 'WORD_PROPERTY_WRDIMGc', 'WORD_PROPERTY_WRDMEAc', 'WORD_PROPERTY_WRDPOLc', 'WORD

In [75]:
# models = [LogisticRegression()]

models = [KNeighborsClassifier(), RandomForestClassifier()]

# models = [SVC(kernel ="linear")]

# models = [LogisticRegression(), SVC(kernel ="linear"), KNeighborsClassifier(), RandomForestClassifier()]



In [89]:
from sklearn import preprocessing
from sklearn import utils

if data.Polarity[0] != 0 or 1:
  data.Polarity[data.Polarity == 'Fake'] = 0
  data.Polarity[data.Polarity == 'True'] = 1

outcomes = ["Fake","Real"]
print("unique Polairty labels:", data.Polarity.unique())
  
data = all_fts
feature_cols = features

X = data[feature_cols]
y = data.Polarity #outcomes 0 or 1

lab = preprocessing.LabelEncoder()
y_transformed = lab.fit_transform(y)

print("Info: {} features were passed at the fit step\n:".format(X.shape[1]))
for feature in feature_cols:
  print(feature)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=16)

model = RandomForestClassifier().fit(X_train, y_train)

y_pred = model.predict(X_test)

unique Polairty labels: [0 1]
Info: 60 features were passed at the fit step
:
DESPC
DESSC
DESWC
DESPL
DESPLw
DESSL
DESSLd
DESWLsy
DESWLsyd
DESWLlt
DESWLltd
LDTTRc
LDTTRa
LDMTLD
LDHDD
SYNLE
SYNNP
SYNMEDpos
SYNMEDwrd
SYNMEDlem
SYNSTRUTa
RDFRE
READFKGL
TOKEN_ATTRIBUTE_RATIO_ALHPA
TOKEN_ATTRIBUTE_RATIO_DIGIT
TOKEN_ATTRIBUTE_RATIO_PUNCT
TOKEN_ATTRIBUTE_RATIO_URL
WORD_SET_INCIDENCE_WRDPRP1s
WORD_SET_INCIDENCE_WRDPRP1p
WORD_SET_INCIDENCE_WRDPRP2
WORD_SET_INCIDENCE_WRDPRP3s
WORD_SET_INCIDENCE_WRDPRP3p
WORD_SET_INCIDENCE_CNCCaus
WORD_SET_INCIDENCE_CNCLogic
WORD_SET_INCIDENCE_CNCTemp
WORD_SET_INCIDENCE_CNCAdd
WORD_SET_INCIDENCE_CNCPos
WORD_SET_INCIDENCE_CNCNeg
WORD_PROPERTY_WRDNOUN
WORD_PROPERTY_WRDVERB
WORD_PROPERTY_WRDADJ
WORD_PROPERTY_WRDADV
WORD_PROPERTY_WRDFRQc
WORD_PROPERTY_WRDFRQa
WORD_PROPERTY_WRDFRQmc
WORD_PROPERTY_WRDFAMc
WORD_PROPERTY_WRDCNCc
WORD_PROPERTY_WRDIMGc
WORD_PROPERTY_WRDMEAc
WORD_PROPERTY_WRDPOLc
WORD_PROPERTY_WRDHYPn
WORD_PROPERTY_WRDHYPv
WORD_PROPERTY_WRDHYPnv
WORD_PROPER

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.Polarity[data.Polarity == 'Fake'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.Polarity[data.Polarity == 'True'] = 1


ValueError: Unknown label type: unknown. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [None]:
model_list_dicts =[]

for model in models:
  
  model = model.fit(X_train, y_train)

  y_pred = model.predict(X_test)

  model_name = str(model)[:-2]

  if model_name == "SVC(kernel=\'linear":

    model_name = "SVC"

  model_dict = {}

  ft_str = str(features)[2:-2].replace("'","_").replace(",", "_").replace(" ", "_").replace("___", "_")

  model_dict["name"] = ft_str + "__" + model_name

  model_dict["features"] = features

  model_dict["model"] = model_name

  print("\n",model_name, "\n", "\n")

  accuracy = accuracy_score(y_test, y_pred)

  model_dict["accuracy"] = accuracy

  print("OVERALL ACCURACY", model_name, ":", round(accuracy*100, 2),"%""\n")
  
  
  scores = class_report(y_test, y_pred)

  model_dict["report"] = scores

  print("\n")

  
  cv_score = cross_val_score(model, X, y, cv = 5)
  print("Accuracy {} for each of the 5 iterations:".format(model_name))
  for score in cv_score:
  print(round(score*100, 2), "%")

  mean_accuracy = sum(cv_score)/len(cv_score)

  model_dict["cv_scores"] = cv_score

  model_dict["mean_cv_accuracy"] = mean_accuracy

  print("\n")



  print("\nCross validation mean accuracy for {}:".format(model_name), round(mean_accuracy*100, 2),"%")
  print("\n-------------------------------------------------------------------------\n")

  confussion_matrix = conf_matrix(model_name, features, y_pred, y_test, cmap="magma")

  model_dict["confussion_matrix"] = confussion_matrix

  model_list_dicts.append(model_dict)
  
# def class_report(y_test, y_pred):

#   outcomes = ["Fake","Real"]

#   scores = classification_report(y_test, y_pred, target_names=outcomes)

#   print(scores)

#   return scores

# model_name = str(model)[:-2]

# if model_name == "SVC(kernel=\'linear":

#   model_name = "SVC"


In [None]:
# magic(all_fts, features)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.Polarity[data.Polarity == 'Fake'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.Polarity[data.Polarity == 'True'] = 1


Info: 60 features were passed at the fit step
:
DESPC
DESSC
DESWC
DESPL
DESPLw
DESSL
DESSLd
DESWLsy
DESWLsyd
DESWLlt
DESWLltd
LDTTRc
LDTTRa
LDMTLD
LDHDD
SYNLE
SYNNP
SYNMEDpos
SYNMEDwrd
SYNMEDlem
SYNSTRUTa
RDFRE
READFKGL
TOKEN_ATTRIBUTE_RATIO_ALHPA
TOKEN_ATTRIBUTE_RATIO_DIGIT
TOKEN_ATTRIBUTE_RATIO_PUNCT
TOKEN_ATTRIBUTE_RATIO_URL
WORD_SET_INCIDENCE_WRDPRP1s
WORD_SET_INCIDENCE_WRDPRP1p
WORD_SET_INCIDENCE_WRDPRP2
WORD_SET_INCIDENCE_WRDPRP3s
WORD_SET_INCIDENCE_WRDPRP3p
WORD_SET_INCIDENCE_CNCCaus
WORD_SET_INCIDENCE_CNCLogic
WORD_SET_INCIDENCE_CNCTemp
WORD_SET_INCIDENCE_CNCAdd
WORD_SET_INCIDENCE_CNCPos
WORD_SET_INCIDENCE_CNCNeg
WORD_PROPERTY_WRDNOUN
WORD_PROPERTY_WRDVERB
WORD_PROPERTY_WRDADJ
WORD_PROPERTY_WRDADV
WORD_PROPERTY_WRDFRQc
WORD_PROPERTY_WRDFRQa
WORD_PROPERTY_WRDFRQmc
WORD_PROPERTY_WRDFAMc
WORD_PROPERTY_WRDCNCc
WORD_PROPERTY_WRDIMGc
WORD_PROPERTY_WRDMEAc
WORD_PROPERTY_WRDPOLc
WORD_PROPERTY_WRDHYPn
WORD_PROPERTY_WRDHYPv
WORD_PROPERTY_WRDHYPnv
WORD_PROPERTY_AOA
WORD_PROPERTY_AOA_MAX
W

ValueError: Unknown label type: unknown. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [None]:
n_features = len(features)

confussion_matrix = confusion_matrix(y_pred, y_test)

outcomes = ["Fake","Real"]
ticks = np.arange(len(outcomes))

fig, ax = plt.subplots()
plt.xticks(ticks, outcomes)
plt.yticks(ticks, outcomes)
sns.heatmap(pd.DataFrame(confussion_matrix), annot=True, cmap=cmap, fmt="g", xticklabels=outcomes, yticklabels=outcomes)
ax.xaxis.set_label_position("top")
plt.tight_layout()


plt.ylabel("Actual label")
plt.xlabel("Predicted label")