# Instagram Fake Account Detection

## Import libraries

In [1]:
from dataset.normalizer import json_importer_full
from dataset.utils import shuffle_and_split

import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn import tree, metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from deep.IJECE.IJECE_custom import run_model as run_ijce_custom
from deep.IJECE.IJECE_default import run_model as run_ijce_default
from deep.spz.spz_default import run_model as run_spz_default
from deep.spz.spz_custom import run_model as run_spz_custom

2023-06-29 17:41:51.799886: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-29 17:41:51.840050: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:7630] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-06-29 17:41:51.840083: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-06-29 17:41:51.840103: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-06-29 17:41:51.847390: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-29 17:41:51.848031: I tensorflow/core/platform/cpu_feature_guard.cc:182] This Tens

Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.


In [2]:
# Switch off Keras' interactive logging for the whole session so that the
# repeated model runs further down do not flood the cell output.
tf.keras.utils.disable_interactive_logging()

## Import data

In [3]:
# Load the two account collections from their JSON sources.
# NOTE(review): the boolean second argument presumably marks the entries as
# fake (True) or genuine (False) — confirm in dataset.normalizer.
fake = json_importer_full("./dataset/sources/automatedAccountData.json", True)
correct = json_importer_full("./dataset/sources/nonautomatedAccountData.json", False)

Now loading from file ./dataset/sources/automatedAccountData.json...
Loaded 700 entries from source ./dataset/sources/automatedAccountData.json
Now loading from file ./dataset/sources/nonautomatedAccountData.json...
Loaded 700 entries from source ./dataset/sources/nonautomatedAccountData.json


## EXPERIMENT 29 - 06

### Macros

In [4]:
# Repetitions per experiment() call (passed as `n_iter`).
N_EXP = 500
# Iteration cap handed to LogisticRegression as `max_iter`.
MAX_ITER = 50_000

### Functions

Custom experiment functions are defined here so as not to interfere with the real experiments.

In [5]:
def get_custom_dataset(train_df, validation_df, column_names=None):
    """Return copies of both frames with the given columns removed.

    Parameters
    ----------
    train_df, validation_df : pandas.DataFrame
        Frames to strip; neither is modified.
    column_names : list of str, optional
        Column labels to drop from both frames. ``None`` (the default) or an
        empty list drops nothing.

    Returns
    -------
    tuple
        ``(custom_train_df, custom_validation_df)``.

    Raises
    ------
    KeyError
        Raised by pandas when a requested label is missing from a frame.
    """
    # A None sentinel avoids sharing one mutable list between calls.
    columns_to_drop = [] if column_names is None else column_names

    custom_train_df = train_df.drop(columns=columns_to_drop)
    custom_validation_df = validation_df.drop(columns=columns_to_drop)

    return custom_train_df, custom_validation_df

In [6]:
def get_scores(y_val, y_pred):
    """Compute the classification metrics for one set of predictions.

    Parameters
    ----------
    y_val : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1'``, each
        computed by the matching ``sklearn.metrics`` function with its
        default settings.
    """
    return {
        'accuracy': metrics.accuracy_score(y_val, y_pred),
        'precision': metrics.precision_score(y_val, y_pred),
        'recall': metrics.recall_score(y_val, y_pred),
        'f1': metrics.f1_score(y_val, y_pred),
    }

In [7]:
def naive_bayes(x, y_i, y):
    """Add-one smoothed per-feature average over the rows labelled `y_i`.

    Sums every column of `x` over the rows where ``y == y_i``, adds 1, and
    divides by the number of such rows plus 1.
    """
    in_class = (y == y_i)
    feature_totals = x[in_class].sum(0)
    class_size = in_class.sum()
    return (feature_totals + 1) / (class_size + 1)


def naive_bayes_support_vector_machine(x, y):
    """Fit a logistic regression on features scaled by the NB log-ratio.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature columns.
    y : pandas.Series
        Labels; the ratio is taken between the rows labelled 1 and 0.

    Returns
    -------
    tuple
        ``(fitted_model, r)``, where ``r`` is the log-ratio that must also be
        multiplied into any features passed to ``fitted_model.predict``.
    """
    labels = y.values
    log_ratio = np.log(naive_bayes(x, 1, labels) / naive_bayes(x, 0, labels))
    scaled_features = x.multiply(log_ratio)
    # The original author notes that dual=True gives an error here, hence dual=False.
    classifier = LogisticRegression(C=4, dual=False, max_iter=MAX_ITER)
    return classifier.fit(scaled_features, labels), log_ratio

In [8]:
def f1_score(precision, recall):
    """Return the F1 score, i.e. the harmonic mean of precision and recall.

    Parameters
    ----------
    precision, recall : float
        Metric values to combine.

    Returns
    -------
    float
        ``2 * precision * recall / (precision + recall)``, or ``0.0`` when
        both inputs are zero (the ratio is undefined in that case).
    """
    denominator = precision + recall
    # A model that never predicts the positive class gives precision == recall == 0;
    # report 0.0 instead of raising ZeroDivisionError and aborting the whole run.
    if denominator == 0:
        return 0.0
    return 2 * (precision * recall) / denominator

In [9]:
def _fit_model(mode, train_df, dl_runner):
    """Fit a fresh model of kind `mode` on `train_df` (last column is the label).

    Returns ``(clf, r)``; ``r`` is the Naive-Bayes log-ratio for mode "nb" and None otherwise.
    """
    X_train, y_train = train_df.iloc[:, :-1], train_df.iloc[:, -1]
    if mode == "dt":
        return tree.DecisionTreeClassifier().fit(X_train, y_train), None
    if mode == "lr":
        return LogisticRegression(random_state=0, max_iter=MAX_ITER).fit(X_train, y_train), None
    if mode == "nb":
        # NBSVM (Naive Bayes - Support Vector Machine), but using sklearn's logistic
        # regression rather than SVM, although in practice the two are nearly identical.
        # NBSVM was introduced by Sida Wang and Chris Manning in the paper
        # [Baselines and Bigrams: Simple, Good Sentiment and Topic Classification]
        # (https://nlp.stanford.edu/pubs/sidaw12_simple_sentiment.pdf).
        return naive_bayes_support_vector_machine(X_train, y_train)
    if mode == "rf":
        return RandomForestClassifier(max_depth=2, random_state=0).fit(X_train, y_train), None
    # mode == "dl": the runner receives the whole frame, label column included.
    return dl_runner(train_df), None


def _evaluate_model(mode, clf, r, validation_df, n_eval=100):
    """Score `clf` on `validation_df` and return a dict with accuracy, precision, recall and f1."""
    X_val, y_val = validation_df.iloc[:, :-1], validation_df.iloc[:, -1]
    if mode == "dl":
        accuracy = 0
        precision = 0
        recall = 0
        # The network is evaluated `n_eval` times and the metrics are averaged.
        for _ in range(n_eval):
            _, acc, prc, rec = clf.evaluate(x=X_val, y=y_val, verbose=0)
            accuracy += acc
            precision += prc
            recall += rec
        return {
            'accuracy': accuracy / n_eval,
            'precision': precision / n_eval,
            'recall': recall / n_eval,
            'f1': f1_score(precision / n_eval, recall / n_eval),
        }
    if mode == "nb":
        # The NB model was trained on ratio-scaled features, so scale here too.
        y_pred = clf.predict(X_val.multiply(r))
    else:
        y_pred = clf.predict(X_val)
    return get_scores(y_val, y_pred)


def experiment(fake, correct, column_names=None, mode="dt", n_iter=20, demarcator=700):
    '''
    Run an experiment that fits a model `n_iter` times, each time on a fresh
    shuffle/split, and print the average `accuracy`, `precision`, `recall` and `f1`.

    Every iteration trains and scores two models on the same split:
    - `default` => all columns
    - `custom`  => the same data with `column_names` dropped

    `modes`:
    - `dt` => DecisionTree
    - `lr` => LogisticRegression
    - `nb` => NaiveBayes (NB-SVM, but using LogisticRegression instead)
    - `rf` => RandomForest approach
    - `dl` => DeepLearning approach using neural networks

    Parameters:
    - `fake`, `correct`: the two datasets, passed unchanged to `shuffle_and_split`
    - `column_names`: columns removed for the `custom` model (None or [] removes nothing)
    - `mode`: one of the modes listed above
    - `n_iter`: number of shuffle/split/fit repetitions to average over
    - `demarcator`: currently unused; kept for backward compatibility

    Returns None after printing the averaged scores, or -1 (without running
    anything) when `mode` is not recognised.
    '''
    # None sentinel instead of a shared mutable default list.
    if column_names is None:
        column_names = []

    mode_labels = {
        "dt": "Decision Trees",
        "lr": "Logistic Regression",
        "nb": "Naive Bayes (Logistic Regression)",
        "rf": "Random Forests",
        "dl": "Deep Learning",
    }
    if mode not in mode_labels:
        return -1
    print(f"Calculating metrics for {mode_labels[mode]} over {n_iter} times")

    avg_scores = {
        'default': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},
        'custom': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0}
    }

    for i in range(n_iter):
        # Get new train_df and validation_df, same for default and custom
        train_df, validation_df = shuffle_and_split(fake, correct)
        custom_train_df, custom_validation_df = get_custom_dataset(train_df, validation_df, column_names)

        # Both arms share one code path, so they differ only in their columns
        # (and in the network builder for "dl"). In particular, "lr" now uses
        # MAX_ITER for both arms instead of 2500 for the custom one.
        arms = (
            ('default', train_df, validation_df, run_spz_default),
            ('custom', custom_train_df, custom_validation_df, run_spz_custom),
        )
        for arm, arm_train_df, arm_validation_df, dl_runner in arms:
            if mode == "dl":
                print(f"Training {arm} model {i + 1}/{n_iter}      ", end="\r")
            clf, r = _fit_model(mode, arm_train_df, dl_runner)
            scores = _evaluate_model(mode, clf, r, arm_validation_df)
            for metric in avg_scores[arm]:
                avg_scores[arm][metric] += scores[metric]

        print(f"{i + 1}/{n_iter}                            ", end="\r")

    # Averaging
    for arm_totals in avg_scores.values():
        for metric in arm_totals:
            arm_totals[metric] /= n_iter

    print('Done!')

    for label, metric in (("Accuracy", 'accuracy'), ("Precision", 'precision'),
                          ("Recall", 'recall'), ("F1", 'f1')):
        print("{} - Default {:.3f}; Custom {:.3f}".format(label,
                                                          avg_scores['default'][metric],
                                                          avg_scores['custom'][metric]))
    print("=============================")
    # The averages are only printed; nothing is returned, so call cells show no dict.
    return

## Evaluate impact upon removing single-attributes

The impact (bad/good) on performance is also rated on a scale from 1 (very small) to 5 (very large).

In [10]:
# List the column labels of the loaded data: these are the attributes removed
# one at a time in the sections that follow (`fake` appears to be the label).
print(pd.DataFrame.from_dict(fake).columns)

Index(['nmedia', 'biol', 'url', 'nfollowing', 'nfollower', 'mediaLikeNumbers',
       'mediaHashtagNumbers', 'followerToFollowing', 'hasMedia',
       'userHasHighlighReels', 'usernameLength', 'usernameDigitCount', 'fake'],
      dtype='object')


### nmedia (keep)

In [11]:
# Effect of dropping `nmedia`, measured with the tree-based models only.
experiment(fake, correct, column_names=['nmedia'], mode="dt", n_iter=N_EXP)   # DecisionTree
experiment(fake, correct, column_names=['nmedia'], mode="rf", n_iter=N_EXP)   # RandomForest
# Remaining modes are left disabled for this run:
# experiment(fake, correct, column_names=['nmedia'], mode="lr", n_iter=N_EXP)   # LogisticRegressor
# experiment(fake, correct, column_names=['nmedia'], mode="nb", n_iter=N_EXP)   # NaiveBayes (with lr inside)
# experiment(fake, correct, column_names=['nmedia'], mode="dl", n_iter=N_EXP)   # NeuralNetwork

Calculating metrics for Decision Trees over 500 times
Done!00                            
Accuracy - Default 0.907; Custom 0.907
Precision - Default 0.908; Custom 0.908
Recall - Default 0.907; Custom 0.906
F1 - Default 0.907; Custom 0.907
Calculating metrics for Random Forests over 500 times
Done!00                            
Accuracy - Default 0.924; Custom 0.922
Precision - Default 0.940; Custom 0.942
Recall - Default 0.907; Custom 0.900
F1 - Default 0.923; Custom 0.920


{'default': {'accuracy': 0.9243080568720435,
  'precision': 0.9397988307100177,
  'recall': 0.9069763033175329,
  'f1': 0.9229456538509285},
 'custom': {'accuracy': 0.921815165876782,
  'precision': 0.9416056549228493,
  'recall': 0.8996872037914648,
  'f1': 0.9199956882724569}}

### biol (keep)

In [12]:
# Effect of dropping `biol`, measured with the tree-based models only.
experiment(fake, correct, column_names=['biol'], mode="dt", n_iter=N_EXP)   # DecisionTree
experiment(fake, correct, column_names=['biol'], mode="rf", n_iter=N_EXP)   # RandomForest
# Remaining modes are left disabled for this run:
# experiment(fake, correct, column_names=['biol'], mode="lr", n_iter=N_EXP)   # LogisticRegressor
# experiment(fake, correct, column_names=['biol'], mode="nb", n_iter=N_EXP)   # NaiveBayes (with lr inside)
# experiment(fake, correct, column_names=['biol'], mode="dl", n_iter=N_EXP)   # NeuralNetwork

Calculating metrics for Decision Trees over 500 times
Done!00                            
Accuracy - Default 0.907; Custom 0.899
Precision - Default 0.908; Custom 0.899
Recall - Default 0.908; Custom 0.901
F1 - Default 0.907; Custom 0.899
Calculating metrics for Random Forests over 500 times
Done!00                            
Accuracy - Default 0.924; Custom 0.918
Precision - Default 0.939; Custom 0.943
Recall - Default 0.907; Custom 0.890
F1 - Default 0.923; Custom 0.915


{'default': {'accuracy': 0.9239194312796265,
  'precision': 0.939120851129531,
  'recall': 0.9068720379146892,
  'f1': 0.9225674046179021},
 'custom': {'accuracy': 0.9178815165876797,
  'precision': 0.9431410454327989,
  'recall': 0.8896492890995211,
  'f1': 0.9154274776943455}}

### url (drop)

In [13]:
# Effect of dropping `url`, measured with the tree-based models only.
experiment(fake, correct, column_names=['url'], mode="dt", n_iter=N_EXP)   # DecisionTree
experiment(fake, correct, column_names=['url'], mode="rf", n_iter=N_EXP)   # RandomForest
# Remaining modes are left disabled for this run:
# experiment(fake, correct, column_names=['url'], mode="lr", n_iter=N_EXP)   # LogisticRegressor
# experiment(fake, correct, column_names=['url'], mode="nb", n_iter=N_EXP)   # NaiveBayes (with lr inside)
# experiment(fake, correct, column_names=['url'], mode="dl", n_iter=N_EXP)   # NeuralNetwork

Calculating metrics for Decision Trees over 500 times
Done!00                            
Accuracy - Default 0.907; Custom 0.907
Precision - Default 0.908; Custom 0.908
Recall - Default 0.907; Custom 0.907
F1 - Default 0.907; Custom 0.907
Calculating metrics for Random Forests over 500 times
Done!00                            
Accuracy - Default 0.925; Custom 0.921
Precision - Default 0.939; Custom 0.941
Recall - Default 0.909; Custom 0.898
F1 - Default 0.924; Custom 0.919


{'default': {'accuracy': 0.924890995260668,
  'precision': 0.9391887565515636,
  'recall': 0.9089289099526032,
  'f1': 0.9236532486408021},
 'custom': {'accuracy': 0.9209383886255956,
  'precision': 0.941131471429088,
  'recall': 0.8983412322274829,
  'f1': 0.9190725368017605}}

### nfollowing (try)

In [14]:
# Effect of dropping `nfollowing`, measured with the tree-based models only.
experiment(fake, correct, column_names=['nfollowing'], mode="dt", n_iter=N_EXP)   # DecisionTree
experiment(fake, correct, column_names=['nfollowing'], mode="rf", n_iter=N_EXP)   # RandomForest
# Remaining modes are left disabled for this run:
# experiment(fake, correct, column_names=['nfollowing'], mode="lr", n_iter=N_EXP)   # LogisticRegressor
# experiment(fake, correct, column_names=['nfollowing'], mode="nb", n_iter=N_EXP)   # NaiveBayes (with lr inside)
# experiment(fake, correct, column_names=['nfollowing'], mode="dl", n_iter=N_EXP)   # NeuralNetwork

Calculating metrics for Decision Trees over 500 times
Done!00                            
Accuracy - Default 0.907; Custom 0.906
Precision - Default 0.907; Custom 0.906
Recall - Default 0.908; Custom 0.908
F1 - Default 0.907; Custom 0.906
Calculating metrics for Random Forests over 500 times
Done!00                            
Accuracy - Default 0.925; Custom 0.910
Precision - Default 0.939; Custom 0.932
Recall - Default 0.909; Custom 0.884
F1 - Default 0.924; Custom 0.907


{'default': {'accuracy': 0.9249004739336543,
  'precision': 0.9391661731593075,
  'recall': 0.9089194312796186,
  'f1': 0.9236487164524075},
 'custom': {'accuracy': 0.9096255924170576,
  'precision': 0.9320410626093761,
  'recall': 0.8840189573459662,
  'f1': 0.9071909033168793}}

### nfollower (try)

In [15]:
# Effect of dropping `nfollower`, measured with the tree-based models only.
experiment(fake, correct, column_names=['nfollower'], mode="dt", n_iter=N_EXP)   # DecisionTree
experiment(fake, correct, column_names=['nfollower'], mode="rf", n_iter=N_EXP)   # RandomForest
# Remaining modes are left disabled for this run:
# experiment(fake, correct, column_names=['nfollower'], mode="lr", n_iter=N_EXP)   # LogisticRegressor
# experiment(fake, correct, column_names=['nfollower'], mode="nb", n_iter=N_EXP)   # NaiveBayes (with lr inside)
# experiment(fake, correct, column_names=['nfollower'], mode="dl", n_iter=N_EXP)   # NeuralNetwork

Calculating metrics for Decision Trees over 500 times
Done!00                            
Accuracy - Default 0.907; Custom 0.907
Precision - Default 0.907; Custom 0.910
Recall - Default 0.909; Custom 0.903
F1 - Default 0.908; Custom 0.906
Calculating metrics for Random Forests over 500 times
Done!00                            
Accuracy - Default 0.925; Custom 0.921
Precision - Default 0.940; Custom 0.934
Recall - Default 0.909; Custom 0.906
F1 - Default 0.924; Custom 0.920


{'default': {'accuracy': 0.9250568720379199,
  'precision': 0.9397678421324944,
  'recall': 0.9086350710900447,
  'f1': 0.9237760803341596},
 'custom': {'accuracy': 0.9209336492891036,
  'precision': 0.9341237188838657,
  'recall': 0.9060568720379116,
  'f1': 0.9196987050382633}}

### mediaLikeNumbers (drop)

In [16]:
# Effect of dropping `mediaLikeNumbers`, measured with the tree-based models only.
experiment(fake, correct, column_names=['mediaLikeNumbers'], mode="dt", n_iter=N_EXP)   # DecisionTree
experiment(fake, correct, column_names=['mediaLikeNumbers'], mode="rf", n_iter=N_EXP)   # RandomForest
# Remaining modes are left disabled for this run:
# experiment(fake, correct, column_names=['mediaLikeNumbers'], mode="lr", n_iter=N_EXP)   # LogisticRegressor
# experiment(fake, correct, column_names=['mediaLikeNumbers'], mode="nb", n_iter=N_EXP)   # NaiveBayes (with lr inside)
# experiment(fake, correct, column_names=['mediaLikeNumbers'], mode="dl", n_iter=N_EXP)   # NeuralNetwork

Calculating metrics for Decision Trees over 500 times
Done!00                            
Accuracy - Default 0.906; Custom 0.909
Precision - Default 0.907; Custom 0.908
Recall - Default 0.906; Custom 0.910
F1 - Default 0.906; Custom 0.909
Calculating metrics for Random Forests over 500 times
Done!00                            
Accuracy - Default 0.925; Custom 0.925
Precision - Default 0.941; Custom 0.942
Recall - Default 0.908; Custom 0.905
F1 - Default 0.924; Custom 0.923


{'default': {'accuracy': 0.9251279620853143,
  'precision': 0.9406578640037537,
  'recall': 0.9077914691943099,
  'f1': 0.9237723148832673},
 'custom': {'accuracy': 0.9245355450237023,
  'precision': 0.9422991244031071,
  'recall': 0.9047109004739317,
  'f1': 0.9229564958208265}}

### mediaHashtagNumbers (keep)

In [17]:
# Effect of dropping `mediaHashtagNumbers`, measured with the tree-based models only.
experiment(fake, correct, column_names=['mediaHashtagNumbers'], mode="dt", n_iter=N_EXP)   # DecisionTree
experiment(fake, correct, column_names=['mediaHashtagNumbers'], mode="rf", n_iter=N_EXP)   # RandomForest
# Remaining modes are left disabled for this run:
# experiment(fake, correct, column_names=['mediaHashtagNumbers'], mode="lr", n_iter=N_EXP)   # LogisticRegressor
# experiment(fake, correct, column_names=['mediaHashtagNumbers'], mode="nb", n_iter=N_EXP)   # NaiveBayes (with lr inside)
# experiment(fake, correct, column_names=['mediaHashtagNumbers'], mode="dl", n_iter=N_EXP)   # NeuralNetwork

Calculating metrics for Decision Trees over 500 times
Done!00                            
Accuracy - Default 0.906; Custom 0.901
Precision - Default 0.906; Custom 0.901
Recall - Default 0.907; Custom 0.902
F1 - Default 0.906; Custom 0.901
Calculating metrics for Random Forests over 500 times
Done!00                            
Accuracy - Default 0.924; Custom 0.916
Precision - Default 0.939; Custom 0.939
Recall - Default 0.908; Custom 0.889
F1 - Default 0.923; Custom 0.913


{'default': {'accuracy': 0.9244123222748862,
  'precision': 0.9389795752089016,
  'recall': 0.9080947867298556,
  'f1': 0.9231174385378533},
 'custom': {'accuracy': 0.9155213270142183,
  'precision': 0.9387357781608782,
  'recall': 0.8894123222748767,
  'f1': 0.9132011073098545}}

### followerToFollowing (keep)

In [18]:
# Effect of dropping `followerToFollowing`, measured with the tree-based models only.
experiment(fake, correct, column_names=['followerToFollowing'], mode="dt", n_iter=N_EXP)   # DecisionTree
experiment(fake, correct, column_names=['followerToFollowing'], mode="rf", n_iter=N_EXP)   # RandomForest
# Remaining modes are left disabled for this run:
# experiment(fake, correct, column_names=['followerToFollowing'], mode="lr", n_iter=N_EXP)   # LogisticRegressor
# experiment(fake, correct, column_names=['followerToFollowing'], mode="nb", n_iter=N_EXP)   # NaiveBayes (with lr inside)
# experiment(fake, correct, column_names=['followerToFollowing'], mode="dl", n_iter=N_EXP)   # NeuralNetwork

Calculating metrics for Decision Trees over 500 times
Done!00                            
Accuracy - Default 0.907; Custom 0.898
Precision - Default 0.907; Custom 0.899
Recall - Default 0.907; Custom 0.897
F1 - Default 0.907; Custom 0.898
Calculating metrics for Random Forests over 500 times
Done!00                            
Accuracy - Default 0.925; Custom 0.921
Precision - Default 0.939; Custom 0.940
Recall - Default 0.908; Custom 0.900
F1 - Default 0.923; Custom 0.919


{'default': {'accuracy': 0.9245023696682508,
  'precision': 0.9391227980083509,
  'recall': 0.9081611374407548,
  'f1': 0.9232228937043447},
 'custom': {'accuracy': 0.9209763033175385,
  'precision': 0.9398910770710244,
  'recall': 0.8997914691943087,
  'f1': 0.9192322053885508}}

### hasMedia (try)

In [19]:
# Effect of dropping `hasMedia`, measured with the tree-based models only.
experiment(fake, correct, column_names=['hasMedia'], mode="dt", n_iter=N_EXP)   # DecisionTree
experiment(fake, correct, column_names=['hasMedia'], mode="rf", n_iter=N_EXP)   # RandomForest
# Remaining modes are left disabled for this run:
# experiment(fake, correct, column_names=['hasMedia'], mode="lr", n_iter=N_EXP)   # LogisticRegressor
# experiment(fake, correct, column_names=['hasMedia'], mode="nb", n_iter=N_EXP)   # NaiveBayes (with lr inside)
# experiment(fake, correct, column_names=['hasMedia'], mode="dl", n_iter=N_EXP)   # NeuralNetwork

Calculating metrics for Decision Trees over 500 times
Done!00                            
Accuracy - Default 0.907; Custom 0.907
Precision - Default 0.906; Custom 0.906
Recall - Default 0.909; Custom 0.909
F1 - Default 0.907; Custom 0.907
Calculating metrics for Random Forests over 500 times
Done!00                            
Accuracy - Default 0.924; Custom 0.921
Precision - Default 0.938; Custom 0.942
Recall - Default 0.909; Custom 0.898
F1 - Default 0.923; Custom 0.919


{'default': {'accuracy': 0.9242985781990571,
  'precision': 0.9380340296188371,
  'recall': 0.9089004739336473,
  'f1': 0.9230730221678884},
 'custom': {'accuracy': 0.921061611374411,
  'precision': 0.941518092409157,
  'recall': 0.8981990521326977,
  'f1': 0.9191524580168394}}

### userHasHighlighReels (try)

In [20]:
# Effect of dropping `userHasHighlighReels`, measured with the tree-based models only.
experiment(fake, correct, column_names=['userHasHighlighReels'], mode="dt", n_iter=N_EXP)   # DecisionTree
experiment(fake, correct, column_names=['userHasHighlighReels'], mode="rf", n_iter=N_EXP)   # RandomForest
# Remaining modes are left disabled for this run:
# experiment(fake, correct, column_names=['userHasHighlighReels'], mode="lr", n_iter=N_EXP)   # LogisticRegressor
# experiment(fake, correct, column_names=['userHasHighlighReels'], mode="nb", n_iter=N_EXP)   # NaiveBayes (with lr inside)
# experiment(fake, correct, column_names=['userHasHighlighReels'], mode="dl", n_iter=N_EXP)   # NeuralNetwork

Calculating metrics for Decision Trees over 500 times
Done!00                            
Accuracy - Default 0.907; Custom 0.908
Precision - Default 0.908; Custom 0.908
Recall - Default 0.907; Custom 0.908
F1 - Default 0.907; Custom 0.908
Calculating metrics for Random Forests over 500 times
Done!00                            
Accuracy - Default 0.925; Custom 0.921
Precision - Default 0.940; Custom 0.944
Recall - Default 0.908; Custom 0.897
F1 - Default 0.923; Custom 0.919


{'default': {'accuracy': 0.9246919431279671,
  'precision': 0.939924420294778,
  'recall': 0.9076777251184802,
  'f1': 0.9233673803320884},
 'custom': {'accuracy': 0.9214312796208564,
  'precision': 0.9435167909745227,
  'recall': 0.8968246445497583,
  'f1': 0.9194069726035731}}

### usernameLength (drop)

In [21]:
# Effect of dropping `usernameLength`, measured with the tree-based models only.
experiment(fake, correct, column_names=['usernameLength'], mode="dt", n_iter=N_EXP)   # DecisionTree
experiment(fake, correct, column_names=['usernameLength'], mode="rf", n_iter=N_EXP)   # RandomForest
# Remaining modes are left disabled for this run:
# experiment(fake, correct, column_names=['usernameLength'], mode="lr", n_iter=N_EXP)   # LogisticRegressor
# experiment(fake, correct, column_names=['usernameLength'], mode="nb", n_iter=N_EXP)   # NaiveBayes (with lr inside)
# experiment(fake, correct, column_names=['usernameLength'], mode="dl", n_iter=N_EXP)   # NeuralNetwork

Calculating metrics for Decision Trees over 500 times
Done!00                            
Accuracy - Default 0.907; Custom 0.910
Precision - Default 0.907; Custom 0.910
Recall - Default 0.907; Custom 0.912
F1 - Default 0.907; Custom 0.911
Calculating metrics for Random Forests over 500 times
Done!00                            
Accuracy - Default 0.925; Custom 0.921
Precision - Default 0.940; Custom 0.942
Recall - Default 0.908; Custom 0.898
F1 - Default 0.923; Custom 0.919


{'default': {'accuracy': 0.9247156398104317,
  'precision': 0.9398588586530856,
  'recall': 0.9078009478672957,
  'f1': 0.9233844016380491},
 'custom': {'accuracy': 0.9210379146919462,
  'precision': 0.942017525284914,
  'recall': 0.8975829383886216,
  'f1': 0.9190857764516015}}

### usernameDigitCount (drop)

In [22]:
# Effect of dropping `usernameDigitCount`, measured with the tree-based models only.
experiment(fake, correct, column_names=['usernameDigitCount'], mode="dt", n_iter=N_EXP)   # DecisionTree
experiment(fake, correct, column_names=['usernameDigitCount'], mode="rf", n_iter=N_EXP)   # RandomForest
# Remaining modes are left disabled for this run:
# experiment(fake, correct, column_names=['usernameDigitCount'], mode="lr", n_iter=N_EXP)   # LogisticRegressor
# experiment(fake, correct, column_names=['usernameDigitCount'], mode="nb", n_iter=N_EXP)   # NaiveBayes (with lr inside)
# experiment(fake, correct, column_names=['usernameDigitCount'], mode="dl", n_iter=N_EXP)   # NeuralNetwork

Calculating metrics for Decision Trees over 500 times
Done!00                            
Accuracy - Default 0.907; Custom 0.908
Precision - Default 0.907; Custom 0.908
Recall - Default 0.907; Custom 0.909
F1 - Default 0.907; Custom 0.908
Calculating metrics for Random Forests over 500 times
Done!00                            
Accuracy - Default 0.924; Custom 0.921
Precision - Default 0.939; Custom 0.942
Recall - Default 0.907; Custom 0.898
F1 - Default 0.922; Custom 0.919


{'default': {'accuracy': 0.9238246445497681,
  'precision': 0.9389156413922014,
  'recall': 0.9068909952606594,
  'f1': 0.9224792819321779},
 'custom': {'accuracy': 0.9212369668246478,
  'precision': 0.9416549483670255,
  'recall': 0.898407582938383,
  'f1': 0.9193520469604484}}