# Instagram Fake Account Detection

## Import libraries

In [1]:
from dataset.normalizer import csv_importer_full
from dataset.utils import find_demarcator, shuffle_and_split
from sequoia_comparison.utils import get_scores

import pandas as pd
from sklearn import tree
from sklearn.linear_model import LogisticRegression

## Import data

In [2]:
default_dataset = csv_importer_full("dataset/sources/user_fake_authentic_2class.csv")
idx = find_demarcator(default_dataset)

fake = default_dataset[:idx]
correct = default_dataset[idx:]

Now loading from file dataset/sources/user_fake_authentic_2class.csv...
Loaded 65327 entries from source dataset/sources/user_fake_authentic_2class.csv


## EXPERIMENT 26 - 04

### Macros

In [3]:
N_EXP = 100 # Number of experiments
MAX_ITER = 50000 # Maximum number of iterations for LR

### Functions

Using custom experiment functions not to mess up with the real experiments.

In [4]:
'''
Drop target columns from dataset
'''
def get_custom_dataset(train_df, validation_df, column_names=[]):
    custom_train_df = train_df.drop(column_names, axis=1)
    custom_validation_df = validation_df.drop(column_names, axis=1)

    return custom_train_df, custom_validation_df

In [5]:
'''
column_names: list of columns to drop from default dataset to get custom dataset

modes:
 - "dt" => DecisionTree
 - "lr" => LogisticRegression
'''
def experiment(fake, correct, column_names=[], mode="dt", n_iter=N_EXP):
    avg_scores = {
        'default': {'precision': 0, 'accuracy': 0},
        'custom': {'precision': 0, 'accuracy': 0}
    }

    if mode == "dt":
        print(f"Calculating precision and accuracy metrics for Decision Trees over {n_iter} times")
    elif mode == "lr":
        print(f"Calculating precision and accuracy metrics for Logistic Regression (max_iter={MAX_ITER}) over {n_iter} times")
    else:
        return -1

    for i in range(n_iter):
        # Get new train_df and validation_df, same for default and custom
        train_df, validation_df = shuffle_and_split(fake, correct)
        custom_train_df, custom_validation_df = get_custom_dataset(train_df, validation_df, column_names)

        # Default mode
        if mode == "dt":
            # Get new Decision Tree
            clf = tree.DecisionTreeClassifier()
            clf = clf.fit(train_df.iloc[:, :-2], train_df.iloc[:, -1])
        elif mode == "lr":
            # Get new Logistic Regressor
            clf = LogisticRegression(random_state=0, max_iter=MAX_ITER)
            clf = clf.fit(train_df.iloc[:, :-2], train_df.iloc[:, -1])

        # Get ground truth and predictions to measure performance
        X_val, y_val = validation_df.iloc[:, :-2], validation_df.iloc[:, -1]
        y_pred = clf.predict(X_val)

        # Default scores
        scores = get_scores(y_val, y_pred)
        avg_scores['default']['precision'] += scores['precision']
        avg_scores['default']['accuracy'] += scores['accuracy']

        # Custom mode
        if mode == "dt":
            # Get new Decision Tree
            clf = tree.DecisionTreeClassifier()
            clf = clf.fit(custom_train_df.iloc[:, :-2], custom_train_df.iloc[:, -1])
        elif mode == "lr":
            # Get new Logistic Regressor
            clf = LogisticRegression(random_state=0, max_iter=2500)
            clf = clf.fit(custom_train_df.iloc[:, :-2], custom_train_df.iloc[:, -1])
        else:
            return -1

        # Get ground truth and predictions to measure performance
        X_val, y_val = custom_validation_df.iloc[:, :-2], custom_validation_df.iloc[:, -1]
        y_pred = clf.predict(X_val)

        # Custom scores
        scores = get_scores(y_val, y_pred)
        avg_scores['custom']['precision'] += scores['precision']
        avg_scores['custom']['accuracy'] += scores['accuracy']

        #print(f"{i + 1}/{n_iter}", end="\r")

    # Averaging
    for t in avg_scores.keys():
        for s in avg_scores[t].keys():
            avg_scores[t][s] /= n_iter

    print('Done!\n\n')

    print('default avg precision:', "{:.5f}".format(avg_scores['default']['precision']))
    print('default avg accuracy:', "{:.5f}".format(avg_scores['default']['accuracy']))

    print('custom avg precision:', "{:.5f}".format(avg_scores['custom']['precision']))
    print('custom avg accuracy:', "{:.5f}".format(avg_scores['custom']['accuracy']))


## Evaluate impact upon removing single-attributes

Impact (bad/good) on performance is also evaluated from 1 (very small) to 5 (very big)

In [6]:
print(pd.DataFrame.from_dict(fake).columns)

Index(['nmedia', 'flw', 'flg', 'biol', 'pic', 'url', 'cl', 'cz', 'ni', 'erl',
       'erc', 'lt', 'ahc', 'pr', 'fo', 'cs', 'avgtime', 'fake'],
      dtype='object')


### nmedia - K

In [7]:
# Experiments
experiment(fake, correct, ['nmedia'], "dt")   # DecisionTree
experiment(fake, correct, ['nmedia'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 100 times
# Done!


# default avg precision: 0.85094
# default avg accuracy: 0.85395
# custom avg precision: 0.84954
# custom avg accuracy: 0.85216
# Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
# Done!


# default avg precision: 0.81017
# default avg accuracy: 0.79699
# custom avg precision: 0.80927
# custom avg accuracy: 0.79603

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.85094
default avg accuracy: 0.85395
custom avg precision: 0.84954
custom avg accuracy: 0.85216
Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
Done!


default avg precision: 0.81017
default avg accuracy: 0.79699
custom avg precision: 0.80927
custom avg accuracy: 0.79603


CONCLUSION: removing nmedia has a bad impact on performance - KEEP IT !

### flw - K (C - LR)

In [8]:
# Experiments
experiment(fake, correct, ['flw'], "dt")   # DecisionTree
experiment(fake, correct, ['flw'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 100 times
# Done!


# default avg precision: 0.85141
# default avg accuracy: 0.85402
# custom avg precision: 0.83480
# custom avg accuracy: 0.83765
# Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
# Done!


# default avg precision: 0.80972
# default avg accuracy: 0.79626
# custom avg precision: 0.80990
# custom avg accuracy: 0.79693

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.85141
default avg accuracy: 0.85402
custom avg precision: 0.83480
custom avg accuracy: 0.83765
Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Done!


default avg precision: 0.80972
default avg accuracy: 0.79626
custom avg precision: 0.80990
custom avg accuracy: 0.79693


CONCLUSION: removing flw has a bad impact on performace on DT, but positive on LR - CONSIDER DROPPING IT !

### flg - K

In [9]:
# Experiments
experiment(fake, correct, ['flg'], "dt")   # DecisionTree
experiment(fake, correct, ['flg'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 100 times
# Done!


# default avg precision: 0.85214
# default avg accuracy: 0.85457
# custom avg precision: 0.80051
# custom avg accuracy: 0.80298
# Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
# Done!


# default avg precision: 0.80903
# default avg accuracy: 0.79640
# custom avg precision: 0.74349
# custom avg accuracy: 0.74912

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.85214
default avg accuracy: 0.85457
custom avg precision: 0.80051
custom avg accuracy: 0.80298
Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
Done!


default avg precision: 0.80903
default avg accuracy: 0.79640
custom avg precision: 0.74349
custom avg accuracy: 0.74912


CONCLUSION: removing flg has a bad impact on performance - KEEP IT !

### biol - K

In [10]:
# Experiments
experiment(fake, correct, ['biol'], "dt")   # DecisionTree
experiment(fake, correct, ['biol'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 100 times
# Done!


# default avg precision: 0.85076
# default avg accuracy: 0.85367
# custom avg precision: 0.84963
# custom avg accuracy: 0.85216
# Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
# Done!


# default avg precision: 0.80935
# default avg accuracy: 0.79646
# custom avg precision: 0.80896
# custom avg accuracy: 0.79454

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.85076
default avg accuracy: 0.85367
custom avg precision: 0.84963
custom avg accuracy: 0.85216
Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
Done!


default avg precision: 0.80935
default avg accuracy: 0.79646
custom avg precision: 0.80896
custom avg accuracy: 0.79454


CONCLUSION: removing biol has a bad impact on performance - KEEP IT !

### pic - D (C - DT)

In [11]:
# Experiments
experiment(fake, correct, ['pic'], "dt")   # DecisionTree
experiment(fake, correct, ['pic'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 100 times
# Done!


# default avg precision: 0.85159
# default avg accuracy: 0.85417
# custom avg precision: 0.85190
# custom avg accuracy: 0.85439
# Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
# Done!


# default avg precision: 0.80914
# default avg accuracy: 0.79591
# custom avg precision: 0.80716
# custom avg accuracy: 0.79518

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.85159
default avg accuracy: 0.85417
custom avg precision: 0.85190
custom avg accuracy: 0.85439
Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
Done!


default avg precision: 0.80914
default avg accuracy: 0.79591
custom avg precision: 0.80716
custom avg accuracy: 0.79518


CONCLUSION: removing pic has a positive impact on performances on DT, but bad on LR - CONSIDER DROPPING IT !

### url - K

In [12]:
# Experiments
experiment(fake, correct, ['url'], "dt")   # DecisionTree
experiment(fake, correct, ['url'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 100 times
# Done!


# default avg precision: 0.85121
# default avg accuracy: 0.85404
# custom avg precision: 0.80325
# custom avg accuracy: 0.80367
# Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
# Done!


# default avg precision: 0.80994
# default avg accuracy: 0.79702
# custom avg precision: 0.78941
# custom avg accuracy: 0.76510

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.85121
default avg accuracy: 0.85404
custom avg precision: 0.80325
custom avg accuracy: 0.80367
Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Done!


default avg precision: 0.80994
default avg accuracy: 0.79702
custom avg precision: 0.78941
custom avg accuracy: 0.76510


CONCLUSION: removing url has a bad impact on performace - KEEP IT !

### cl - K

In [13]:
# Experiments
experiment(fake, correct, ['cl'], "dt")   # DecisionTree
experiment(fake, correct, ['cl'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 100 times
# Done!


# default avg precision: 0.85147
# default avg accuracy: 0.85408
# custom avg precision: 0.85019
# custom avg accuracy: 0.85321
# Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
# Done!


# default avg precision: 0.80894
# default avg accuracy: 0.79608
# custom avg precision: 0.80742
# custom avg accuracy: 0.79552

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.85147
default avg accuracy: 0.85408
custom avg precision: 0.85019
custom avg accuracy: 0.85321
Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
Done!


default avg precision: 0.80894
default avg accuracy: 0.79608
custom avg precision: 0.80742
custom avg accuracy: 0.79552


CONCLUSION: removing cl has a BAD impact on performance - KEEP IT !

### cz - D (C - LR/BOTH?)

In [14]:
# Experiments
experiment(fake, correct, ['cz'], "dt")   # DecisionTree
experiment(fake, correct, ['cz'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 100 times
# Done!


# default avg precision: 0.85135
# default avg accuracy: 0.85380
# custom avg precision: 0.85122
# custom avg accuracy: 0.85381
# Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
# Done!


# default avg precision: 0.80933
# default avg accuracy: 0.79641
# custom avg precision: 0.81296
# custom avg accuracy: 0.79740

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.85135
default avg accuracy: 0.85380
custom avg precision: 0.85122
custom avg accuracy: 0.85381
Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Done!


default avg precision: 0.80933
default avg accuracy: 0.79641
custom avg precision: 0.81296
custom avg accuracy: 0.79740


CONCLUSION: removing cz has a bad (?) impact on performace on DT, but positive on LR - CONSIDER DROPPING IT !

### ni - K (C - LR)

In [15]:
# Experiments
experiment(fake, correct, ['ni'], "dt")   # DecisionTree
experiment(fake, correct, ['ni'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 100 times
# Done!


# default avg precision: 0.85114
# default avg accuracy: 0.85380
# custom avg precision: 0.85048
# custom avg accuracy: 0.85331
# Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
# Done!


# default avg precision: 0.80995
# default avg accuracy: 0.79680
# custom avg precision: 0.81205
# custom avg accuracy: 0.79769

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.85114
default avg accuracy: 0.85380
custom avg precision: 0.85048
custom avg accuracy: 0.85331
Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Done!


default avg precision: 0.80995
default avg accuracy: 0.79680
custom avg precision: 0.81205
custom avg accuracy: 0.79769


CONCLUSION: removing ni has a bad impact on performance on DT, but positive on LR - CONSIDER DROPPING IT !

### erl - K

In [16]:
# Experiments
experiment(fake, correct, ['erl'], "dt")   # DecisionTree
experiment(fake, correct, ['erl'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 100 times
# Done!


# default avg precision: 0.85144
# default avg accuracy: 0.85371
# custom avg precision: 0.83524
# custom avg accuracy: 0.83825
# Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
# Done!


# default avg precision: 0.80988
# default avg accuracy: 0.79642
# custom avg precision: 0.80894
# custom avg accuracy: 0.79603

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.85144
default avg accuracy: 0.85371
custom avg precision: 0.83524
custom avg accuracy: 0.83825
Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
Done!


default avg precision: 0.80988
default avg accuracy: 0.79642
custom avg precision: 0.80894
custom avg accuracy: 0.79603


CONCLUSION: removing erl has a bad impact on performace - KEEP IT !

### erc - K

In [17]:
# Experiments
experiment(fake, correct, ['erc'], "dt")   # DecisionTree
experiment(fake, correct, ['erc'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 100 times
# Done!


# default avg precision: 0.85207
# default avg accuracy: 0.85432
# custom avg precision: 0.82484
# custom avg accuracy: 0.82632
# Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
# Done!


# default avg precision: 0.80992
# default avg accuracy: 0.79728
# custom avg precision: 0.80638
# custom avg accuracy: 0.79195

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.85207
default avg accuracy: 0.85432
custom avg precision: 0.82484
custom avg accuracy: 0.82632
Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
Done!


default avg precision: 0.80992
default avg accuracy: 0.79728
custom avg precision: 0.80638
custom avg accuracy: 0.79195


CONCLUSION: removing erc has a bad impact on performace - KEEP IT !

### lt - D (C - DT/BOTH?)

In [18]:
# Experiments
experiment(fake, correct, ['lt'], "dt")   # DecisionTree
experiment(fake, correct, ['lt'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 100 times
# Done!


# default avg precision: 0.85129
# default avg accuracy: 0.85399
# custom avg precision: 0.85167
# custom avg accuracy: 0.85433
# Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
# Done!


# default avg precision: 0.80996
# default avg accuracy: 0.79657
# custom avg precision: 0.81144
# custom avg accuracy: 0.78659

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.85129
default avg accuracy: 0.85399
custom avg precision: 0.85167
custom avg accuracy: 0.85433
Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Done!


default avg precision: 0.80996
default avg accuracy: 0.79657
custom avg precision: 0.81144
custom avg accuracy: 0.78659


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


CONCLUSION: removing lt has a positive impact on performace on DT, but bad (?) on LR - CONSIDER DROPING IT !

### ahc - D (C - DT)

In [26]:
# Experiments
experiment(fake, correct, ['ahc'], "dt")   # DecisionTree
experiment(fake, correct, ['ahc'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 100 times
# Done!


# default avg precision: 0.85075
# default avg accuracy: 0.85389
# custom avg precision: 0.85124
# custom avg accuracy: 0.85410
# Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
# Done!


# default avg precision: 0.80897
# default avg accuracy: 0.79657
# custom avg precision: 0.80818
# custom avg accuracy: 0.79578

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.85075
default avg accuracy: 0.85389
custom avg precision: 0.85124
custom avg accuracy: 0.85410
Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
Done!


default avg precision: 0.80897
default avg accuracy: 0.79657
custom avg precision: 0.80818
custom avg accuracy: 0.79578


CONCLUSION: removing ahc has a positive impact on performace on DT, but bad on LR - CONSIDER DROPPING IT !

### pr - K

In [20]:
# Experiments
experiment(fake, correct, ['pr'], "dt")   # DecisionTree
experiment(fake, correct, ['pr'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 100 times
# Done!


# default avg precision: 0.85152
# default avg accuracy: 0.85438
# custom avg precision: 0.84928
# custom avg accuracy: 0.85233
# Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
# Done!


# default avg precision: 0.81019
# default avg accuracy: 0.79716
# custom avg precision: 0.80676
# custom avg accuracy: 0.79327

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.85152
default avg accuracy: 0.85438
custom avg precision: 0.84928
custom avg accuracy: 0.85233
Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
Done!


default avg precision: 0.81019
default avg accuracy: 0.79716
custom avg precision: 0.80676
custom avg accuracy: 0.79327


CONCLUSION: removing pr has a bad impact on performace - KEEP IT !

### fo - K

In [21]:
# Experiments
experiment(fake, correct, ['fo'], "dt")   # DecisionTree
experiment(fake, correct, ['fo'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 100 times
# Done!


# default avg precision: 0.85148
# default avg accuracy: 0.85402
# custom avg precision: 0.85037
# custom avg accuracy: 0.85301
# Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
# Done!


# default avg precision: 0.80995
# default avg accuracy: 0.79711
# custom avg precision: 0.80904
# custom avg accuracy: 0.79569

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.85148
default avg accuracy: 0.85402
custom avg precision: 0.85037
custom avg accuracy: 0.85301
Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
Done!


default avg precision: 0.80995
default avg accuracy: 0.79711
custom avg precision: 0.80904
custom avg accuracy: 0.79569


CONCLUSION: removing fo has a bad impact on performace - KEEP IT !

### cs - K

In [22]:
# Experiments
experiment(fake, correct, ['cs'], "dt")   # DecisionTree
experiment(fake, correct, ['cs'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 100 times
# Done!


# default avg precision: 0.85115
# default avg accuracy: 0.85405
# custom avg precision: 0.85046
# custom avg accuracy: 0.85358
# Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
# Done!


# default avg precision: 0.80959
# default avg accuracy: 0.79664
# custom avg precision: 0.78385
# custom avg accuracy: 0.78669

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.85115
default avg accuracy: 0.85405
custom avg precision: 0.85046
custom avg accuracy: 0.85358
Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Done!


default avg precision: 0.80959
default avg accuracy: 0.79664
custom avg precision: 0.78385
custom avg accuracy: 0.78669


CONCLUSION: removing cs has a small bad impact on performace - KEEP IT !

### avgtime - K

In [27]:
# Experiments
experiment(fake, correct, ['avgtime'], "dt")   # DecisionTree
experiment(fake, correct, ['avgtime'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 100 times
# Done!


# default avg precision: 0.85115
# default avg accuracy: 0.85387
# custom avg precision: 0.85046
# custom avg accuracy: 0.85337
# Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
# Done!


# default avg precision: 0.80965
# default avg accuracy: 0.79644
# custom avg precision: 0.78246
# custom avg accuracy: 0.78587

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.85115
default avg accuracy: 0.85387
custom avg precision: 0.85046
custom avg accuracy: 0.85337
Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
Done!


default avg precision: 0.80965
default avg accuracy: 0.79644
custom avg precision: 0.78246
custom avg accuracy: 0.78587


CONCLUSION: removing flw has a positive impact on performace - KEEP IT !

## Second experiment with custom features

In this experiment we fit (several) Decision Tree Classifier(s) (and Linear Regressors) removing from dataframes the attributes which seemed to worsen performance during the experiments.

In [28]:
# Experiments
experiment(fake, correct, ["flw", "cz", "pic", "ni", "lt", "ahc"], "dt")   # DecisionTree
experiment(fake, correct, ["flw", "pic", "cz", "ni", "lt", "ahc"], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 50 times
# Done!


# default avg precision: 0.85170
# default avg accuracy: 0.85395
# custom avg precision: 0.83050
# custom avg accuracy: 0.83394
# Calculating precision and accuracy metrics for Logistic Regression over 50 times
# Done!


# default avg precision: 0.80960
# default avg accuracy: 0.79711
# custom avg precision: 0.78490
# custom avg accuracy: 0.75939

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.85160
default avg accuracy: 0.85414
custom avg precision: 0.83241
custom avg accuracy: 0.83532
Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
Done!


default avg precision: 0.81018
default avg accuracy: 0.79672
custom avg precision: 0.81582
custom avg accuracy: 0.78348


CONLUSION: IT FAILED !

In this experiment we fit (several) Decision Tree Classifier(s) (and Linear Regressors) removing from dataframes the attributes which seemed to worsen performance (of Decision Trees only ! ) during the experiments.

(HT: LR depends a lot more on MAX_ITER)

In [29]:
# Experiments
experiment(fake, correct, ["pic", "cz", "lt", "ahc"], "dt")   # DecisionTree
experiment(fake, correct, ["pic", "cz", "lt", "ahc"], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 50 times
# Done!


# default avg precision: 0.85070
# default avg accuracy: 0.85364
# custom avg precision: 0.85122
# custom avg accuracy: 0.85394
# Calculating precision and accuracy metrics for Logistic Regression over 50 times
# Done!


# default avg precision: 0.80923
# default avg accuracy: 0.79600
# custom avg precision: 0.77695
# custom avg accuracy: 0.76550

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.85181
default avg accuracy: 0.85407
custom avg precision: 0.85078
custom avg accuracy: 0.85347
Calculating precision and accuracy metrics for Logistic Regression (max_iter=25000) over 100 times
Done!


default avg precision: 0.80964
default avg accuracy: 0.79689
custom avg precision: 0.80868
custom avg accuracy: 0.78385


CONCLUSIONS: removing ni, lt, ahc and avgtime improved DT but worsened LR.