# Instagram Fake Account Detection

## Import libraries

In [1]:
from dataset.normalizer import csv_importer_full
from dataset.utils import find_demarcator, shuffle_and_split
from sequoia_comparison.utils import get_scores

import pandas as pd
from sklearn import tree
from sklearn.linear_model import LogisticRegression

## Import data

In [2]:
default_dataset = csv_importer_full("dataset/sources/user_fake_authentic_2class.csv")
idx = find_demarcator(default_dataset)

fake = default_dataset[:idx]
correct = default_dataset[idx:]

Now loading from file dataset/sources/user_fake_authentic_2class.csv...
Loaded 65327 entries from source dataset/sources/user_fake_authentic_2class.csv


## EXPERIMENT 26 - 04

### Macros

In [3]:
N_EXP = 50 # Number of experiments
MAX_ITER = 25000 # Maximum number of iterations for LR

### Functions

Using custom experiment functions not to mess up with the real experiments.

In [4]:
'''
Drop target columns from dataset
'''
def get_custom_dataset(train_df, validation_df, column_names=[]):
    custom_train_df = train_df.drop(column_names, axis=1)
    custom_validation_df = validation_df.drop(column_names, axis=1)

    return custom_train_df, custom_validation_df

In [5]:
'''
column_names: list of columns to drop from default dataset to get custom dataset

modes:
 - "dt" => DecisionTree
 - "lr" => LogisticRegression
'''
def experiment(fake, correct, column_names=[], mode="dt", n_iter=N_EXP):
    avg_scores = {
        'default': {'precision': 0, 'accuracy': 0},
        'custom': {'precision': 0, 'accuracy': 0}
    }

    if mode == "dt":
        print(f"Calculating precision and accuracy metrics for Decision Trees over {n_iter} times")
    elif mode == "lr":
        print(f"Calculating precision and accuracy metrics for Logistic Regression (max_iter={MAX_ITER}) over {n_iter} times")
    else:
        return -1

    for i in range(n_iter):
        # Get new train_df and validation_df, same for default and custom
        train_df, validation_df = shuffle_and_split(fake, correct)
        custom_train_df, custom_validation_df = get_custom_dataset(train_df, validation_df, column_names)

        # Default mode
        if mode == "dt":
            # Get new Decision Tree
            clf = tree.DecisionTreeClassifier()
            clf = clf.fit(train_df.iloc[:, :-2], train_df.iloc[:, -1])
        elif mode == "lr":
            # Get new Logistic Regressor
            clf = LogisticRegression(random_state=0, max_iter=MAX_ITER)
            clf = clf.fit(train_df.iloc[:, :-2], train_df.iloc[:, -1])

        # Get ground truth and predictions to measure performance
        X_val, y_val = validation_df.iloc[:, :-2], validation_df.iloc[:, -1]
        y_pred = clf.predict(X_val)

        # Default scores
        scores = get_scores(y_val, y_pred)
        avg_scores['default']['precision'] += scores['precision']
        avg_scores['default']['accuracy'] += scores['accuracy']

        # Custom mode
        if mode == "dt":
            # Get new Decision Tree
            clf = tree.DecisionTreeClassifier()
            clf = clf.fit(custom_train_df.iloc[:, :-2], custom_train_df.iloc[:, -1])
        elif mode == "lr":
            # Get new Logistic Regressor
            clf = LogisticRegression(random_state=0, max_iter=2500)
            clf = clf.fit(custom_train_df.iloc[:, :-2], custom_train_df.iloc[:, -1])
        else:
            return -1

        # Get ground truth and predictions to measure performance
        X_val, y_val = custom_validation_df.iloc[:, :-2], custom_validation_df.iloc[:, -1]
        y_pred = clf.predict(X_val)

        # Custom scores
        scores = get_scores(y_val, y_pred)
        avg_scores['custom']['precision'] += scores['precision']
        avg_scores['custom']['accuracy'] += scores['accuracy']

        #print(f"{i + 1}/{n_iter}", end="\r")

    # Averaging
    for t in avg_scores.keys():
        for s in avg_scores[t].keys():
            avg_scores[t][s] /= n_iter

    print('Done!\n\n')

    print('default avg precision:', "{:.5f}".format(avg_scores['default']['precision']))
    print('default avg accuracy:', "{:.5f}".format(avg_scores['default']['accuracy']))

    print('custom avg precision:', "{:.5f}".format(avg_scores['custom']['precision']))
    print('custom avg accuracy:', "{:.5f}".format(avg_scores['custom']['accuracy']))


## Evaluate impact upon removing single-attributes

Impact (bad/good) on performance is also evaluated from 1 (very small) to 5 (very big)

In [6]:
print(pd.DataFrame.from_dict(fake).columns)

Index(['nmedia', 'flw', 'flg', 'biol', 'pic', 'url', 'cl', 'cz', 'ni', 'erl',
       'erc', 'lt', 'ahc', 'pr', 'fo', 'cs', 'avgtime', 'fake'],
      dtype='object')


### nmedia - K

In [7]:
# Experiments
experiment(fake, correct, ['nmedia'], "dt")   # DecisionTree
experiment(fake, correct, ['nmedia'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 50 times
# Done!


# default avg precision: 0.85107
# default avg accuracy: 0.85409
# custom avg precision: 0.85007
# custom avg accuracy: 0.85256
# Calculating precision and accuracy metrics for Logistic Regression over 50 times
# Done!


# default avg precision: 0.80997
# default avg accuracy: 0.79701
# custom avg precision: 0.80917
# custom avg accuracy: 0.79575

Calculating precision and accuracy metrics for Decision Trees over 50 times
Done!


default avg precision: 0.85107
default avg accuracy: 0.85409
custom avg precision: 0.85007
custom avg accuracy: 0.85256
Calculating precision and accuracy metrics for Logistic Regression over 50 times
Done!


default avg precision: 0.80997
default avg accuracy: 0.79701
custom avg precision: 0.80917
custom avg accuracy: 0.79575


CONCLUSION: removing nmedia has a bad (1) impact on performance - KEEP IT !

### flw - K (C)

In [8]:
# Experiments
experiment(fake, correct, ['flw'], "dt")   # DecisionTree
experiment(fake, correct, ['flw'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 50 times
# Done!


# default avg precision: 0.85124
# default avg accuracy: 0.85425
# custom avg precision: 0.83437
# custom avg accuracy: 0.83749
# Calculating precision and accuracy metrics for Logistic Regression over 50 times
# Done!


# default avg precision: 0.80913
# default avg accuracy: 0.79664
# custom avg precision: 0.81189
# custom avg accuracy: 0.79758

Calculating precision and accuracy metrics for Decision Trees over 50 times
Done!


default avg precision: 0.85124
default avg accuracy: 0.85425
custom avg precision: 0.83437
custom avg accuracy: 0.83749
Calculating precision and accuracy metrics for Logistic Regression over 50 times
Done!


default avg precision: 0.80913
default avg accuracy: 0.79664
custom avg precision: 0.81189
custom avg accuracy: 0.79758


CONCLUSION: removing flw has a bad (3) impact on performace (on DT, but positive (1) on LR) - CONSIDER DROPPING IT !

### flg - K

In [9]:
# Experiments
experiment(fake, correct, ['flg'], "dt")   # DecisionTree
experiment(fake, correct, ['flg'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 50 times
# Done!


# default avg precision: 0.85122
# default avg accuracy: 0.85372
# custom avg precision: 0.80139
# custom avg accuracy: 0.80334
# Calculating precision and accuracy metrics for Logistic Regression over 50 times
# Done!


# default avg precision: 0.80990
# default avg accuracy: 0.79647
# custom avg precision: 0.74312
# custom avg accuracy: 0.74923

Calculating precision and accuracy metrics for Decision Trees over 50 times
Done!


default avg precision: 0.85122
default avg accuracy: 0.85372
custom avg precision: 0.80139
custom avg accuracy: 0.80334
Calculating precision and accuracy metrics for Logistic Regression over 50 times
Done!


default avg precision: 0.80990
default avg accuracy: 0.79647
custom avg precision: 0.74312
custom avg accuracy: 0.74923


CONCLUSION: removing flg has a bad (5) impact on performance - KEEP IT !

### biol - K

In [10]:
# Experiments
experiment(fake, correct, ['biol'], "dt")   # DecisionTree
experiment(fake, correct, ['biol'], "lr")   # LogisticRegression

# LOG
# default avg precision: 0.85121
# default avg accuracy: 0.85384
# custom avg precision: 0.84994
# custom avg accuracy: 0.85216
# Calculating precision and accuracy metrics for Logistic Regression over 50 times
# Done!


# default avg precision: 0.81020
# default avg accuracy: 0.79692
# custom avg precision: 0.80951
# custom avg accuracy: 0.79434

Calculating precision and accuracy metrics for Decision Trees over 50 times
Done!


default avg precision: 0.85121
default avg accuracy: 0.85384
custom avg precision: 0.84994
custom avg accuracy: 0.85216
Calculating precision and accuracy metrics for Logistic Regression over 50 times


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Done!


default avg precision: 0.81020
default avg accuracy: 0.79692
custom avg precision: 0.80951
custom avg accuracy: 0.79434


CONCLUSION: removing biol has a bad (2) impact on performance - KEEP IT !

### pic - K

In [11]:
# Experiments
experiment(fake, correct, ['pic'], "dt")   # DecisionTree
experiment(fake, correct, ['pic'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 50 times
# Done!


# default avg precision: 0.85187
# default avg accuracy: 0.85412
# custom avg precision: 0.85146
# custom avg accuracy: 0.85373
# Calculating precision and accuracy metrics for Logistic Regression over 50 times
# Done!


# default avg precision: 0.80933
# default avg accuracy: 0.79695
# custom avg precision: 0.80736
# custom avg accuracy: 0.79608

Calculating precision and accuracy metrics for Decision Trees over 50 times
Done!


default avg precision: 0.85187
default avg accuracy: 0.85412
custom avg precision: 0.85146
custom avg accuracy: 0.85373
Calculating precision and accuracy metrics for Logistic Regression over 50 times
Done!


default avg precision: 0.80933
default avg accuracy: 0.79695
custom avg precision: 0.80736
custom avg accuracy: 0.79608


CONCLUSION: removing pic has a bad (1) impact on performances - KEEP IT !

### url - K

In [12]:
# Experiments
experiment(fake, correct, ['url'], "dt")   # DecisionTree
experiment(fake, correct, ['url'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 50 times
# Done!


# default avg precision: 0.85193
# default avg accuracy: 0.85450
# custom avg precision: 0.80305
# custom avg accuracy: 0.80322
# Calculating precision and accuracy metrics for Logistic Regression over 50 times
# Done!


# default avg precision: 0.80950
# default avg accuracy: 0.79689
# custom avg precision: 0.78846
# custom avg accuracy: 0.76446

Calculating precision and accuracy metrics for Decision Trees over 50 times
Done!


default avg precision: 0.85193
default avg accuracy: 0.85450
custom avg precision: 0.80305
custom avg accuracy: 0.80322
Calculating precision and accuracy metrics for Logistic Regression over 50 times


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Done!


default avg precision: 0.80950
default avg accuracy: 0.79689
custom avg precision: 0.78846
custom avg accuracy: 0.76446


CONCLUSION: removing url has a bad (5) impact on performace - KEEP IT !

### cl - K

In [13]:
# Experiments
experiment(fake, correct, ['cl'], "dt")   # DecisionTree
experiment(fake, correct, ['cl'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 50 times
# Done!


# default avg precision: 0.85068
# default avg accuracy: 0.85394
# custom avg precision: 0.84934
# custom avg accuracy: 0.85289
# Calculating precision and accuracy metrics for Logistic Regression over 50 times
# Done!


# default avg precision: 0.81023
# default avg accuracy: 0.79724
# custom avg precision: 0.80808
# custom avg accuracy: 0.79600

Calculating precision and accuracy metrics for Decision Trees over 50 times
Done!


default avg precision: 0.85068
default avg accuracy: 0.85394
custom avg precision: 0.84934
custom avg accuracy: 0.85289
Calculating precision and accuracy metrics for Logistic Regression over 50 times
Done!


default avg precision: 0.81023
default avg accuracy: 0.79724
custom avg precision: 0.80808
custom avg accuracy: 0.79600


CONCLUSION: removing cl has a BAD (2) impact on performance - KEEP IT !

### cz - K (C)

In [25]:
# Experiments
experiment(fake, correct, ['cz'], "dt")   # DecisionTree
experiment(fake, correct, ['cz'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 50 times
# Done!


# default avg precision: 0.85197
# default avg accuracy: 0.85413
# custom avg precision: 0.85172
# custom avg accuracy: 0.85401
# Calculating precision and accuracy metrics for Logistic Regression over 50 times
# Done!


# default avg precision: 0.81044
# default avg accuracy: 0.79701
# custom avg precision: 0.81327
# custom avg accuracy: 0.79789

Calculating precision and accuracy metrics for Decision Trees over 50 times
Done!


default avg precision: 0.85197
default avg accuracy: 0.85413
custom avg precision: 0.85172
custom avg accuracy: 0.85401
Calculating precision and accuracy metrics for Logistic Regression over 50 times


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Done!


default avg precision: 0.81044
default avg accuracy: 0.79701
custom avg precision: 0.81327
custom avg accuracy: 0.79789


CONCLUSION: removing cz has a bad (1) impact on performace (on DT, but positive (1) on LR) - CONSIDER DROPPING IT !

### ni - D (C)

In [15]:
# Experiments
experiment(fake, correct, ['ni'], "dt")   # DecisionTree
experiment(fake, correct, ['ni'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 50 times
# Done!


# default avg precision: 0.85151
# default avg accuracy: 0.85431
# custom avg precision: 0.85176
# custom avg accuracy: 0.85452
# Calculating precision and accuracy metrics for Logistic Regression over 50 times
# Done!


# default avg precision: 0.80882
# default avg accuracy: 0.79635
# custom avg precision: 0.81073
# custom avg accuracy: 0.79713

Calculating precision and accuracy metrics for Decision Trees over 50 times
Done!


default avg precision: 0.85151
default avg accuracy: 0.85431
custom avg precision: 0.85176
custom avg accuracy: 0.85452
Calculating precision and accuracy metrics for Logistic Regression over 50 times
Done!


default avg precision: 0.80882
default avg accuracy: 0.79635
custom avg precision: 0.81073
custom avg accuracy: 0.79713


CONCLUSION: removing ni has a positive (1) impact on performance - CONSIDER DROPPING IT !

### erl - K

In [16]:
# Experiments
experiment(fake, correct, ['erl'], "dt")   # DecisionTree
experiment(fake, correct, ['erl'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 50 times
# Done!


# default avg precision: 0.85094
# default avg accuracy: 0.85421
# custom avg precision: 0.83422
# custom avg accuracy: 0.83776
# Calculating precision and accuracy metrics for Logistic Regression over 50 times
# Done!


# default avg precision: 0.80922
# default avg accuracy: 0.79638
# custom avg precision: 0.80857
# custom avg accuracy: 0.79596

Calculating precision and accuracy metrics for Decision Trees over 50 times
Done!


default avg precision: 0.85094
default avg accuracy: 0.85421
custom avg precision: 0.83422
custom avg accuracy: 0.83776
Calculating precision and accuracy metrics for Logistic Regression over 50 times
Done!


default avg precision: 0.80922
default avg accuracy: 0.79638
custom avg precision: 0.80857
custom avg accuracy: 0.79596


CONCLUSION: removing erl has a bad (3) impact on performace - KEEP IT !

### erc - K

In [17]:
# Experiments
experiment(fake, correct, ['erc'], "dt")   # DecisionTree
experiment(fake, correct, ['erc'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 50 times
# Done!


# default avg precision: 0.85152
# default avg accuracy: 0.85438
# custom avg precision: 0.82527
# custom avg accuracy: 0.82676
# Calculating precision and accuracy metrics for Logistic Regression over 50 times
# Done!


# default avg precision: 0.80965
# default avg accuracy: 0.79613
# custom avg precision: 0.80518
# custom avg accuracy: 0.79008

Calculating precision and accuracy metrics for Decision Trees over 50 times
Done!


default avg precision: 0.85152
default avg accuracy: 0.85438
custom avg precision: 0.82527
custom avg accuracy: 0.82676
Calculating precision and accuracy metrics for Logistic Regression over 50 times
Done!


default avg precision: 0.80965
default avg accuracy: 0.79613
custom avg precision: 0.80518
custom avg accuracy: 0.79008


CONCLUSION: removing erc has a bad (3) impact on performace - KEEP IT !

### lt - D (C)

In [18]:
# Experiments
experiment(fake, correct, ['lt'], "dt")   # DecisionTree
experiment(fake, correct, ['lt'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 50 times
# Done!


# default avg precision: 0.85046
# default avg accuracy: 0.85334
# custom avg precision: 0.85062
# custom avg accuracy: 0.85359
# Calculating precision and accuracy metrics for Logistic Regression over 50 times
# Done!


# default avg precision: 0.80949
# default avg accuracy: 0.79639
# custom avg precision: 0.81202
# custom avg accuracy: 0.78764

Calculating precision and accuracy metrics for Decision Trees over 50 times
Done!


default avg precision: 0.85046
default avg accuracy: 0.85334
custom avg precision: 0.85062
custom avg accuracy: 0.85359
Calculating precision and accuracy metrics for Logistic Regression over 50 times


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Done!


default avg precision: 0.80949
default avg accuracy: 0.79639
custom avg precision: 0.81202
custom avg accuracy: 0.78764


CONCLUSION: removing lt has a positive impact on performace (on DT, but bad (1) on LR) - CONSIDER DROPING IT !

### ahc - D (C)

In [26]:
# Experiments
experiment(fake, correct, ['ahc'], "dt")   # DecisionTree
experiment(fake, correct, ['ahc'], "lr")   # LogisticRegression

#LOG
# Calculating precision and accuracy metrics for Decision Trees over 50 times
# Done!


# default avg precision: 0.85112
# default avg accuracy: 0.85402
# custom avg precision: 0.85137
# custom avg accuracy: 0.85406
# Calculating precision and accuracy metrics for Logistic Regression over 50 times
# Done!


# default avg precision: 0.81024
# default avg accuracy: 0.79684
# custom avg precision: 0.80922
# custom avg accuracy: 0.79540

Calculating precision and accuracy metrics for Decision Trees over 50 times
Done!


default avg precision: 0.85112
default avg accuracy: 0.85402
custom avg precision: 0.85137
custom avg accuracy: 0.85406
Calculating precision and accuracy metrics for Logistic Regression over 50 times
Done!


default avg precision: 0.81024
default avg accuracy: 0.79684
custom avg precision: 0.80922
custom avg accuracy: 0.79540


CONCLUSION: removing ahc has a positive (1) impact on performace (on DT, but bad (1) on LR) - CONSIDER DROPPING IT !

### pr - K

In [20]:
# Experiments
experiment(fake, correct, ['pr'], "dt")   # DecisionTree
experiment(fake, correct, ['pr'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 50 times
# Done!


# default avg precision: 0.85159
# default avg accuracy: 0.85431
# custom avg precision: 0.84982
# custom avg accuracy: 0.85242
# Calculating precision and accuracy metrics for Logistic Regression over 50 times
# Done!


# default avg precision: 0.81059
# default avg accuracy: 0.79741
# custom avg precision: 0.80748
# custom avg accuracy: 0.79406

Calculating precision and accuracy metrics for Decision Trees over 50 times
Done!


default avg precision: 0.85159
default avg accuracy: 0.85431
custom avg precision: 0.84982
custom avg accuracy: 0.85242
Calculating precision and accuracy metrics for Logistic Regression over 50 times
Done!


default avg precision: 0.81059
default avg accuracy: 0.79741
custom avg precision: 0.80748
custom avg accuracy: 0.79406


CONCLUSION: removing pr has a bad (1) impact on performace - KEEP IT !

### fo - K

In [21]:
# Experiments
experiment(fake, correct, ['fo'], "dt")   # DecisionTree
experiment(fake, correct, ['fo'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 20 times
# Done!


# default avg precision: 0.851
# default avg accuracy: 0.854
# custom avg precision: 0.850
# custom avg accuracy: 0.853
# Calculating precision and accuracy metrics for Logistic Regression over 20 times
# Done!


# default avg precision: 0.809
# default avg accuracy: 0.797
# custom avg precision: 0.808
# custom avg accuracy: 0.795

Calculating precision and accuracy metrics for Decision Trees over 50 times
Done!


default avg precision: 0.85093
default avg accuracy: 0.85436
custom avg precision: 0.84995
custom avg accuracy: 0.85327
Calculating precision and accuracy metrics for Logistic Regression over 50 times
Done!


default avg precision: 0.81069
default avg accuracy: 0.79696
custom avg precision: 0.80898
custom avg accuracy: 0.79585


CONCLUSION: removing fo has a bad (1) impact on performace - KEEP IT !

### cs - K

In [22]:
# Experiments
experiment(fake, correct, ['cs'], "dt")   # DecisionTree
experiment(fake, correct, ['cs'], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 50 times
# Done!


# default avg precision: 0.85129
# default avg accuracy: 0.85387
# custom avg precision: 0.85058
# custom avg accuracy: 0.85319
# Calculating precision and accuracy metrics for Logistic Regression over 50 times
# Done!


# default avg precision: 0.81070
# default avg accuracy: 0.79618
# custom avg precision: 0.78299
# custom avg accuracy: 0.78704

Calculating precision and accuracy metrics for Decision Trees over 50 times
Done!


default avg precision: 0.85129
default avg accuracy: 0.85387
custom avg precision: 0.85058
custom avg accuracy: 0.85319
Calculating precision and accuracy metrics for Logistic Regression over 50 times


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Done!


default avg precision: 0.81070
default avg accuracy: 0.79618
custom avg precision: 0.78299
custom avg accuracy: 0.78704


CONCLUSION: removing cs has a small bad (2) impact on performace - KEEP IT !

### avgtime - D (C)

In [23]:
# Experiments
experiment(fake, correct, ['avgtime'], "dt")   # DecisionTree
experiment(fake, correct, ['avgtime'], "lr")   # LogisticRegression

Calculating precision and accuracy metrics for Decision Trees over 50 times
Done!


default avg precision: 0.85033
default avg accuracy: 0.85358
custom avg precision: 0.85069
custom avg accuracy: 0.85380
Calculating precision and accuracy metrics for Logistic Regression over 50 times


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Done!


default avg precision: 0.80966
default avg accuracy: 0.79600
custom avg precision: 0.78179
custom avg accuracy: 0.78733


CONCLUSION: removing flw has a positive (1) impact on performace (on DT, but bad on LR) - CONSIDER DROPPING IT !

## Second experiment with custom features

In this experiment we fit (several) Decision Tree Classifier(s) (and Linear Regressors) removing from dataframes the attributes which seemed to worsen performance during the experiments.

In [27]:
# Experiments
experiment(fake, correct, ["flw", "cz", "ni", "lt", "ahc", "avgtime"], "dt")   # DecisionTree
experiment(fake, correct, ["flw", "cz", "ni", "lt", "ahc", "avgtime"], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 50 times
# Done!


# default avg precision: 0.85170
# default avg accuracy: 0.85395
# custom avg precision: 0.83050
# custom avg accuracy: 0.83394
# Calculating precision and accuracy metrics for Logistic Regression over 50 times
# Done!


# default avg precision: 0.80960
# default avg accuracy: 0.79711
# custom avg precision: 0.78490
# custom avg accuracy: 0.75939

Calculating precision and accuracy metrics for Decision Trees over 50 times
Done!


default avg precision: 0.85170
default avg accuracy: 0.85395
custom avg precision: 0.83050
custom avg accuracy: 0.83394
Calculating precision and accuracy metrics for Logistic Regression over 50 times
Done!


default avg precision: 0.80960
default avg accuracy: 0.79711
custom avg precision: 0.78490
custom avg accuracy: 0.75939


CONLUSION: IT FAILED !

In this experiment we fit (several) Decision Tree Classifier(s) (and Linear Regressors) removing from dataframes the attributes which seemed to worsen performance (of Decision Trees only ! ) during the experiments.

(HT: LR depends a lot more on MAX_ITER)

In [28]:
# Experiments
experiment(fake, correct, ["ni", "lt", "ahc", "avgtime"], "dt")   # DecisionTree
experiment(fake, correct, ["ni", "lt", "ahc", "avgtime"], "lr")   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 50 times
# Done!


# default avg precision: 0.85070
# default avg accuracy: 0.85364
# custom avg precision: 0.85122
# custom avg accuracy: 0.85394
# Calculating precision and accuracy metrics for Logistic Regression over 50 times
# Done!


# default avg precision: 0.80923
# default avg accuracy: 0.79600
# custom avg precision: 0.77695
# custom avg accuracy: 0.76550

Calculating precision and accuracy metrics for Decision Trees over 50 times
Done!


default avg precision: 0.85070
default avg accuracy: 0.85364
custom avg precision: 0.85122
custom avg accuracy: 0.85394
Calculating precision and accuracy metrics for Logistic Regression over 50 times
Done!


default avg precision: 0.80923
default avg accuracy: 0.79600
custom avg precision: 0.77695
custom avg accuracy: 0.76550


CONCLUSIONS: removing ni, lt, ahc and avgtime improved DT but worsened LR.