# Instagram Fake Account Detection

## Import libraries

In [1]:
from dataset.normalizer import csv_importer_full
from dataset.utils import find_demarcator, shuffle_and_split
from sequoia_comparison.utils import get_scores

import pandas as pd
from sklearn import tree, metrics
from sklearn.linear_model import LogisticRegression

## Import data

In [2]:
default_dataset = csv_importer_full("dataset/sources/user_fake_authentic_2class.csv")
idx = find_demarcator(default_dataset)

fake = default_dataset[:idx]
correct = default_dataset[idx:]

Now loading from file dataset/sources/user_fake_authentic_2class.csv...
Loaded 65327 entries from source dataset/sources/user_fake_authentic_2class.csv


## EXPERIMENT 26 - 04

Using custom experiment functions not to mess up with the real experiments.

In [3]:
'''
Drop target columns from dataset
'''
def get_custom_dataset(train_df, validation_df, column_names=[]):
    custom_train_df = train_df.drop(column_names, axis=1)
    custom_validation_df = validation_df.drop(column_names, axis=1)

    return custom_train_df, custom_validation_df

In [25]:
'''
column_names: list of columns to drop from default dataset to get custom dataset

modes:
 - "dt" => DecisionTree
 - "lr" => LogisticRegression
'''
def experiment(fake, correct, column_names=[], mode="dt", n_iter=20):
    avg_scores = {
        'default': {'precision': 0, 'accuracy': 0},
        'custom': {'precision': 0, 'accuracy': 0}
    }

    if mode == "dt":
        print(f"Calculating precision and accuracy metrics for Decision Trees over {n_iter} times")
    elif mode == "lr":
        print(f"Calculating precision and accuracy metrics for Logistic Regression over {n_iter} times")
    else:
        return -1

    for i in range(n_iter):
        # Get new train_df and validation_df, same for default and custom
        train_df, validation_df = shuffle_and_split(fake, correct)
        custom_train_df, custom_validation_df = get_custom_dataset(train_df, validation_df, column_names)

        # Default mode
        if mode == "dt":
            # Get new Decision Tree
            clf = tree.DecisionTreeClassifier()
            clf = clf.fit(train_df.iloc[:, :-2], train_df.iloc[:, -1])
        elif mode == "lr":
            # Get new Logistic Regressor
            clf = LogisticRegression(random_state=0, max_iter=10000)
            clf = clf.fit(train_df.iloc[:, :-2], train_df.iloc[:, -1])

        # Get ground truth and predictions to measure performance
        X_val, y_val = validation_df.iloc[:, :-2], validation_df.iloc[:, -1]
        y_pred = clf.predict(X_val)

        # Default scores
        scores = get_scores(y_val, y_pred)
        avg_scores['default']['precision'] += scores['precision']
        avg_scores['default']['accuracy'] += scores['accuracy']

        # Custom mode
        if mode == "dt":
            # Get new Decision Tree
            clf = tree.DecisionTreeClassifier()
            clf = clf.fit(custom_train_df.iloc[:, :-2], custom_train_df.iloc[:, -1])
        elif mode == "lr":
            # Get new Logistic Regressor
            clf = LogisticRegression(random_state=0, max_iter=2500)
            clf = clf.fit(custom_train_df.iloc[:, :-2], custom_train_df.iloc[:, -1])
        else:
            return -1

        # Get ground truth and predictions to measure performance
        X_val, y_val = custom_validation_df.iloc[:, :-2], custom_validation_df.iloc[:, -1]
        y_pred = clf.predict(X_val)

        # Custom scores
        scores = get_scores(y_val, y_pred)
        avg_scores['custom']['precision'] += scores['precision']
        avg_scores['custom']['accuracy'] += scores['accuracy']

        print(f"{i + 1}/{n_iter}", end="\r")

    # Averaging
    for t in avg_scores.keys():
        for s in avg_scores[t].keys():
            avg_scores[t][s] /= n_iter

    print('Done!\n\n')

    print('default avg precision:', "{:.3f}".format(avg_scores['default']['precision']))
    print('default avg accuracy:', "{:.3f}".format(avg_scores['default']['accuracy']))

    print('custom avg precision:', "{:.3f}".format(avg_scores['custom']['precision']))
    print('custom avg accuracy:', "{:.3f}".format(avg_scores['custom']['accuracy']))


## Evaluate impact upon removing single-attributes

Impact (bad/good) on performance is also evaluated from 1 (very small) to 5 (very big)

In [5]:
print(pd.DataFrame.from_dict(fake).columns)

Index(['nmedia', 'flw', 'flg', 'biol', 'pic', 'url', 'cl', 'cz', 'ni', 'erl',
       'erc', 'lt', 'ahc', 'pr', 'fo', 'cs', 'avgtime', 'fake'],
      dtype='object')


### nmedia - K

In [6]:
# Experiments
experiment(fake, correct, ['nmedia'], "dt", 20)   # DecisionTree
experiment(fake, correct, ['nmedia'], "lr", 20)   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 20 times
# Done!


# default avg precision: 0.851
# default avg accuracy: 0.854
# custom avg precision: 0.850
# custom avg accuracy: 0.853
# Calculating precision and accuracy metrics for Logistic Regression over 20 times
# Done!


# default avg precision: 0.811
# default avg accuracy: 0.797
# custom avg precision: 0.811
# custom avg accuracy: 0.797

Calculating precision and accuracy metrics for Decision Trees over 20 times
Done!


default avg precision: 0.851
default avg accuracy: 0.854
custom avg precision: 0.850
custom avg accuracy: 0.853
Calculating precision and accuracy metrics for Logistic Regression over 20 times
Done!


default avg precision: 0.811
default avg accuracy: 0.797
custom avg precision: 0.811
custom avg accuracy: 0.797


CONCLUSION: removing nmedia has a bad (1) impact on performance - KEEP IT !

### flw - K

In [26]:
# Experiments
experiment(fake, correct, ['flw'], "dt", 20)   # DecisionTree
experiment(fake, correct, ['flw'], "lr", 20)   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 20 times
# Done!


# default avg precision: 0.850
# default avg accuracy: 0.853
# custom avg precision: 0.834
# custom avg accuracy: 0.837
# Calculating precision and accuracy metrics for Logistic Regression over 20 times
# Done!


# default avg precision: 0.809
# default avg accuracy: 0.797
# custom avg precision: 0.811
# custom avg accuracy: 0.798

Calculating precision and accuracy metrics for Decision Trees over 20 times
Done!


default avg precision: 0.850
default avg accuracy: 0.853
custom avg precision: 0.834
custom avg accuracy: 0.837
Calculating precision and accuracy metrics for Logistic Regression over 20 times
Done!


default avg precision: 0.809
default avg accuracy: 0.797
custom avg precision: 0.811
custom avg accuracy: 0.798


CONCLUSION: removing flw has a bad (3) impact on performace (on DT, but positive (1) on LR) - CONSIDER DROPPING IT !

### flg - K

In [8]:
# Experiments
experiment(fake, correct, ['flg'], "dt", 20)   # DecisionTree
experiment(fake, correct, ['flg'], "lr", 20)   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 20 times
# Done!


# default avg precision: 0.851
# default avg accuracy: 0.853
# custom avg precision: 0.800
# custom avg accuracy: 0.803
# Calculating precision and accuracy metrics for Logistic Regression over 20 times
# Done!


# default avg precision: 0.811
# default avg accuracy: 0.797
# custom avg precision: 0.745
# custom avg accuracy: 0.750

Calculating precision and accuracy metrics for Decision Trees over 20 times
Done!


default avg precision: 0.851
default avg accuracy: 0.853
custom avg precision: 0.800
custom avg accuracy: 0.803
Calculating precision and accuracy metrics for Logistic Regression over 20 times
Done!


default avg precision: 0.811
default avg accuracy: 0.797
custom avg precision: 0.745
custom avg accuracy: 0.750


CONCLUSION: removing flg has a bad (5) impact on performance - KEEP IT !

### biol - K

In [9]:
# Experiments
experiment(fake, correct, ['biol'], "dt", 20)   # DecisionTree
experiment(fake, correct, ['biol'], "lr", 20)   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 20 times
# Done!


# default avg precision: 0.851
# default avg accuracy: 0.854
# custom avg precision: 0.849
# custom avg accuracy: 0.852
# Calculating precision and accuracy metrics for Logistic Regression over 20 times
# Done!


# default avg precision: 0.810
# default avg accuracy: 0.798
# custom avg precision: 0.810
# custom avg accuracy: 0.795

Calculating precision and accuracy metrics for Decision Trees over 20 times
Done!


default avg precision: 0.851
default avg accuracy: 0.854
custom avg precision: 0.849
custom avg accuracy: 0.852
Calculating precision and accuracy metrics for Logistic Regression over 20 times
Done!


default avg precision: 0.810
default avg accuracy: 0.798
custom avg precision: 0.810
custom avg accuracy: 0.795


CONCLUSION: removing biol has a bad (2) impact on performance - KEEP IT !

### pic - D

In [10]:
# Experiments
experiment(fake, correct, ['pic'], "dt", 20)   # DecisionTree
experiment(fake, correct, ['pic'], "lr", 20)   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 20 times
# Done!


# default avg precision: 0.851
# default avg accuracy: 0.854
# custom avg precision: 0.852
# custom avg accuracy: 0.854
# Calculating precision and accuracy metrics for Logistic Regression over 20 times
# Done!


# default avg precision: 0.809
# default avg accuracy: 0.796
# custom avg precision: 0.807
# custom avg accuracy: 0.795

Calculating precision and accuracy metrics for Decision Trees over 20 times
Done!


default avg precision: 0.851
default avg accuracy: 0.854
custom avg precision: 0.852
custom avg accuracy: 0.854
Calculating precision and accuracy metrics for Logistic Regression over 20 times
Done!


default avg precision: 0.809
default avg accuracy: 0.796
custom avg precision: 0.807
custom avg accuracy: 0.795


CONCLUSION: removing pic has a positive (2) impact on performance (on DT, but bad (1) on LR) - CONSIDER DROPPING IT !

### url - K

In [None]:
# Experiments
experiment(fake, correct, ['url'], "dt", 20)   # DecisionTree
experiment(fake, correct, ['url'], "lr", 20)   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 20 times
# Done!


# default avg precision: 0.852
# default avg accuracy: 0.854
# custom avg precision: 0.803
# custom avg accuracy: 0.804
# Calculating precision and accuracy metrics for Logistic Regression over 20 times
# Done!


# default avg precision: 0.807
# default avg accuracy: 0.796
# custom avg precision: 0.787
# custom avg accuracy: 0.763

CONCLUSION: removing url has a bad (5) impact on performace - KEEP IT !

### cl - K

In [None]:
# Experiments
experiment(fake, correct, ['cl'], "dt", 20)   # DecisionTree
experiment(fake, correct, ['cl'], "lr", 20)   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 20 times
# Done!


# default avg precision: 0.850
# default avg accuracy: 0.853
# custom avg precision: 0.849
# custom avg accuracy: 0.852
# Calculating precision and accuracy metrics for Logistic Regression over 20 times
# Done!


# default avg precision: 0.809
# default avg accuracy: 0.797
# custom avg precision: 0.809
# custom avg accuracy: 0.796

Calculating precision and accuracy metrics for Decision Trees over 20 times
Done!


default avg precision: 0.850
default avg accuracy: 0.853
custom avg precision: 0.849
custom avg accuracy: 0.852
Calculating precision and accuracy metrics for Logistic Regression over 20 times
Done!


default avg precision: 0.809
default avg accuracy: 0.797
custom avg precision: 0.809
custom avg accuracy: 0.796


CONCLUSION: removing cl has a BAD (2) impact on performance - KEEP IT !

### cz - K

In [13]:
# Experiments
experiment(fake, correct, ['cz'], "dt", 20)   # DecisionTree
experiment(fake, correct, ['cz'], "lr", 20)   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 20 times
# Done!


# default avg precision: 0.852
# default avg accuracy: 0.855
# custom avg precision: 0.851
# custom avg accuracy: 0.855
# Calculating precision and accuracy metrics for Logistic Regression over 20 times
# Done!


# default avg precision: 0.812
# default avg accuracy: 0.798
# custom avg precision: 0.815
# custom avg accuracy: 0.798

Calculating precision and accuracy metrics for Decision Trees over 20 times
Done!


default avg precision: 0.852
default avg accuracy: 0.855
custom avg precision: 0.851
custom avg accuracy: 0.855
Calculating precision and accuracy metrics for Logistic Regression over 20 times
Done!


default avg precision: 0.812
default avg accuracy: 0.798
custom avg precision: 0.815
custom avg accuracy: 0.798


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


CONCLUSION: removing cz has a bad (1) impact on performace (on DT, but positive (1) on LR) - CONSIDER DROPPING IT !

### ni - D

In [14]:
# Experiments
experiment(fake, correct, ['ni'], "dt", 20)   # DecisionTree
experiment(fake, correct, ['ni'], "lr", 20)   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 20 times
# Done!


# default avg precision: 0.852
# default avg accuracy: 0.854
# custom avg precision: 0.852
# custom avg accuracy: 0.854
# Calculating precision and accuracy metrics for Logistic Regression over 20 times
# Done!


# default avg precision: 0.811
# default avg accuracy: 0.796
# custom avg precision: 0.813
# custom avg accuracy: 0.798

Calculating precision and accuracy metrics for Decision Trees over 20 times
Done!


default avg precision: 0.852
default avg accuracy: 0.854
custom avg precision: 0.852
custom avg accuracy: 0.854
Calculating precision and accuracy metrics for Logistic Regression over 20 times
8/20

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Done!


default avg precision: 0.811
default avg accuracy: 0.796
custom avg precision: 0.813
custom avg accuracy: 0.798


CONCLUSION: removing ni has a no impact on performance (on DT, but positive (1) on LR) - CONSIDER DROPPING IT !

### erl - K

In [15]:
# Experiments
experiment(fake, correct, ['erl'], "dt", 20)   # DecisionTree
experiment(fake, correct, ['erl'], "lr", 20)   # LogisticRegression

# LOG
# default avg precision: 0.851
# default avg accuracy: 0.853
# custom avg precision: 0.835
# custom avg accuracy: 0.838
# Calculating precision and accuracy metrics for Logistic Regression over 20 times
# Done!


# default avg precision: 0.808
# default avg accuracy: 0.795
# custom avg precision: 0.808
# custom avg accuracy: 0.795

Calculating precision and accuracy metrics for Decision Trees over 20 times
Done!


default avg precision: 0.851
default avg accuracy: 0.853
custom avg precision: 0.835
custom avg accuracy: 0.838
Calculating precision and accuracy metrics for Logistic Regression over 20 times
Done!


default avg precision: 0.808
default avg accuracy: 0.795
custom avg precision: 0.808
custom avg accuracy: 0.795


CONCLUSION: removing erl has a bad (3) impact on performace - KEEP IT !

### erc - K

In [16]:
# Experiments
experiment(fake, correct, ['erc'], "dt", 20)   # DecisionTree
experiment(fake, correct, ['erc'], "lr", 20)   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 20 times
# Done!


# default avg precision: 0.850
# default avg accuracy: 0.854
# custom avg precision: 0.824
# custom avg accuracy: 0.826
# Calculating precision and accuracy metrics for Logistic Regression over 20 times
# Done!


# default avg precision: 0.810
# default avg accuracy: 0.795
# custom avg precision: 0.806
# custom avg accuracy: 0.791

Calculating precision and accuracy metrics for Decision Trees over 20 times
Done!


default avg precision: 0.850
default avg accuracy: 0.854
custom avg precision: 0.824
custom avg accuracy: 0.826
Calculating precision and accuracy metrics for Logistic Regression over 20 times
Done!


default avg precision: 0.810
default avg accuracy: 0.795
custom avg precision: 0.806
custom avg accuracy: 0.791


CONCLUSION: removing erc has a bad (3) impact on performace - KEEP IT !

### lt - K

In [17]:
# Experiments
experiment(fake, correct, ['lt'], "dt", 20)   # DecisionTree
experiment(fake, correct, ['lt'], "lr", 20)   # LogisticRegression

Calculating precision and accuracy metrics for Decision Trees over 20 times
Done!


default avg precision: 0.853
default avg accuracy: 0.855
custom avg precision: 0.851
custom avg accuracy: 0.854
Calculating precision and accuracy metrics for Logistic Regression over 20 times
Done!


default avg precision: 0.810
default avg accuracy: 0.796
custom avg precision: 0.814
custom avg accuracy: 0.787


CONCLUSION: removing lt has a bad (1) impact on performace (on DT, but positive (1) on LR) - CONSIDER DROPING IT !

### ahc - K

In [18]:
# Experiments
experiment(fake, correct, ['ahc'], "dt", 20)   # DecisionTree
experiment(fake, correct, ['ahc'], "lr", 20)   # LogisticRegression

#LOG
# Calculating precision and accuracy metrics for Decision Trees over 20 times
# Done!


# default avg precision: 0.851
# default avg accuracy: 0.854
# custom avg precision: 0.851
# custom avg accuracy: 0.854
# Calculating precision and accuracy metrics for Logistic Regression over 20 times
# Done!


# default avg precision: 0.808
# default avg accuracy: 0.795
# custom avg precision: 0.807
# custom avg accuracy: 0.794

Calculating precision and accuracy metrics for Decision Trees over 20 times
Done!


default avg precision: 0.851
default avg accuracy: 0.854
custom avg precision: 0.851
custom avg accuracy: 0.854
Calculating precision and accuracy metrics for Logistic Regression over 20 times
Done!


default avg precision: 0.808
default avg accuracy: 0.795
custom avg precision: 0.807
custom avg accuracy: 0.794


CONCLUSION: removing ahc has a no impact on performace (on DT, but bad (1) on LR) - KEEP IT !

### pr - K

In [19]:
# Experiments
experiment(fake, correct, ['pr'], "dt", 20)   # DecisionTree
experiment(fake, correct, ['pr'], "lr", 20)   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 20 times
# Done!


# default avg precision: 0.851
# default avg accuracy: 0.854
# custom avg precision: 0.849
# custom avg accuracy: 0.852
# Calculating precision and accuracy metrics for Logistic Regression over 20 times
# Done!


# default avg precision: 0.810
# default avg accuracy: 0.797
# custom avg precision: 0.806
# custom avg accuracy: 0.793

Calculating precision and accuracy metrics for Decision Trees over 20 times
Done!


default avg precision: 0.851
default avg accuracy: 0.854
custom avg precision: 0.849
custom avg accuracy: 0.852
Calculating precision and accuracy metrics for Logistic Regression over 20 times
Done!


default avg precision: 0.810
default avg accuracy: 0.797
custom avg precision: 0.806
custom avg accuracy: 0.793


CONCLUSION: removing pr has a bad (1) impact on performace - KEEP IT !

### fo - K

In [20]:
# Experiments
experiment(fake, correct, ['fo'], "dt", 20)   # DecisionTree
experiment(fake, correct, ['fo'], "lr", 20)   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 20 times
# Done!


# default avg precision: 0.851
# default avg accuracy: 0.854
# custom avg precision: 0.850
# custom avg accuracy: 0.853
# Calculating precision and accuracy metrics for Logistic Regression over 20 times
# Done!


# default avg precision: 0.809
# default avg accuracy: 0.797
# custom avg precision: 0.808
# custom avg accuracy: 0.795

Calculating precision and accuracy metrics for Decision Trees over 20 times
Done!


default avg precision: 0.851
default avg accuracy: 0.854
custom avg precision: 0.850
custom avg accuracy: 0.853
Calculating precision and accuracy metrics for Logistic Regression over 20 times
Done!


default avg precision: 0.809
default avg accuracy: 0.797
custom avg precision: 0.808
custom avg accuracy: 0.795


CONCLUSION: removing fo has a bad (1) impact on performace - KEEP IT !

### cs - K

In [21]:
# Experiments
experiment(fake, correct, ['cs'], "dt", 20)   # DecisionTree
experiment(fake, correct, ['cs'], "lr", 20)   # LogisticRegression

# LOG
# Calculating precision and accuracy metrics for Decision Trees over 20 times
# Done!


# default avg precision: 0.851
# default avg accuracy: 0.854
# custom avg precision: 0.849
# custom avg accuracy: 0.853
# Calculating precision and accuracy metrics for Logistic Regression over 20 times
# Done!


# default avg precision: 0.811
# default avg accuracy: 0.797
# custom avg precision: 0.785
# custom avg accuracy: 0.788

Calculating precision and accuracy metrics for Decision Trees over 20 times
Done!


default avg precision: 0.851
default avg accuracy: 0.854
custom avg precision: 0.849
custom avg accuracy: 0.853
Calculating precision and accuracy metrics for Logistic Regression over 20 times
2/20

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Done!


default avg precision: 0.811
default avg accuracy: 0.797
custom avg precision: 0.785
custom avg accuracy: 0.788


CONCLUSION: removing cs has a small bad (2) impact on performace - KEEP IT !

### avgtime - K

In [22]:
# Experiments
experiment(fake, correct, ['avgtime'], "dt", 20)   # DecisionTree
experiment(fake, correct, ['avgtime'], "lr", 20)   # LogisticRegression

Calculating precision and accuracy metrics for Decision Trees over 20 times
Done!


default avg precision: 0.852
default avg accuracy: 0.854
custom avg precision: 0.851
custom avg accuracy: 0.853
Calculating precision and accuracy metrics for Logistic Regression over 20 times
14/20

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Done!


default avg precision: 0.809
default avg accuracy: 0.797
custom avg precision: 0.784
custom avg accuracy: 0.788


CONCLUSION: removing flw has a bad (2) impact on performace - KEEP IT !

## Second experiment with custom features

In this experiment we fit (several) Decision Tree Classifier(s) (and Linear Regressors) removing from dataframes the attributes which seemed to worsen performance during the experiments.

In [29]:
# Experiments
experiment(fake, correct, ["pic", "ni"], "dt", 20)   # DecisionTree
experiment(fake, correct, ["pic", "ni"], "lr", 20)   # LogisticRegression

Calculating precision and accuracy metrics for Decision Trees over 20 times
Done!


default avg precision: 0.852
default avg accuracy: 0.855
custom avg precision: 0.850
custom avg accuracy: 0.854
Calculating precision and accuracy metrics for Logistic Regression over 20 times
1/20

KeyboardInterrupt: 

CONCLUSION: removing pic, cl, ni, lt and ahc columns improved performances !