# Instagram Fake Account Detection

## Import libraries

In [None]:
from dataset.normalizer import csv_importer_full
from dataset.utils import find_demarcator, shuffle_and_split
from sequoia_comparison.utils import get_scores

import pandas as pd
from sklearn import tree, metrics
from sklearn.linear_model import LogisticRegression

## Import data

In [None]:
# Load the full dataset from its CSV source.
default_dataset = csv_importer_full("dataset/sources/user_fake_authentic_2class.csv")

# `find_demarcator` gives the index at which the dataset is split below:
# everything before it is kept as `fake`, everything from it on as `correct`.
idx = find_demarcator(default_dataset)
fake, correct = default_dataset[:idx], default_dataset[idx:]

## EXPERIMENT 26 - 04

Using custom experiment functions so as not to interfere with the real experiments.

In [None]:
def get_custom_dataset(train_df, validation_df, column_names=None):
    """Return both dataframes with the given columns dropped.

    Args:
        train_df: Training dataframe.
        validation_df: Validation dataframe.
        column_names: Label (or list of labels) of the columns to drop from
            both dataframes. ``None`` (the default) or an empty list drops
            nothing.

    Returns:
        A ``(custom_train_df, custom_validation_df)`` tuple. The inputs are
        not modified: ``DataFrame.drop`` returns a new object.

    Raises:
        KeyError: If a requested column is missing from either dataframe
            (raised by ``DataFrame.drop``).
    """
    # A `None` sentinel replaces the former mutable `[]` default.
    columns_to_drop = [] if column_names is None else column_names

    custom_train_df = train_df.drop(columns=columns_to_drop)
    custom_validation_df = validation_df.drop(columns=columns_to_drop)

    return custom_train_df, custom_validation_df

In [None]:
# Human-readable classifier names, keyed by the `mode` codes accepted by `experiment`.
_MODE_LABELS = {"dt": "Decision Trees", "lr": "Logistic Regression"}

# Iteration budget shared by every LogisticRegression fit, so the default and
# custom runs are compared under identical solver settings.
_LR_MAX_ITER = 5000


def _new_classifier(mode):
    """Return a fresh, unfitted classifier for ``mode`` ("dt" or "lr")."""
    if mode == "dt":
        return tree.DecisionTreeClassifier()
    return LogisticRegression(random_state=0, max_iter=_LR_MAX_ITER)


def _fit_and_score(train_df, validation_df, mode):
    """Fit a new classifier on ``train_df`` and score it on ``validation_df``."""
    # Features are every column except the last two; the target is the last
    # column. NOTE(review): the second-to-last column is excluded from both
    # features and target -- presumably an alternate label; confirm.
    clf = _new_classifier(mode).fit(train_df.iloc[:, :-2], train_df.iloc[:, -1])
    y_pred = clf.predict(validation_df.iloc[:, :-2])
    return get_scores(validation_df.iloc[:, -1], y_pred)


def experiment(fake, correct, column_names=None, mode="dt", n_iter=20):
    """Compare a classifier trained on all columns with one trained on a subset.

    For each of ``n_iter`` iterations a new train/validation split is drawn
    with ``shuffle_and_split(fake, correct)``. One classifier is fitted on
    the split as-is ("default") and another on the same split with
    ``column_names`` dropped ("custom"). Precision and accuracy, as returned
    by ``get_scores``, are averaged over the iterations and printed.

    Args:
        fake: First dataset slice, passed to ``shuffle_and_split``.
        correct: Second dataset slice, passed to ``shuffle_and_split``.
        column_names: Columns to drop to build the custom dataset. ``None``
            (the default) or an empty list drops nothing.
        mode: ``"dt"`` for DecisionTree, ``"lr"`` for LogisticRegression.
        n_iter: Number of split/fit/score iterations; must be at least 1.

    Returns:
        ``-1`` if ``mode`` is not recognised, otherwise ``None`` (results are
        printed, not returned).

    Raises:
        ValueError: If ``n_iter`` is smaller than 1 (the averages would be
            undefined).
    """
    if mode not in _MODE_LABELS:
        return -1
    if n_iter < 1:
        raise ValueError(f"n_iter must be a positive integer, got {n_iter!r}")

    # A `None` sentinel replaces the former mutable `[]` default.
    columns_to_drop = [] if column_names is None else column_names

    print(f"Calculating precision and accuracy metrics for {_MODE_LABELS[mode]} over {n_iter} times")

    avg_scores = {
        'default': {'precision': 0, 'accuracy': 0},
        'custom': {'precision': 0, 'accuracy': 0},
    }

    for i in range(n_iter):
        # Both variants share the same split so only the dropped columns differ.
        train_df, validation_df = shuffle_and_split(fake, correct)
        custom_train_df, custom_validation_df = get_custom_dataset(
            train_df, validation_df, columns_to_drop
        )

        variants = {
            'default': (train_df, validation_df),
            'custom': (custom_train_df, custom_validation_df),
        }
        for variant, (fit_df, eval_df) in variants.items():
            scores = _fit_and_score(fit_df, eval_df, mode)
            for metric in avg_scores[variant]:
                avg_scores[variant][metric] += scores[metric]

        # Carriage return keeps the progress counter on a single line.
        print(f"{i + 1}/{n_iter}", end="\r")

    # Turn the accumulated sums into averages.
    for totals in avg_scores.values():
        for metric in totals:
            totals[metric] /= n_iter

    print('Done!\n\n')

    for variant in ('default', 'custom'):
        for metric in ('precision', 'accuracy'):
            print(f"{variant} avg {metric}:", f"{avg_scores[variant][metric]:.3f}")


## Evaluate impact upon removing single-attributes

Impact (bad/good) on performance is also evaluated from 1 (very small) to 5 (very big)

In [None]:
# Show the column labels of the `fake` subset (wrapped in a DataFrame to read `.columns`).
print(pd.DataFrame.from_dict(fake).columns)

### nmedia

In [None]:
# Full column set vs. the same data without 'nmedia', 20 iterations per model
experiment(fake, correct, column_names=['nmedia'], mode="dt", n_iter=20)   # DecisionTree
experiment(fake, correct, column_names=['nmedia'], mode="lr", n_iter=20)   # LogisticRegression

CONCLUSION: removing nmedia has a bad (1) impact on performance - KEEP IT!

### flw

In [None]:
# Full column set vs. the same data without 'flw', 20 iterations per model
experiment(fake, correct, column_names=['flw'], mode="dt", n_iter=20)   # DecisionTree
experiment(fake, correct, column_names=['flw'], mode="lr", n_iter=20)   # LogisticRegression

CONCLUSION: removing flw has a (bad) impact on performance

### flg

In [None]:
# Full column set vs. the same data without 'flg', 20 iterations per model
experiment(fake, correct, column_names=['flg'], mode="dt", n_iter=20)   # DecisionTree
experiment(fake, correct, column_names=['flg'], mode="lr", n_iter=20)   # LogisticRegression

CONCLUSION: removing flg has a big (bad) impact on performance

### biol

In [None]:
# Full column set vs. the same data without 'biol', 20 iterations per model
experiment(fake, correct, column_names=['biol'], mode="dt", n_iter=20)   # DecisionTree
experiment(fake, correct, column_names=['biol'], mode="lr", n_iter=20)   # LogisticRegression

CONCLUSION: removing biol has a small impact on performance (better f1-score, worse recall)

### pic

In [None]:
# Full column set vs. the same data without 'pic', 20 iterations per model
experiment(fake, correct, column_names=['pic'], mode="dt", n_iter=20)   # DecisionTree
experiment(fake, correct, column_names=['pic'], mode="lr", n_iter=20)   # LogisticRegression

CONCLUSION: removing pic has a small (positive) impact on performance

### url

In [None]:
# Full column set vs. the same data without 'url', 20 iterations per model
experiment(fake, correct, column_names=['url'], mode="dt", n_iter=20)   # DecisionTree
experiment(fake, correct, column_names=['url'], mode="lr", n_iter=20)   # LogisticRegression

CONCLUSION: removing url has a big (bad) impact on performance

### cl

In [None]:
# Full column set vs. the same data without 'cl', 20 iterations per model
experiment(fake, correct, column_names=['cl'], mode="dt", n_iter=20)   # DecisionTree
experiment(fake, correct, column_names=['cl'], mode="lr", n_iter=20)   # LogisticRegression

CONCLUSION: removing cl has a small (positive) impact on performance

### cz

In [None]:
# Full column set vs. the same data without 'cz', 20 iterations per model
experiment(fake, correct, column_names=['cz'], mode="dt", n_iter=20)   # DecisionTree
experiment(fake, correct, column_names=['cz'], mode="lr", n_iter=20)   # LogisticRegression

CONCLUSION: removing cz has a small (bad) impact on performance

### ni

In [None]:
# Full column set vs. the same data without 'ni', 20 iterations per model
experiment(fake, correct, column_names=['ni'], mode="dt", n_iter=20)   # DecisionTree
experiment(fake, correct, column_names=['ni'], mode="lr", n_iter=20)   # LogisticRegression

CONCLUSION: removing ni has a small (positive) impact on performance

### erl

In [None]:
# Full column set vs. the same data without 'erl', 20 iterations per model
experiment(fake, correct, column_names=['erl'], mode="dt", n_iter=20)   # DecisionTree
experiment(fake, correct, column_names=['erl'], mode="lr", n_iter=20)   # LogisticRegression

CONCLUSION: removing erl has a (bad) impact on performance

### erc

In [None]:
# Full column set vs. the same data without 'erc', 20 iterations per model
experiment(fake, correct, column_names=['erc'], mode="dt", n_iter=20)   # DecisionTree
experiment(fake, correct, column_names=['erc'], mode="lr", n_iter=20)   # LogisticRegression

CONCLUSION: removing erc has a (bad) impact on performance

### lt

In [None]:
# Full column set vs. the same data without 'lt', 20 iterations per model
experiment(fake, correct, column_names=['lt'], mode="dt", n_iter=20)   # DecisionTree
experiment(fake, correct, column_names=['lt'], mode="lr", n_iter=20)   # LogisticRegression

CONCLUSION: removing lt has a (positive) impact on performance

### ahc

In [None]:
# Full column set vs. the same data without 'ahc', 20 iterations per model
experiment(fake, correct, column_names=['ahc'], mode="dt", n_iter=20)   # DecisionTree
experiment(fake, correct, column_names=['ahc'], mode="lr", n_iter=20)   # LogisticRegression

CONCLUSION: removing ahc has a (positive) impact on performance

### pr

In [None]:
# Full column set vs. the same data without 'pr', 20 iterations per model
experiment(fake, correct, column_names=['pr'], mode="dt", n_iter=20)   # DecisionTree
experiment(fake, correct, column_names=['pr'], mode="lr", n_iter=20)   # LogisticRegression

CONCLUSION: removing pr has a small (bad) impact on performance

### fo

In [None]:
# Full column set vs. the same data without 'fo', 20 iterations per model
experiment(fake, correct, column_names=['fo'], mode="dt", n_iter=20)   # DecisionTree
experiment(fake, correct, column_names=['fo'], mode="lr", n_iter=20)   # LogisticRegression

CONCLUSION: removing fo has no impact on performance

### cs

In [None]:
# Full column set vs. the same data without 'cs', 20 iterations per model
experiment(fake, correct, column_names=['cs'], mode="dt", n_iter=20)   # DecisionTree
experiment(fake, correct, column_names=['cs'], mode="lr", n_iter=20)   # LogisticRegression

CONCLUSION: removing cs has a small (bad) impact on performance

### avgtime

In [None]:
# Full column set vs. the same data without 'avgtime', 20 iterations per model
experiment(fake, correct, column_names=['avgtime'], mode="dt", n_iter=20)   # DecisionTree
experiment(fake, correct, column_names=['avgtime'], mode="lr", n_iter=20)   # LogisticRegression

CONCLUSION: removing avgtime has a small (bad) impact on performance

## Second experiment with custom features

In this experiment we fit several Decision Tree classifiers (and Logistic Regression classifiers) after removing from the dataframes the attributes whose presence seemed to worsen performance in the single-attribute experiments above.

In [None]:
# Drop every attribute whose individual removal had a positive impact in the
# single-attribute experiments, then compare against the full column set.
# (Passing an empty list here would make the custom run identical to the default one.)
experiment(fake, correct, ['pic', 'cl', 'ni', 'lt', 'ahc'], "dt", 20)   # DecisionTree
experiment(fake, correct, ['pic', 'cl', 'ni', 'lt', 'ahc'], "lr", 20)   # LogisticRegression

CONCLUSION: removing the pic, cl, ni, lt and ahc columns improved performance!