# Instagram Fake Account Detection

## Import libraries

In [15]:
from dataset.normalizer import json_importer_full
from dataset.utils import shuffle_and_split

import pandas as pd
from sklearn import tree, metrics
from sklearn.linear_model import LogisticRegression

## Import data

In [8]:
# Load the two account datasets from their JSON sources.
# NOTE(review): the second argument presumably marks the records as fake (True)
# or genuine (False) — confirm against json_importer_full.
fake = json_importer_full("./dataset/sources/automatedAccountData.json", True)
correct = json_importer_full("./dataset/sources/nonautomatedAccountData.json", False)

Now loading from file ./dataset/sources/automatedAccountData.json...
Loaded 700 entries from source ./dataset/sources/automatedAccountData.json
Now loading from file ./dataset/sources/nonautomatedAccountData.json...
Loaded 700 entries from source ./dataset/sources/nonautomatedAccountData.json


## EXPERIMENT 29 - 06

### Macros

In [4]:
N_EXP = 100 # Default number of repetitions (n_iter) that experiment() averages over
MAX_ITER = 50000 # max_iter passed to LogisticRegression inside experiment()

### Functions

Custom experiment functions are used here so as not to interfere with the real experiments.

In [5]:
def get_custom_dataset(train_df, validation_df, column_names=None):
    '''
    Drop target columns from dataset

    Builds the "custom" train/validation pair by removing the same columns
    from both frames. `drop` is called without `inplace`, so the frames passed
    in are not modified.

    train_df, validation_df: DataFrames to derive the custom frames from
    column_names: column label(s) to drop; None (the default) or an empty list
                  drops nothing

    Returns the tuple (custom_train_df, custom_validation_df).
    '''
    # None is used as the "nothing to drop" sentinel instead of a mutable
    # list default shared between calls.
    labels = [] if column_names is None else column_names

    custom_train_df = train_df.drop(columns=labels)
    custom_validation_df = validation_df.drop(columns=labels)

    return custom_train_df, custom_validation_df

In [24]:
def _split_features_target(df):
    '''Split a frame into (features, target) using the notebook's positional layout.'''
    # The target is the last column. The feature slice stops two columns
    # before the end, so the column immediately preceding the target is not
    # used as a feature.
    # NOTE(review): confirm this matches the column layout returned by
    # shuffle_and_split (e.g. an extra helper column before the label).
    return df.iloc[:, :-2], df.iloc[:, -1]


def _fit_classifier(mode, train_df):
    '''Fit a fresh classifier of the kind selected by `mode` ("dt" or "lr").'''
    X_train, y_train = _split_features_target(train_df)
    if mode == "dt":
        clf = tree.DecisionTreeClassifier()
    else:
        # MAX_ITER is used for every Logistic Regression fit so that the
        # default and custom variants are trained under the same budget
        # (and match the banner printed by experiment()).
        clf = LogisticRegression(random_state=0, max_iter=MAX_ITER)
    return clf.fit(X_train, y_train)


def _evaluate(mode, train_df, validation_df):
    '''Train on `train_df`, predict on `validation_df`, return the get_scores() dict.'''
    clf = _fit_classifier(mode, train_df)
    X_val, y_val = _split_features_target(validation_df)
    return get_scores(y_val, clf.predict(X_val))


def experiment(fake, correct, column_names=None, mode="dt", n_iter=None):
    '''
    Compare a classifier trained on the default dataset against one trained on
    a custom dataset with some columns removed, averaged over several runs.

    Each run draws a new train/validation split with shuffle_and_split (shared
    by both variants), fits one classifier per variant and scores it with
    get_scores. The averaged metrics are printed; nothing is returned on
    success.

    fake, correct: the two data sources handed to shuffle_and_split
    column_names: list of columns to drop from default dataset to get custom
                  dataset (None or empty: custom equals default)
    n_iter: number of runs to average over; None (the default) uses N_EXP,
            looked up at call time

    modes:
     - "dt" => DecisionTree
     - "lr" => LogisticRegression

    Returns -1 (without running anything) when `mode` is not recognised.
    '''
    if column_names is None:
        column_names = []
    if n_iter is None:
        n_iter = N_EXP

    if mode == "dt":
        print(f"Calculating precision and accuracy metrics for Decision Trees over {n_iter} times")
    elif mode == "lr":
        print(f"Calculating precision and accuracy metrics for Logistic Regression (max_iter={MAX_ITER}) over {n_iter} times")
    else:
        return -1

    variants = ('default', 'custom')
    metric_names = ('precision', 'accuracy', 'recall', 'f1')
    avg_scores = {variant: dict.fromkeys(metric_names, 0) for variant in variants}

    for _ in range(n_iter):
        # Get new train_df and validation_df, same for default and custom
        train_df, validation_df = shuffle_and_split(fake, correct)
        custom_train_df, custom_validation_df = get_custom_dataset(train_df, validation_df, column_names)

        frames = {
            'default': (train_df, validation_df),
            'custom': (custom_train_df, custom_validation_df),
        }
        # The default variant is evaluated before the custom one in every run.
        for variant in variants:
            scores = _evaluate(mode, *frames[variant])
            for metric in metric_names:
                avg_scores[variant][metric] += scores[metric]

    # Averaging
    for totals in avg_scores.values():
        for metric in totals:
            totals[metric] /= n_iter

    print('Done!\n\n')

    labels = {'precision': 'precision', 'accuracy': 'accuracy', 'recall': 'recall', 'f1': 'f1-score'}
    for variant in variants:
        if variant == 'custom':
            print('---')
        for metric in metric_names:
            print(f'{variant} avg {labels[metric]}:', "{:.5f}".format(avg_scores[variant][metric]))


In [22]:
def get_scores(y_val, y_pred):
    '''
    Score one set of predictions against the ground truth.

    y_val: ground-truth labels
    y_pred: predicted labels

    Returns a dict with the keys 'accuracy', 'precision', 'recall' and 'f1',
    each holding the value of the matching sklearn.metrics `*_score` function.
    '''
    metric_names = ('accuracy', 'precision', 'recall', 'f1')
    return {
        name: getattr(metrics, name + '_score')(y_val, y_pred)
        for name in metric_names
    }

## Evaluate the impact of removing single attributes

The impact on performance (negative or positive) is also rated on a scale from 1 (very small) to 5 (very large).

In [17]:
# Show the column names of the fake-account data (the candidate attributes to remove below)
print(pd.DataFrame.from_dict(fake).columns)

Index(['nmedia', 'biol', 'url', 'nfollowing', 'nfollower', 'mediaLikeNumbers',
       'mediaHashtagNumbers', 'followerToFollowing', 'hasMedia',
       'userHasHighlighReels', 'usernameLength', 'usernameDigitCount', 'fake'],
      dtype='object')


### nmedia (keep)

In [25]:
# Decision Tree: full feature set vs. feature set without 'nmedia'
experiment(fake, correct, column_names=['nmedia'], mode="dt")

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.90915
default avg accuracy: 0.91128
default avg recall: 0.91441
default avg f1-score: 0.91156
---
custom avg precision: 0.90841
custom avg accuracy: 0.91007
custom avg recall: 0.91251
custom avg f1-score: 0.91028


### biol (keep)

In [26]:
# Decision Tree: full feature set vs. feature set without 'biol'
experiment(fake, correct, column_names=['biol'], mode="dt")

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.90672
default avg accuracy: 0.90725
default avg recall: 0.90844
default avg f1-score: 0.90731
---
custom avg precision: 0.89562
custom avg accuracy: 0.89808
custom avg recall: 0.90161
custom avg f1-score: 0.89839


### url (drop)

In [27]:
# Decision Tree: full feature set vs. feature set without 'url'
experiment(fake, correct, column_names=['url'], mode="dt")

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.91078
default avg accuracy: 0.90993
default avg recall: 0.90953
default avg f1-score: 0.90989
---
custom avg precision: 0.91180
custom avg accuracy: 0.91078
custom avg recall: 0.91009
custom avg f1-score: 0.91071


### nfollowing (try)

In [28]:
# Decision Tree: full feature set vs. feature set without 'nfollowing'
experiment(fake, correct, column_names=['nfollowing'], mode="dt")

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.90314
default avg accuracy: 0.90531
default avg recall: 0.90844
default avg f1-score: 0.90556
---
custom avg precision: 0.90579
custom avg accuracy: 0.90588
custom avg recall: 0.90649
custom avg f1-score: 0.90592


### nfollower (try)

In [29]:
# Decision Tree: full feature set vs. feature set without 'nfollower'
experiment(fake, correct, column_names=['nfollower'], mode="dt")

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.90400
default avg accuracy: 0.90583
default avg recall: 0.90863
default avg f1-score: 0.90605
---
custom avg precision: 0.91011
custom avg accuracy: 0.90623
custom avg recall: 0.90190
custom avg f1-score: 0.90575


### mediaLikeNumbers (drop)

In [30]:
# Decision Tree: full feature set vs. feature set without 'mediaLikeNumbers'
experiment(fake, correct, column_names=['mediaLikeNumbers'], mode="dt")

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.90746
default avg accuracy: 0.90730
default avg recall: 0.90754
default avg f1-score: 0.90730
---
custom avg precision: 0.90885
custom avg accuracy: 0.90948
custom avg recall: 0.91071
custom avg f1-score: 0.90957


### mediaHashtagNumbers (keep)

In [32]:
# Decision Tree: full feature set vs. feature set without 'mediaHashtagNumbers'
experiment(fake, correct, column_names=['mediaHashtagNumbers'], mode="dt")

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.90920
default avg accuracy: 0.90642
default avg recall: 0.90355
default avg f1-score: 0.90615
---
custom avg precision: 0.90646
custom avg accuracy: 0.90282
custom avg recall: 0.89886
custom avg f1-score: 0.90239


### followerToFollowing (keep)

In [33]:
# Decision Tree: full feature set vs. feature set without 'followerToFollowing'
experiment(fake, correct, column_names=['followerToFollowing'], mode="dt")

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.91026
default avg accuracy: 0.90799
default avg recall: 0.90578
default avg f1-score: 0.90776
---
custom avg precision: 0.90432
custom avg accuracy: 0.89912
custom avg recall: 0.89336
custom avg f1-score: 0.89852


### hasMedia (try)

In [34]:
# Decision Tree: full feature set vs. feature set without 'hasMedia'
experiment(fake, correct, column_names=['hasMedia'], mode="dt")

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.90882
default avg accuracy: 0.90938
default avg recall: 0.91076
default avg f1-score: 0.90950
---
custom avg precision: 0.90924
custom avg accuracy: 0.90919
custom avg recall: 0.90986
custom avg f1-score: 0.90923


### userHasHighlighReels (try)

In [35]:
# Decision Tree: full feature set vs. feature set without 'userHasHighlighReels'
experiment(fake, correct, column_names=['userHasHighlighReels'], mode="dt")

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.90971
default avg accuracy: 0.90820
default avg recall: 0.90678
default avg f1-score: 0.90803
---
custom avg precision: 0.91151
custom avg accuracy: 0.90903
custom avg recall: 0.90640
custom avg f1-score: 0.90874


### usernameLength (drop)

In [36]:
# Decision Tree: full feature set vs. feature set without 'usernameLength'
experiment(fake, correct, column_names=['usernameLength'], mode="dt")

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.90698
default avg accuracy: 0.90735
default avg recall: 0.90839
default avg f1-score: 0.90741
---
custom avg precision: 0.90907
custom avg accuracy: 0.90969
custom avg recall: 0.91114
custom avg f1-score: 0.90981


### usernameDigitCount (drop)

In [37]:
# Decision Tree: full feature set vs. feature set without 'usernameDigitCount'
experiment(fake, correct, column_names=['usernameDigitCount'], mode="dt")

Calculating precision and accuracy metrics for Decision Trees over 100 times
Done!


default avg precision: 0.90527
default avg accuracy: 0.90614
default avg recall: 0.90777
default avg f1-score: 0.90626
---
custom avg precision: 0.90868
custom avg accuracy: 0.90922
custom avg recall: 0.91052
custom avg f1-score: 0.90933
