# Instagram Fake Account Detection

## Import libraries

In [1]:
from dataset.normalizer import csv_importer, csv_importer_full
import csv
import random
import pandas as pd
from sklearn import tree, metrics

## Data pre-processing

Define function to find demarcator

In [2]:
def find_demarcator(dataset):
    """
    Restituisce l'indice del primo elemento non fake
    :param dataset: il dataset
    :return: l'indice
    """
    idx = 0
    for elem in dataset:
        if elem['fake'] == 1:
            idx += 1
        else:
            break
    return idx

Set train:test ratio

In [3]:
PERCENT_TRAIN = 70

Import dataset

In [5]:
default_dataset = csv_importer_full("./dataset/sources/user_fake_authentic_2class.csv")

Now loading from file ./dataset/sources/user_fake_authentic_2class.csv...
Loaded 65327 entries from source ./dataset/sources/user_fake_authentic_2class.csv


Split dataset into (balanced) training and validation sets

In [6]:
print(f"Now splitting dataset with ratio {PERCENT_TRAIN}:{100 - PERCENT_TRAIN}")

# Find demarcator (in the original datasets all fake accounts are at the beginning)
idx = find_demarcator(default_dataset)
# Separate fakes from real accounts
fake = default_dataset[:idx]
correct = default_dataset[idx:]
# Shuffle both datatets (otherwise, train and validation sets would always contain the same elements)
random.shuffle(fake)
random.shuffle(correct)
# Create training set
train = fake[:int(len(fake) * (PERCENT_TRAIN / 100))]
train += correct[:int(len(correct) * (PERCENT_TRAIN / 100))]
# Create validation set
validation = fake[int(len(fake) * (PERCENT_TRAIN / 100)):]
validation += correct[int(len(correct) * (PERCENT_TRAIN / 100)):]
# Shuffle both datasets
random.shuffle(train)
random.shuffle(validation)

print("Loading complete.")

Now splitting dataset with ratio 70:30
Loading complete.


Cast to pandas dataframes

In [7]:
train_df = pd.DataFrame.from_dict(train)
validation_df = pd.DataFrame.from_dict(validation)
print(train_df)
print(validation_df)

       nmedia     flw     flg   biol  pic  url     cl        cz     ni  \
0       136.0  2100.0  3300.0    8.0  1.0  0.0  139.0  0.000000  0.389   
1         6.0    69.0   582.0    9.0  1.0  0.0  126.0  0.000000  0.000   
2      1300.0   978.0   486.0  114.0  1.0  1.0  188.0  0.000000  0.000   
3         2.0   136.0  7300.0    0.0  1.0  0.0    0.0  1.000000  0.000   
4         0.0    16.0    82.0    0.0  1.0  0.0    0.0  0.000000  0.000   
...       ...     ...     ...    ...  ...  ...    ...       ...    ...   
45723     0.0     0.0    78.0    0.0  1.0  0.0    0.0  0.000000  0.000   
45724     0.0   148.0  7300.0    0.0  1.0  0.0    0.0  0.000000  0.000   
45725    39.0   820.0   733.0   69.0  1.0  1.0   50.0  0.055556  0.389   
45726    57.0   976.0  7400.0   28.0  1.0  0.0  152.0  0.000000  0.167   
45727   430.0   320.0   686.0    0.0  1.0  0.0   33.0  0.000000  0.444   

              erl   erc     lt    ahc     pr   fo        cs      avgtime  fake  
0       11.170000  0.87  0.000

## Training

In [9]:
# Default tree
X, y = train_df.iloc[:, :-2], train_df.iloc[:, -1]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)
print("Fitting complete.")

Fitting complete.


## Evaluation

In [10]:
X_val, y_val = validation_df.iloc[:, :-2], validation_df.iloc[:, -1]
y_pred = clf.predict(X_val)
print(metrics.classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           0       0.86      0.86      0.86      9738
           1       0.86      0.86      0.86      9860

    accuracy                           0.86     19598
   macro avg       0.86      0.86      0.86     19598
weighted avg       0.86      0.86      0.86     19598



## First experiment with custom features

In [13]:
custom_dataset = csv_importer("./dataset/sources/user_fake_authentic_2class.csv")

custom_fake = custom_dataset[:idx]
custom_correct = custom_dataset[idx:]

random.shuffle(custom_fake)
random.shuffle(custom_correct)

custom_train = custom_fake[:int(len(custom_fake) * (PERCENT_TRAIN / 100))]
custom_train += custom_correct[:int(len(custom_correct) * (PERCENT_TRAIN / 100))]

custom_validation = custom_fake[int(len(custom_fake) * (PERCENT_TRAIN / 100)):]
custom_validation += custom_correct[int(len(custom_correct) * (PERCENT_TRAIN / 100)):]

random.shuffle(custom_train)
random.shuffle(custom_validation)

print("Loading complete.")

train_df = pd.DataFrame.from_dict(train)
validation_df = pd.DataFrame.from_dict(validation)
#print(train_df)
#print(validation_df)

custom_train_df = pd.DataFrame.from_dict(custom_train)
custom_validation_df = pd.DataFrame.from_dict(custom_validation)
#print(custom_train_df)
#print(custom_validation_df)

# Custom tree
cX, cy = custom_train_df.iloc[:,:-2], custom_train_df.iloc[:,-1]
cclf = tree.DecisionTreeClassifier()
cclf = cclf.fit(cX, cy)
print("Fitting complete.")

cX_val, cy_val = custom_validation_df.iloc[:,:-2], validation_df.iloc[:, -1]
cy_pred = cclf.predict(cX_val)

print(metrics.classification_report(cy_val,cy_pred))

Now loading from file ./dataset/sources/user_fake_authentic_2class.csv...
Loaded 65327 entries from source ./dataset/sources/user_fake_authentic_2class.csv
Loading complete.
Fitting complete.
              precision    recall  f1-score   support

           0       0.50      0.50      0.50      9738
           1       0.51      0.51      0.51      9860

    accuracy                           0.50     19598
   macro avg       0.50      0.50      0.50     19598
weighted avg       0.50      0.50      0.50     19598

