# Instagram Fake Account Detection

## Import libraries

In [1]:
from dataset.normalizer import csv_importer, csv_importer_full
import csv
import random
import pandas as pd
from sklearn import tree, metrics

## Data pre-processing

Define function to find demarcator

In [2]:
def find_demarcator(dataset):
    """
    Restituisce l'indice del primo elemento non fake
    :param dataset: il dataset
    :return: l'indice
    """
    idx = 0
    for elem in dataset:
        if elem['fake'] == 1:
            idx += 1
        else:
            break
    return idx

Set train:test ratio

In [3]:
PERCENT_TRAIN = 70

Import dataset

In [4]:
default_dataset = csv_importer_full("./dataset/sources/user_fake_authentic_2class.csv")

Now loading from file ./dataset/sources/user_fake_authentic_2class.csv...
Loaded 65327 entries from source ./dataset/sources/user_fake_authentic_2class.csv


Split dataset into (balanced) training and validation sets

In [5]:
print(f"Now splitting dataset with ratio {PERCENT_TRAIN}:{100 - PERCENT_TRAIN}")

# Find demarcator (in the original datasets all fake accounts are at the beginning)
idx = find_demarcator(default_dataset)
# Separate fakes from real accounts
fake = default_dataset[:idx]
correct = default_dataset[idx:]
# Shuffle both datatets (otherwise, train and validation sets would always contain the same elements)
random.shuffle(fake)
random.shuffle(correct)
# Create training set
train = fake[:int(len(fake) * (PERCENT_TRAIN / 100))]
train += correct[:int(len(correct) * (PERCENT_TRAIN / 100))]
# Create validation set
validation = fake[int(len(fake) * (PERCENT_TRAIN / 100)):]
validation += correct[int(len(correct) * (PERCENT_TRAIN / 100)):]
# Shuffle both datasets
random.shuffle(train)
random.shuffle(validation)

print("Loading complete.")

Now splitting dataset with ratio 70:30
Loading complete.


Cast to pandas dataframes

In [6]:
train_df = pd.DataFrame.from_dict(train)
validation_df = pd.DataFrame.from_dict(validation)
print(train_df)
print(validation_df)

       nmedia     flw     flg   biol  pic  url     cl        cz     ni  \
0         7.0    92.0  6700.0  136.0  1.0  0.0   31.0  0.000000  0.000   
1        13.0   365.0  4100.0    0.0  1.0  0.0    4.0  0.538462  0.231   
2         2.0    11.0   125.0    0.0  0.0  0.0    0.0  1.000000  0.500   
3        72.0   311.0  7400.0    0.0  1.0  0.0    1.0  0.944444  0.000   
4         0.0  2300.0  7500.0   12.0  0.0  0.0    0.0  0.000000  0.000   
...       ...     ...     ...    ...  ...  ...    ...       ...    ...   
45723    66.0   277.0   469.0  149.0  1.0  1.0   64.0  0.944444  0.000   
45724     1.0   140.0  7300.0    0.0  1.0  0.0    0.0  1.000000  0.000   
45725     1.0   431.0  7300.0   59.0  1.0  0.0    0.0  1.000000  1.000   
45726    71.0   279.0  2200.0    0.0  1.0  0.0    1.0  0.944444  0.000   
45727    81.0   577.0  1300.0   62.0  1.0  0.0  449.0  0.000000  0.333   

              erl   erc     lt    ahc     pr     fo        cs     avgtime  \
0       55.119999  2.02  0.000  2.

## Training

In [7]:
# Default tree
X, y = train_df.iloc[:, :-2], train_df.iloc[:, -1]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)
print("Fitting complete.")

Fitting complete.


## Evaluation

In [8]:
X_val, y_val = validation_df.iloc[:, :-2], validation_df.iloc[:, -1]
y_pred = clf.predict(X_val)
print(metrics.classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           0       0.86      0.85      0.85      9738
           1       0.85      0.86      0.86      9860

    accuracy                           0.85     19598
   macro avg       0.85      0.85      0.85     19598
weighted avg       0.85      0.85      0.85     19598



## First experiment with custom features

In [9]:
custom_dataset = csv_importer("./dataset/sources/user_fake_authentic_2class.csv")

custom_fake = custom_dataset[:idx]
custom_correct = custom_dataset[idx:]

random.shuffle(custom_fake)
random.shuffle(custom_correct)

custom_train = custom_fake[:int(len(custom_fake) * (PERCENT_TRAIN / 100))]
custom_train += custom_correct[:int(len(custom_correct) * (PERCENT_TRAIN / 100))]

custom_validation = custom_fake[int(len(custom_fake) * (PERCENT_TRAIN / 100)):]
custom_validation += custom_correct[int(len(custom_correct) * (PERCENT_TRAIN / 100)):]

random.shuffle(custom_train)
random.shuffle(custom_validation)

print("Loading complete.")

train_df = pd.DataFrame.from_dict(train)
validation_df = pd.DataFrame.from_dict(validation)
#print(train_df)
#print(validation_df)

custom_train_df = pd.DataFrame.from_dict(custom_train)
custom_validation_df = pd.DataFrame.from_dict(custom_validation)
#print(custom_train_df)
#print(custom_validation_df)

# Custom tree
cX, cy = custom_train_df.iloc[:,:-2], custom_train_df.iloc[:,-1]
cclf = tree.DecisionTreeClassifier()
cclf = cclf.fit(cX, cy)
print("Fitting complete.")

cX_val, cy_val = custom_validation_df.iloc[:,:-2], validation_df.iloc[:, -1]
cy_pred = cclf.predict(cX_val)

print(metrics.classification_report(cy_val,cy_pred))

Now loading from file ./dataset/sources/user_fake_authentic_2class.csv...
Loaded 65327 entries from source ./dataset/sources/user_fake_authentic_2class.csv
Loading complete.
Fitting complete.
              precision    recall  f1-score   support

           0       0.50      0.49      0.49      9738
           1       0.50      0.51      0.51      9860

    accuracy                           0.50     19598
   macro avg       0.50      0.50      0.50     19598
weighted avg       0.50      0.50      0.50     19598



## Evaluate impact upon removing single-attributes

### nmedia

In [10]:
print(train_df.columns)
#print(train_df)

Index(['nmedia', 'flw', 'flg', 'biol', 'pic', 'url', 'cl', 'cz', 'ni', 'erl',
       'erc', 'lt', 'ahc', 'pr', 'fo', 'cs', 'avgtime', 'fake'],
      dtype='object')


Remove nmedia column from training and validation dataframes

In [14]:
train_drop_nmedia = train_df.drop(['nmedia'], axis=1)
validation_drop_nmedia = validation_df.drop(['nmedia'], axis=1)
#print(train_drop_nmedia)

Training

In [15]:
X, y = train_drop_nmedia.iloc[:, :-2], train_drop_nmedia.iloc[:, -1]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)
print("Fitting complete.")

Fitting complete.


Evaluation

In [16]:
X_val, y_val = validation_drop_nmedia.iloc[:, :-2], validation_drop_nmedia.iloc[:, -1]
y_pred = clf.predict(X_val)
print(metrics.classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           0       0.85      0.84      0.85      9738
           1       0.85      0.86      0.85      9860

    accuracy                           0.85     19598
   macro avg       0.85      0.85      0.85     19598
weighted avg       0.85      0.85      0.85     19598

