# Instagram Fake Account Detection

## Import libraries

In [1]:
from dataset.normalizer import csv_importer, csv_importer_full
import csv
import random
import pandas as pd
from sklearn import tree, metrics

## Data pre-processing

Define function to find demarcator

In [2]:
def find_demarcator(dataset):
    """
    Restituisce l'indice del primo elemento non fake
    :param dataset: il dataset
    :return: l'indice
    """
    idx = 0
    for elem in dataset:
        if elem['fake'] == 1:
            idx += 1
        else:
            break
    return idx

Set train:test ratio

In [3]:
PERCENT_TRAIN = 70

Import dataset

In [4]:
default_dataset = csv_importer_full("./dataset/sources/user_fake_authentic_2class.csv")

Now loading from file ./dataset/sources/user_fake_authentic_2class.csv...
Loaded 65327 entries from source ./dataset/sources/user_fake_authentic_2class.csv


Split dataset into (balanced) training and validation sets

In [5]:
print(f"Now splitting dataset with ratio {PERCENT_TRAIN}:{100 - PERCENT_TRAIN}")

# Find demarcator (in the original datasets all fake accounts are at the beginning)
idx = find_demarcator(default_dataset)
# Separate fakes from real accounts
fake = default_dataset[:idx]
correct = default_dataset[idx:]
# Shuffle both datatets (otherwise, train and validation sets would always contain the same elements)
random.shuffle(fake)
random.shuffle(correct)
# Create training set
train = fake[:int(len(fake) * (PERCENT_TRAIN / 100))]
train += correct[:int(len(correct) * (PERCENT_TRAIN / 100))]
# Create validation set
validation = fake[int(len(fake) * (PERCENT_TRAIN / 100)):]
validation += correct[int(len(correct) * (PERCENT_TRAIN / 100)):]
# Shuffle both datasets
random.shuffle(train)
random.shuffle(validation)

print("Loading complete.")

Now splitting dataset with ratio 70:30
Loading complete.


Cast to pandas dataframes

In [6]:
train_df = pd.DataFrame.from_dict(train)
validation_df = pd.DataFrame.from_dict(validation)
print(train_df)
print(validation_df)

       nmedia     flw     flg   biol  pic  url     cl        cz     ni  \
0         7.0    92.0  6700.0  136.0  1.0  0.0   31.0  0.000000  0.000   
1        13.0   365.0  4100.0    0.0  1.0  0.0    4.0  0.538462  0.231   
2         2.0    11.0   125.0    0.0  0.0  0.0    0.0  1.000000  0.500   
3        72.0   311.0  7400.0    0.0  1.0  0.0    1.0  0.944444  0.000   
4         0.0  2300.0  7500.0   12.0  0.0  0.0    0.0  0.000000  0.000   
...       ...     ...     ...    ...  ...  ...    ...       ...    ...   
45723    66.0   277.0   469.0  149.0  1.0  1.0   64.0  0.944444  0.000   
45724     1.0   140.0  7300.0    0.0  1.0  0.0    0.0  1.000000  0.000   
45725     1.0   431.0  7300.0   59.0  1.0  0.0    0.0  1.000000  1.000   
45726    71.0   279.0  2200.0    0.0  1.0  0.0    1.0  0.944444  0.000   
45727    81.0   577.0  1300.0   62.0  1.0  0.0  449.0  0.000000  0.333   

              erl   erc     lt    ahc     pr     fo        cs     avgtime  \
0       55.119999  2.02  0.000  2.

## Training

In [7]:
# Default tree
X, y = train_df.iloc[:, :-2], train_df.iloc[:, -1]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)
print("Fitting complete.")

Fitting complete.


## Evaluation

In [8]:
X_val, y_val = validation_df.iloc[:, :-2], validation_df.iloc[:, -1]
y_pred = clf.predict(X_val)
print(metrics.classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           0       0.86      0.85      0.85      9738
           1       0.85      0.86      0.86      9860

    accuracy                           0.85     19598
   macro avg       0.85      0.85      0.85     19598
weighted avg       0.85      0.85      0.85     19598



## First experiment with custom features

In [9]:
custom_dataset = csv_importer("./dataset/sources/user_fake_authentic_2class.csv")

custom_fake = custom_dataset[:idx]
custom_correct = custom_dataset[idx:]

random.shuffle(custom_fake)
random.shuffle(custom_correct)

custom_train = custom_fake[:int(len(custom_fake) * (PERCENT_TRAIN / 100))]
custom_train += custom_correct[:int(len(custom_correct) * (PERCENT_TRAIN / 100))]

custom_validation = custom_fake[int(len(custom_fake) * (PERCENT_TRAIN / 100)):]
custom_validation += custom_correct[int(len(custom_correct) * (PERCENT_TRAIN / 100)):]

random.shuffle(custom_train)
random.shuffle(custom_validation)

print("Loading complete.")

train_df = pd.DataFrame.from_dict(train)
validation_df = pd.DataFrame.from_dict(validation)
#print(train_df)
#print(validation_df)

custom_train_df = pd.DataFrame.from_dict(custom_train)
custom_validation_df = pd.DataFrame.from_dict(custom_validation)
#print(custom_train_df)
#print(custom_validation_df)

# Custom tree
cX, cy = custom_train_df.iloc[:,:-2], custom_train_df.iloc[:,-1]
cclf = tree.DecisionTreeClassifier()
cclf = cclf.fit(cX, cy)
print("Fitting complete.")

cX_val, cy_val = custom_validation_df.iloc[:,:-2], validation_df.iloc[:, -1]
cy_pred = cclf.predict(cX_val)

print(metrics.classification_report(cy_val,cy_pred))

Now loading from file ./dataset/sources/user_fake_authentic_2class.csv...
Loaded 65327 entries from source ./dataset/sources/user_fake_authentic_2class.csv
Loading complete.
Fitting complete.
              precision    recall  f1-score   support

           0       0.50      0.49      0.49      9738
           1       0.50      0.51      0.51      9860

    accuracy                           0.50     19598
   macro avg       0.50      0.50      0.50     19598
weighted avg       0.50      0.50      0.50     19598



## Evaluate impact upon removing single-attributes

### nmedia

In [10]:
print(train_df.columns)
#print(train_df)

Index(['nmedia', 'flw', 'flg', 'biol', 'pic', 'url', 'cl', 'cz', 'ni', 'erl',
       'erc', 'lt', 'ahc', 'pr', 'fo', 'cs', 'avgtime', 'fake'],
      dtype='object')


Remove nmedia column from training and validation dataframes

In [14]:
train_drop_nmedia = train_df.drop(['nmedia'], axis=1)
validation_drop_nmedia = validation_df.drop(['nmedia'], axis=1)
#print(train_drop_nmedia)

Training

In [15]:
X, y = train_drop_nmedia.iloc[:, :-2], train_drop_nmedia.iloc[:, -1]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)
print("Fitting complete.")

Fitting complete.


Evaluation

In [16]:
X_val, y_val = validation_drop_nmedia.iloc[:, :-2], validation_drop_nmedia.iloc[:, -1]
y_pred = clf.predict(X_val)
print(metrics.classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           0       0.85      0.84      0.85      9738
           1       0.85      0.86      0.85      9860

    accuracy                           0.85     19598
   macro avg       0.85      0.85      0.85     19598
weighted avg       0.85      0.85      0.85     19598



CONCLUSION: removing nmedia have a very small (bad) impact on performance

### flw

In [None]:
print(train_df.columns)

Index(['nmedia', 'flw', 'flg', 'biol', 'pic', 'url', 'cl', 'cz', 'ni', 'erl',
       'erc', 'lt', 'ahc', 'pr', 'fo', 'cs', 'avgtime', 'fake'],
      dtype='object')


Remove flw column from training and validation dataframes

In [17]:
train_drop = train_df.drop(['flw'], axis=1)
validation_drop = validation_df.drop(['flw'], axis=1)

Training

In [18]:
X, y = train_drop.iloc[:, :-2], train_drop.iloc[:, -1]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)
print("Fitting complete.")

Fitting complete.


Evaluation

In [20]:
X_val, y_val = validation_drop.iloc[:, :-2], validation_drop.iloc[:, -1]
y_pred = clf.predict(X_val)
print(metrics.classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           0       0.84      0.83      0.84      9738
           1       0.83      0.85      0.84      9860

    accuracy                           0.84     19598
   macro avg       0.84      0.84      0.84     19598
weighted avg       0.84      0.84      0.84     19598



CONCLUSION: removing flw has a (bad) impact on performace

### flg

In [None]:
print(train_df.columns)

Index(['nmedia', 'flw', 'flg', 'biol', 'pic', 'url', 'cl', 'cz', 'ni', 'erl',
       'erc', 'lt', 'ahc', 'pr', 'fo', 'cs', 'avgtime', 'fake'],
      dtype='object')


Remove flw column from training and validation dataframes

In [21]:
train_drop = train_df.drop(['flg'], axis=1)
validation_drop = validation_df.drop(['flg'], axis=1)

Training

In [22]:
X, y = train_drop.iloc[:, :-2], train_drop.iloc[:, -1]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)
print("Fitting complete.")

Fitting complete.


Evaluation

In [23]:
X_val, y_val = validation_drop.iloc[:, :-2], validation_drop.iloc[:, -1]
y_pred = clf.predict(X_val)
print(metrics.classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           0       0.80      0.79      0.80      9738
           1       0.80      0.81      0.80      9860

    accuracy                           0.80     19598
   macro avg       0.80      0.80      0.80     19598
weighted avg       0.80      0.80      0.80     19598



CONCLUSION: removing flg has a big (bad) impact on performace

### biol

In [None]:
print(train_df.columns)

Index(['nmedia', 'flw', 'flg', 'biol', 'pic', 'url', 'cl', 'cz', 'ni', 'erl',
       'erc', 'lt', 'ahc', 'pr', 'fo', 'cs', 'avgtime', 'fake'],
      dtype='object')


Remove flw column from training and validation dataframes

In [24]:
train_drop = train_df.drop(['biol'], axis=1)
validation_drop = validation_df.drop(['biol'], axis=1)

Training

In [25]:
X, y = train_drop.iloc[:, :-2], train_drop.iloc[:, -1]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)
print("Fitting complete.")

Fitting complete.


Evaluation

In [26]:
X_val, y_val = validation_drop.iloc[:, :-2], validation_drop.iloc[:, -1]
y_pred = clf.predict(X_val)
print(metrics.classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           0       0.85      0.84      0.85      9738
           1       0.85      0.86      0.85      9860

    accuracy                           0.85     19598
   macro avg       0.85      0.85      0.85     19598
weighted avg       0.85      0.85      0.85     19598



CONCLUSION: removing biol has a small impact on performace (better f1-score, worse recall)

### pic

In [None]:
print(train_df.columns)

Index(['nmedia', 'flw', 'flg', 'biol', 'pic', 'url', 'cl', 'cz', 'ni', 'erl',
       'erc', 'lt', 'ahc', 'pr', 'fo', 'cs', 'avgtime', 'fake'],
      dtype='object')


Remove flw column from training and validation dataframes

In [27]:
train_drop = train_df.drop(['pic'], axis=1)
validation_drop = validation_df.drop(['pic'], axis=1)

Training

In [28]:
X, y = train_drop.iloc[:, :-2], train_drop.iloc[:, -1]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)
print("Fitting complete.")

Fitting complete.


Evaluation

In [29]:
X_val, y_val = validation_drop.iloc[:, :-2], validation_drop.iloc[:, -1]
y_pred = clf.predict(X_val)
print(metrics.classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           0       0.86      0.85      0.86      9738
           1       0.85      0.86      0.86      9860

    accuracy                           0.86     19598
   macro avg       0.86      0.86      0.86     19598
weighted avg       0.86      0.86      0.86     19598



CONCLUSION: removing pic has a small (positive) impact on performace

### url

In [None]:
print(train_df.columns)

Index(['nmedia', 'flw', 'flg', 'biol', 'pic', 'url', 'cl', 'cz', 'ni', 'erl',
       'erc', 'lt', 'ahc', 'pr', 'fo', 'cs', 'avgtime', 'fake'],
      dtype='object')


Remove flw column from training and validation dataframes

In [30]:
train_drop = train_df.drop(['url'], axis=1)
validation_drop = validation_df.drop(['url'], axis=1)

Training

In [31]:
X, y = train_drop.iloc[:, :-2], train_drop.iloc[:, -1]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)
print("Fitting complete.")

Fitting complete.


Evaluation

In [32]:
X_val, y_val = validation_drop.iloc[:, :-2], validation_drop.iloc[:, -1]
y_pred = clf.predict(X_val)
print(metrics.classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           0       0.81      0.81      0.81      9738
           1       0.81      0.81      0.81      9860

    accuracy                           0.81     19598
   macro avg       0.81      0.81      0.81     19598
weighted avg       0.81      0.81      0.81     19598



CONCLUSION: removing url has a big (bad) impact on performace

### cl

In [None]:
print(train_df.columns)

Index(['nmedia', 'flw', 'flg', 'biol', 'pic', 'url', 'cl', 'cz', 'ni', 'erl',
       'erc', 'lt', 'ahc', 'pr', 'fo', 'cs', 'avgtime', 'fake'],
      dtype='object')


Remove flw column from training and validation dataframes

In [33]:
train_drop = train_df.drop(['cl'], axis=1)
validation_drop = validation_df.drop(['cl'], axis=1)

Training

In [34]:
X, y = train_drop.iloc[:, :-2], train_drop.iloc[:, -1]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)
print("Fitting complete.")

Fitting complete.


Evaluation

In [35]:
X_val, y_val = validation_drop.iloc[:, :-2], validation_drop.iloc[:, -1]
y_pred = clf.predict(X_val)
print(metrics.classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           0       0.86      0.85      0.85      9738
           1       0.85      0.86      0.86      9860

    accuracy                           0.86     19598
   macro avg       0.86      0.86      0.86     19598
weighted avg       0.86      0.86      0.86     19598



CONCLUSION: removing cl has a small (positive) impact on performace

### cz

In [None]:
print(train_df.columns)

Index(['nmedia', 'flw', 'flg', 'biol', 'pic', 'url', 'cl', 'cz', 'ni', 'erl',
       'erc', 'lt', 'ahc', 'pr', 'fo', 'cs', 'avgtime', 'fake'],
      dtype='object')


Remove flw column from training and validation dataframes

In [36]:
train_drop = train_df.drop(['cz'], axis=1)
validation_drop = validation_df.drop(['cz'], axis=1)

Training

In [37]:
X, y = train_drop.iloc[:, :-2], train_drop.iloc[:, -1]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)
print("Fitting complete.")

Fitting complete.


Evaluation

In [38]:
X_val, y_val = validation_drop.iloc[:, :-2], validation_drop.iloc[:, -1]
y_pred = clf.predict(X_val)
print(metrics.classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           0       0.86      0.84      0.85      9738
           1       0.85      0.86      0.86      9860

    accuracy                           0.85     19598
   macro avg       0.85      0.85      0.85     19598
weighted avg       0.85      0.85      0.85     19598



CONCLUSION: removing cz has a small (bad) impact on performace

### ni

In [None]:
print(train_df.columns)

Index(['nmedia', 'flw', 'flg', 'biol', 'pic', 'url', 'cl', 'cz', 'ni', 'erl',
       'erc', 'lt', 'ahc', 'pr', 'fo', 'cs', 'avgtime', 'fake'],
      dtype='object')


Remove flw column from training and validation dataframes

In [39]:
train_drop = train_df.drop(['ni'], axis=1)
validation_drop = validation_df.drop(['ni'], axis=1)

Training

In [40]:
X, y = train_drop.iloc[:, :-2], train_drop.iloc[:, -1]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)
print("Fitting complete.")

Fitting complete.


Evaluation

In [41]:
X_val, y_val = validation_drop.iloc[:, :-2], validation_drop.iloc[:, -1]
y_pred = clf.predict(X_val)
print(metrics.classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           0       0.86      0.85      0.86      9738
           1       0.86      0.86      0.86      9860

    accuracy                           0.86     19598
   macro avg       0.86      0.86      0.86     19598
weighted avg       0.86      0.86      0.86     19598



CONCLUSION: removing ni has a small (positive) impact on performace

### erl

In [None]:
print(train_df.columns)

Index(['nmedia', 'flw', 'flg', 'biol', 'pic', 'url', 'cl', 'cz', 'ni', 'erl',
       'erc', 'lt', 'ahc', 'pr', 'fo', 'cs', 'avgtime', 'fake'],
      dtype='object')


Remove flw column from training and validation dataframes

In [42]:
train_drop = train_df.drop(['erl'], axis=1)
validation_drop = validation_df.drop(['erl'], axis=1)

Training

In [43]:
X, y = train_drop.iloc[:, :-2], train_drop.iloc[:, -1]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)
print("Fitting complete.")

Fitting complete.


Evaluation

In [44]:
X_val, y_val = validation_drop.iloc[:, :-2], validation_drop.iloc[:, -1]
y_pred = clf.predict(X_val)
print(metrics.classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           0       0.84      0.84      0.84      9738
           1       0.84      0.85      0.84      9860

    accuracy                           0.84     19598
   macro avg       0.84      0.84      0.84     19598
weighted avg       0.84      0.84      0.84     19598



CONCLUSION: removing erl has a (bad) impact on performace

### erc

In [None]:
print(train_df.columns)

Index(['nmedia', 'flw', 'flg', 'biol', 'pic', 'url', 'cl', 'cz', 'ni', 'erl',
       'erc', 'lt', 'ahc', 'pr', 'fo', 'cs', 'avgtime', 'fake'],
      dtype='object')


Remove flw column from training and validation dataframes

In [45]:
train_drop = train_df.drop(['erc'], axis=1)
validation_drop = validation_df.drop(['erc'], axis=1)

Training

In [46]:
X, y = train_drop.iloc[:, :-2], train_drop.iloc[:, -1]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)
print("Fitting complete.")

Fitting complete.


Evaluation

In [47]:
X_val, y_val = validation_drop.iloc[:, :-2], validation_drop.iloc[:, -1]
y_pred = clf.predict(X_val)
print(metrics.classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           0       0.83      0.82      0.82      9738
           1       0.82      0.84      0.83      9860

    accuracy                           0.83     19598
   macro avg       0.83      0.83      0.83     19598
weighted avg       0.83      0.83      0.83     19598



CONCLUSION: removing erc has a (bad) impact on performace

### lt

In [None]:
print(train_df.columns)

Index(['nmedia', 'flw', 'flg', 'biol', 'pic', 'url', 'cl', 'cz', 'ni', 'erl',
       'erc', 'lt', 'ahc', 'pr', 'fo', 'cs', 'avgtime', 'fake'],
      dtype='object')


Remove flw column from training and validation dataframes

In [48]:
train_drop = train_df.drop(['lt'], axis=1)
validation_drop = validation_df.drop(['lt'], axis=1)

Training

In [49]:
X, y = train_drop.iloc[:, :-2], train_drop.iloc[:, -1]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)
print("Fitting complete.")

Fitting complete.


Evaluation

In [50]:
X_val, y_val = validation_drop.iloc[:, :-2], validation_drop.iloc[:, -1]
y_pred = clf.predict(X_val)
print(metrics.classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           0       0.86      0.86      0.86      9738
           1       0.86      0.86      0.86      9860

    accuracy                           0.86     19598
   macro avg       0.86      0.86      0.86     19598
weighted avg       0.86      0.86      0.86     19598



CONCLUSION: removing lt has a (positive) impact on performace

### ahc

In [None]:
print(train_df.columns)

Index(['nmedia', 'flw', 'flg', 'biol', 'pic', 'url', 'cl', 'cz', 'ni', 'erl',
       'erc', 'lt', 'ahc', 'pr', 'fo', 'cs', 'avgtime', 'fake'],
      dtype='object')


Remove flw column from training and validation dataframes

In [51]:
train_drop = train_df.drop(['ahc'], axis=1)
validation_drop = validation_df.drop(['ahc'], axis=1)

Training

In [52]:
X, y = train_drop.iloc[:, :-2], train_drop.iloc[:, -1]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)
print("Fitting complete.")

Fitting complete.


Evaluation

In [53]:
X_val, y_val = validation_drop.iloc[:, :-2], validation_drop.iloc[:, -1]
y_pred = clf.predict(X_val)
print(metrics.classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           0       0.86      0.85      0.86      9738
           1       0.85      0.87      0.86      9860

    accuracy                           0.86     19598
   macro avg       0.86      0.86      0.86     19598
weighted avg       0.86      0.86      0.86     19598



CONCLUSION: removing ahc has a (positive) impact on performace

### pr

In [None]:
print(train_df.columns)

Index(['nmedia', 'flw', 'flg', 'biol', 'pic', 'url', 'cl', 'cz', 'ni', 'erl',
       'erc', 'lt', 'ahc', 'pr', 'fo', 'cs', 'avgtime', 'fake'],
      dtype='object')


Remove flw column from training and validation dataframes

In [54]:
train_drop = train_df.drop(['pr'], axis=1)
validation_drop = validation_df.drop(['pr'], axis=1)

Training

In [55]:
X, y = train_drop.iloc[:, :-2], train_drop.iloc[:, -1]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)
print("Fitting complete.")

Fitting complete.


Evaluation

In [56]:
X_val, y_val = validation_drop.iloc[:, :-2], validation_drop.iloc[:, -1]
y_pred = clf.predict(X_val)
print(metrics.classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           0       0.86      0.84      0.85      9738
           1       0.85      0.86      0.86      9860

    accuracy                           0.85     19598
   macro avg       0.85      0.85      0.85     19598
weighted avg       0.85      0.85      0.85     19598



CONCLUSION: removing pr has a small (bad) impact on performace

### fo

In [None]:
print(train_df.columns)

Index(['nmedia', 'flw', 'flg', 'biol', 'pic', 'url', 'cl', 'cz', 'ni', 'erl',
       'erc', 'lt', 'ahc', 'pr', 'fo', 'cs', 'avgtime', 'fake'],
      dtype='object')


Remove flw column from training and validation dataframes

In [57]:
train_drop = train_df.drop(['fo'], axis=1)
validation_drop = validation_df.drop(['fo'], axis=1)

Training

In [58]:
X, y = train_drop.iloc[:, :-2], train_drop.iloc[:, -1]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)
print("Fitting complete.")

Fitting complete.


Evaluation

In [59]:
X_val, y_val = validation_drop.iloc[:, :-2], validation_drop.iloc[:, -1]
y_pred = clf.predict(X_val)
print(metrics.classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           0       0.86      0.85      0.85      9738
           1       0.85      0.86      0.86      9860

    accuracy                           0.85     19598
   macro avg       0.85      0.85      0.85     19598
weighted avg       0.85      0.85      0.85     19598



CONCLUSION: removing fo has no impact on performace

### cs

In [None]:
print(train_df.columns)

Index(['nmedia', 'flw', 'flg', 'biol', 'pic', 'url', 'cl', 'cz', 'ni', 'erl',
       'erc', 'lt', 'ahc', 'pr', 'fo', 'cs', 'avgtime', 'fake'],
      dtype='object')


Remove flw column from training and validation dataframes

In [60]:
train_drop = train_df.drop(['cs'], axis=1)
validation_drop = validation_df.drop(['cs'], axis=1)

Training

In [61]:
X, y = train_drop.iloc[:, :-2], train_drop.iloc[:, -1]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)
print("Fitting complete.")

Fitting complete.


Evaluation

In [62]:
X_val, y_val = validation_drop.iloc[:, :-2], validation_drop.iloc[:, -1]
y_pred = clf.predict(X_val)
print(metrics.classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           0       0.86      0.84      0.85      9738
           1       0.85      0.86      0.86      9860

    accuracy                           0.85     19598
   macro avg       0.85      0.85      0.85     19598
weighted avg       0.85      0.85      0.85     19598



CONCLUSION: removing cs has a small (bad) impact on performace

### avgtime

In [None]:
print(train_df.columns)
#print(train_df)

Index(['nmedia', 'flw', 'flg', 'biol', 'pic', 'url', 'cl', 'cz', 'ni', 'erl',
       'erc', 'lt', 'ahc', 'pr', 'fo', 'cs', 'avgtime', 'fake'],
      dtype='object')


Remove flw column from training and validation dataframes

In [63]:
train_drop = train_df.drop(['avgtime'], axis=1)
validation_drop = validation_df.drop(['avgtime'], axis=1)

Training

In [64]:
X, y = train_drop.iloc[:, :-2], train_drop.iloc[:, -1]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)
print("Fitting complete.")

Fitting complete.


Evaluation

In [65]:
X_val, y_val = validation_drop.iloc[:, :-2], validation_drop.iloc[:, -1]
y_pred = clf.predict(X_val)
print(metrics.classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           0       0.86      0.84      0.85      9738
           1       0.85      0.86      0.85      9860

    accuracy                           0.85     19598
   macro avg       0.85      0.85      0.85     19598
weighted avg       0.85      0.85      0.85     19598



CONCLUSION: removing flw has a small (bad) impact on performace

## Second experiment with custom features

In this experiment we fit a Decision Tree Classifier removing from dataframes the attributes which seemed to worsen performance during the experiments

In [68]:
custom_train_df = train_df.drop(['pic','cl','ni','lt','ahc'], axis=1)
custom_validation_df = validation_df.drop(['pic','cl','ni','lt','ahc'], axis=1)

cX, cy = custom_train_df.iloc[:,:-2], custom_train_df.iloc[:,-1]
cclf = tree.DecisionTreeClassifier()
cclf = cclf.fit(cX, cy)
print("Fitting complete.")

cX_val, cy_val = custom_validation_df.iloc[:,:-2], validation_df.iloc[:, -1]
cy_pred = cclf.predict(cX_val)

print(metrics.classification_report(cy_val,cy_pred))

Fitting complete.
              precision    recall  f1-score   support

           0       0.86      0.85      0.86      9738
           1       0.85      0.87      0.86      9860

    accuracy                           0.86     19598
   macro avg       0.86      0.86      0.86     19598
weighted avg       0.86      0.86      0.86     19598



CONCLUSION: removing pic, cl, ni, lt and ahc columns improved performances !