In [1]:
import pandas as pd
import json
import os
import requests

# Data retrieval

## Create dataframes

> One dataframe containing numeric data, and another one with text data

In [2]:
# Column layout of the numeric feature table; 'is_fake' is the label column.
numeric_columns = [
    'following',
    'followers',
    'username_length',
    'full_name_length',
    'description_length',
    'username_has_number',
    'full_name_has_number',
    'description_has_number',
    'is_fake',
]
# Column layout of the raw-text table; it carries the same 'is_fake' label.
text_columns = ['username', 'full_name', 'description', 'is_fake']

# Both frames start empty and are filled by the loading cells further down.
dataset_numeric = pd.DataFrame(columns=numeric_columns)
dataset_text = pd.DataFrame(columns=text_columns)

## Load file & parse json data

#### Process fake account data files

In [3]:
# Parse every fake-account JSON file under data/fake and append one row per
# account to both the numeric and the text dataset.
fake_numeric_rows = []
fake_text_rows = []

# Sort by file name so the row order does not depend on the directory listing order.
for entry in sorted(os.scandir('data/fake'), key=lambda item: item.name):
    if not entry.is_file():
        continue

    # The context manager closes the handle even when parsing fails; the explicit
    # encoding keeps non-ASCII profile text independent of the platform locale.
    with open(entry.path, encoding='utf-8') as fake_data_file:
        user_data = json.load(fake_data_file)['graphql']['user']

    # A null text field is treated as an empty string so len() cannot fail.
    username_text = user_data['username'] or ''
    full_name_text = user_data['full_name'] or ''
    description_text = user_data['biography'] or ''

    fake_numeric_rows.append({
        'following': user_data['edge_follow']['count'],
        'followers': user_data['edge_followed_by']['count'],
        'username_length': len(username_text),
        'full_name_length': len(full_name_text),
        'description_length': len(description_text),
        'username_has_number': any(char.isdigit() for char in username_text),
        'full_name_has_number': any(char.isdigit() for char in full_name_text),
        'description_has_number': any(char.isdigit() for char in description_text),
        'is_fake': True
    })
    fake_text_rows.append({
        'username': username_text,
        'full_name': full_name_text,
        'description': description_text,
        'is_fake': True
    })

# Append all collected rows in one concat instead of re-copying the frame per file.
if fake_numeric_rows:
    dataset_numeric = pd.concat([dataset_numeric, pd.DataFrame(fake_numeric_rows)], ignore_index=True)
    dataset_text = pd.concat([dataset_text, pd.DataFrame(fake_text_rows)], ignore_index=True)

#### Process real account data files

In [4]:
# Parse every real-account JSON file under data/real and append one row per
# account to both the numeric and the text dataset.
real_numeric_rows = []
real_text_rows = []

# Sort by file name so the row order does not depend on the directory listing order.
for entry in sorted(os.scandir('data/real'), key=lambda item: item.name):
    if not entry.is_file():
        continue

    # The context manager closes the handle even when parsing fails; the explicit
    # encoding keeps non-ASCII profile text independent of the platform locale.
    with open(entry.path, encoding='utf-8') as real_data_file:
        real_json = json.load(real_data_file)

    # In this schema 'alias' holds the handle and 'username' the display name.
    username_text = real_json['alias']

    # Missing values become '' on every iteration. Without this reset the text
    # row would silently reuse the previous account's name or description.
    full_name = real_json['username']
    full_name_text = full_name if full_name is not None else ''

    # Only the first element of 'descriptionProfile' is used; presumably it is a
    # list of strings -- TODO confirm. A null or empty value yields ''.
    description = real_json['descriptionProfile']
    description_text = description[0] if description else ''

    real_numeric_rows.append({
        'following': real_json['numberFollowing'],
        'followers': real_json['numberFollowers'],
        'username_length': len(username_text),
        'full_name_length': len(full_name_text),
        'description_length': len(description_text),
        'username_has_number': any(char.isdigit() for char in username_text),
        'full_name_has_number': any(char.isdigit() for char in full_name_text),
        'description_has_number': any(char.isdigit() for char in description_text),
        'is_fake': False
    })
    real_text_rows.append({
        'username': username_text,
        'full_name': full_name_text,
        'description': description_text,
        'is_fake': False
    })

# Append all collected rows in one concat instead of re-copying the frame per file.
if real_numeric_rows:
    dataset_numeric = pd.concat([dataset_numeric, pd.DataFrame(real_numeric_rows)], ignore_index=True)
    dataset_text = pd.concat([dataset_text, pd.DataFrame(real_text_rows)], ignore_index=True)

# Data cleaning

## Min-Max normalization of the numeric dataset

In [6]:
# Min-max scale every feature column to [0, 1].
# Cast first: the frame may hold object or bool columns, and bool arithmetic
# is not defined in numpy. True/False become 1.0/0.0.
numeric_values = dataset_numeric.astype(float)

# The label is already 0.0/1.0 after the cast, so it is left out of the scaling;
# this keeps it meaningful even if only one class is present.
feature_columns = numeric_values.columns.drop('is_fake')
feature_values = numeric_values[feature_columns]
feature_min = feature_values.min()

# A constant column has zero range; dividing by 1 maps it to 0.0 instead of NaN.
feature_range = (feature_values.max() - feature_min).replace(0, 1)

numeric_values[feature_columns] = (feature_values - feature_min) / feature_range
dataset_numeric = numeric_values

## Remove rows with empty descriptions from the text dataset

In [7]:
# Keep only the rows whose description holds at least one character.
description_lengths = dataset_text['description'].str.len()
dataset_text = dataset_text.loc[description_lengths > 0]

## Translate descriptions to english

> Run this cell sparingly: every call to the Translation API is billed against the Google Cloud free-trial credit ($300).

In [8]:
# Translate every description to English with the Google Cloud Translation v2 API.
# The API key is read from the environment so it never lives in the notebook;
# a missing variable raises KeyError before any request is sent.
# Any key that was previously committed in this cell should be treated as leaked and rotated.
translate_api_key = os.environ['GOOGLE_TRANSLATE_API_KEY']
translate_url = 'https://translation.googleapis.com/language/translate/v2'

for index, row in dataset_text.iterrows():
    res = requests.post(
        translate_url,
        params={'key': translate_api_key},  # the key is sent once, as a query parameter
        json={'q': row['description'], 'target': 'en'},
        timeout=30,  # avoid hanging the notebook on a stalled connection
    )
    # Fail loudly on quota/auth/HTTP errors instead of hitting a KeyError on 'data' below.
    res.raise_for_status()

    translated_text = res.json()['data']['translations'][0]['translatedText']
    dataset_text.at[index, 'description'] = translated_text

In [9]:
# Display the text dataset after translation; the rendering of long frames is truncated.
dataset_text

Unnamed: 0,username,full_name,description,is_fake
3,takpar7173,takpar----😎,♥~We are who we are~♥ ♥~We are not horns becau...,True
7,ali_nourifard,Ali noorifard,#كانكور_اسان_است #منوه_تدريس To buy educationa...,True
10,narmak_ma,نارمک ما,The page of the residents and lovers of the ol...,True
11,khajeh1984,Mohammad Khajeh,"Be well away from the filthy world, Look at yo...",True
18,_lli9k,ترف.,"As long as I am drawing, I am alive.",True
...,...,...,...,...
1666,mauricetravelphotos,Travel • Nature • Vacation,👇🏻Travel the world with the cheapest tickets:,False
1667,whereloveisillegal,Where Love Is Illegal,Documenting & sharing LGBTIQ+ stories of survi...,False
1668,aka.the.one,LUNA I'NOOR🌙,Lifestyle VLOGS 🇦🇿🇺🇸 for business inquires lun...,False
1669,songofstyle,Aimee Song,[Ah-Mee] rhymes with Mommy 송아미 New York Times ...,False


## Balance dataset

In [10]:
# Fixed seed so the down-sampling yields the same rows on every run.
BALANCE_SEED = 42


def _balance_by_label(frame, label='is_fake', seed=BALANCE_SEED):
    """Down-sample every class of `label` to the size of the smallest class.

    Parameters:
        frame: DataFrame that contains the `label` column.
        label: name of the column to group by.
        seed: random_state passed to DataFrame.sample for each class.

    Returns:
        The result of groupby(...).apply(...): each class sampled down to the
        smallest class size, with a fresh 0..n-1 index inside each class.
    """
    groups = frame.groupby(label)
    # Computed once up front instead of once per group inside the lambda.
    smallest_class_size = groups.size().min()
    return groups.apply(
        lambda part: part.sample(smallest_class_size, random_state=seed).reset_index(drop=True)
    )


# The two frames are balanced independently of each other.
balanced_dataset_numeric = _balance_by_label(dataset_numeric)
balanced_dataset_text = _balance_by_label(dataset_text)

# Store results

In [11]:
# Write both balanced datasets to CSV, leaving the index out of the files.
for balanced_frame, csv_path in (
    (balanced_dataset_numeric, 'data/dataset_numeric.csv'),
    (balanced_dataset_text, 'data/dataset_text.csv'),
):
    balanced_frame.to_csv(csv_path, index=False)