In [1]:
import pandas as pd
import json
import os
import requests
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
import nltk

# Fetch every NLTK resource the cleaning step relies on, not only WordNet:
# stopwords.words() needs the 'stopwords' corpus and word_tokenize() needs the
# Punkt tokenizer models. Packages that are already up to date are not
# downloaded again.
# NOTE(review): recent NLTK releases load the tokenizer from 'punkt_tab' rather
# than 'punkt'; both are requested here — confirm against the installed version.
for resource in ('wordnet', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package wordnet to /home/roger/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Data retrieval

## Create dataframes

> A single dataframe holding both the numeric features and the text fields; it is split into a numeric dataset and a text dataset after balancing

In [3]:
# Empty frame that collects one row per account. Columns are laid out as:
# numeric/boolean features first, then the raw text fields, then the label.
numeric_feature_columns = [
    'following',
    'followers',
    'username_length',
    'full_name_length',
    'description_length',
    'username_has_number',
    'full_name_has_number',
    'description_has_number',
]
text_field_columns = ['username', 'full_name', 'description']

dataset_mix = pd.DataFrame(
    columns=[*numeric_feature_columns, *text_field_columns, 'is_fake']
)

## Load file & parse json data

#### Process fake account data files

In [4]:
# Build one feature row per JSON file under data/fake and append them to
# dataset_mix. Each file exposes the profile under ['graphql']['user'].
fake_records = []

# Sort the entries by name so the row order does not depend on the order in
# which the file system happens to list the directory.
with os.scandir('data/fake') as fake_entries:
    fake_files = sorted(
        (entry for entry in fake_entries if entry.is_file()),
        key=lambda entry: entry.name,
    )

for fake_entry in fake_files:
    # The context manager closes the handle as soon as the JSON is parsed.
    with open(fake_entry.path, encoding='utf-8') as fake_data_file:
        fake_json = json.load(fake_data_file)
    user_data = fake_json['graphql']['user']

    following = user_data['edge_follow']['count']
    followers = user_data['edge_followed_by']['count']

    username_text = user_data['username']
    full_name_text = user_data['full_name']
    description_text = user_data['biography']

    username_length = len(username_text)
    full_name_length = len(full_name_text)
    description_length = len(description_text)

    username_has_number = any(char.isdigit() for char in username_text)
    full_name_has_number = any(char.isdigit() for char in full_name_text)
    description_has_number = any(char.isdigit() for char in description_text)

    is_fake = True

    fake_records.append({
        'following': following,
        'followers': followers,
        'username_length': username_length,
        'full_name_length': full_name_length,
        'description_length': description_length,
        'username_has_number': username_has_number,
        'full_name_has_number': full_name_has_number,
        'description_has_number': description_has_number,
        'username': username_text,
        'full_name': full_name_text,
        'description': description_text,
        'is_fake': is_fake,
    })

# Append all rows with a single concat instead of re-copying the whole frame
# once per file.
if fake_records:
    dataset_mix = pd.concat([dataset_mix, pd.DataFrame(fake_records)], ignore_index=True)

#### Process real account data files

In [5]:
# Build one feature row per JSON file under data/real and append them to
# dataset_mix. These files use a flat schema: 'alias' is read as the username
# and 'username' as the full name.
real_records = []

# Sort the entries by name so the row order does not depend on the order in
# which the file system happens to list the directory.
with os.scandir('data/real') as real_entries:
    real_files = sorted(
        (entry for entry in real_entries if entry.is_file()),
        key=lambda entry: entry.name,
    )

for real_entry in real_files:
    # The context manager closes the handle as soon as the JSON is parsed.
    with open(real_entry.path, encoding='utf-8') as real_data_file:
        real_json = json.load(real_data_file)

    following = real_json['numberFollowing']
    followers = real_json['numberFollowers']

    username_text = real_json['alias']
    username_length = len(username_text)
    username_has_number = any(char.isdigit() for char in username_text)

    # A missing full name becomes an empty string. Previously the text variable
    # was left untouched in that case, so the row silently inherited the full
    # name of the previously processed account.
    full_name_text = real_json['username'] or ''
    full_name_length = len(full_name_text)
    full_name_has_number = any(char.isdigit() for char in full_name_text)

    # Only the first element of 'descriptionProfile' is used.
    # NOTE(review): presumably a list of strings — confirm against the scraper.
    # A missing (or empty) value becomes an empty string for the same
    # stale-value reason as above; such rows are dropped by the
    # empty-description filter in the cleaning section.
    description = real_json['descriptionProfile']
    description_text = description[0] if description else ''
    description_length = len(description_text)
    description_has_number = any(char.isdigit() for char in description_text)

    is_fake = False

    real_records.append({
        'following': following,
        'followers': followers,
        'username_length': username_length,
        'full_name_length': full_name_length,
        'description_length': description_length,
        'username_has_number': username_has_number,
        'full_name_has_number': full_name_has_number,
        'description_has_number': description_has_number,
        'username': username_text,
        'full_name': full_name_text,
        'description': description_text,
        'is_fake': is_fake,
    })

# Append all rows with a single concat instead of re-copying the whole frame
# once per file.
if real_records:
    dataset_mix = pd.concat([dataset_mix, pd.DataFrame(real_records)], ignore_index=True)

# Data cleaning

## Remove rows with an empty description

In [6]:
# Keep only the accounts whose description contains at least one character.
has_description = dataset_mix['description'].str.len() > 0
dataset_mix = dataset_mix[has_description]

## Translate descriptions to English and clean the text

> Use wisely: every run calls the paid Google Cloud Translation API once per row and consumes credit from the Google Cloud free trial ($300)

In [7]:
# NOTE: dead code kept for reference only. This is the earlier version of the
# translate-and-clean loop that targeted `dataset_text`; the active cell that
# follows runs the same steps on `dataset_mix`.
# NOTE(review): the API key literal has been redacted below. It appeared in
# plain text in this notebook, so it should be treated as compromised and
# revoked.

# lemmatizer = WordNetLemmatizer()

# for index, row in dataset_text.iterrows():
#     body = {
#         'q': row['description'],
#         'target': 'en',
#         'key': '<REDACTED>'
#     }
    
#     res = requests.post('https://translation.googleapis.com/language/translate/v2?key=<REDACTED>', json=body)
#     res_json = json.loads(res.text)
#     translated_text = res_json['data']['translations'][0]['translatedText']

#     cleaned_text = re.sub('[^a-zA-Z0-9\s]', '', translated_text).lower()
#     stop_words = list(set(stopwords.words('english')))
#     word_tokens = word_tokenize(cleaned_text)
#     array_sentence = [w for w in word_tokens if not w in stop_words]
#     lemmatized_tokens = [lemmatizer.lemmatize(word) for word in array_sentence]
#     text = ' '.join(lemmatized_tokens)
#     no_url_text = re.sub(r'http\S+|www.\S+', '', text)

#     dataset_text.at[index, 'description'] = no_url_text

In [8]:
# The API key is read from the environment so it is never stored in the notebook.
# NOTE(review): the key that used to be hardcoded in this notebook is exposed
# and should be revoked.
TRANSLATE_URL = 'https://translation.googleapis.com/language/translate/v2'
TRANSLATE_API_KEY = os.environ.get('GOOGLE_TRANSLATE_API_KEY')
if not TRANSLATE_API_KEY:
    # Fail before the first (billed) request rather than on a 4xx response.
    raise RuntimeError('Set the GOOGLE_TRANSLATE_API_KEY environment variable before running this cell.')

lemmatizer = WordNetLemmatizer()
# Built once instead of once per row; a set gives constant-time membership tests.
stop_words = set(stopwords.words('english'))


def _translate_to_english(text):
    """Return the English translation of `text` from the Cloud Translation v2 API.

    Raises requests.HTTPError when the API answers with an error status, so a
    quota or authentication failure is reported instead of surfacing later as a
    KeyError on the response payload.
    """
    res = requests.post(
        TRANSLATE_URL,
        params={'key': TRANSLATE_API_KEY},
        # 'format': 'text' asks for plain text; the default HTML format returns
        # entities such as &#39;, whose digits would survive the cleaning below.
        json={'q': text, 'target': 'en', 'format': 'text'},
        timeout=30,
    )
    res.raise_for_status()
    return res.json()['data']['translations'][0]['translatedText']


def _clean_description(translated_text):
    """Normalize translated text for modelling.

    Steps, in order: drop every character that is not a letter, digit or
    whitespace; lowercase; tokenize; remove English stop words; lemmatize;
    re-join with single spaces; strip tokens that look like URLs.
    """
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', translated_text).lower()
    word_tokens = word_tokenize(cleaned_text)
    kept_tokens = [w for w in word_tokens if w not in stop_words]
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in kept_tokens]
    text = ' '.join(lemmatized_tokens)
    # Punctuation is already gone at this point, so URLs show up as single
    # tokens starting with 'http' or 'www'.
    return re.sub(r'http\S+|www.\S+', '', text)


for index, row in dataset_mix.iterrows():
    dataset_mix.at[index, 'description'] = _clean_description(
        _translate_to_english(row['description'])
    )

In [9]:
# Discard the rows whose description is two characters long or shorter.
is_long_enough = dataset_mix['description'].str.len() > 2
dataset_mix = dataset_mix[is_long_enough]

## Balance dataset

In [10]:
# Fixed seed so that a fresh "Restart & Run All" draws the same rows every time;
# without it the balanced dataset (and everything saved from it) changes per run.
SAMPLE_SEED = 42

# Downsample every class to the size of the smallest one.
grouped_by_label = dataset_mix.groupby('is_fake')
smallest_class_size = grouped_by_label.size().min()  # computed once, not once per group
balanced_dataset_mix = grouped_by_label.apply(
    lambda group: group.sample(smallest_class_size, random_state=SAMPLE_SEED).reset_index(drop=True)
)

## Split numeric and text datasets

In [11]:
# Columns that make up each of the two derived datasets; the label is kept in both.
numeric_dataset_columns = [
    'following',
    'followers',
    'username_length',
    'full_name_length',
    'description_length',
    'username_has_number',
    'full_name_has_number',
    'description_has_number',
    'is_fake',
]
text_dataset_columns = ['username', 'full_name', 'description', 'is_fake']

dataset_numeric = balanced_dataset_mix[numeric_dataset_columns]
dataset_text = balanced_dataset_mix[text_dataset_columns]

## Min-Max normalization of the numeric dataset

In [12]:
def _min_max_normalize(frame):
    """Rescale every column of `frame` to the [0, 1] range.

    Values are cast to float first so integer and boolean columns are handled
    uniformly. A column whose values are all equal has a zero range; it is
    divided by 1 instead, which maps it to 0.0 rather than dividing by zero.
    """
    values = frame.astype(float)
    column_min = values.min()
    column_range = values.max() - column_min
    column_range = column_range.where(column_range != 0, 1.0)
    return (values - column_min) / column_range


dataset_numeric = _min_max_normalize(dataset_numeric)

In [13]:
# Preview the min-max normalized numeric dataset.
dataset_numeric

Unnamed: 0_level_0,Unnamed: 1_level_0,following,followers,username_length,full_name_length,description_length,username_has_number,full_name_has_number,description_has_number,is_fake
is_fake,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
False,0,0.072647,0.113159,0.26087,0.333333,0.68,0.0,0.0,1.0,0.0
False,1,0.000267,0.550153,0.347826,0.866667,0.306667,0.0,0.0,0.0,0.0
False,2,0.039456,0.040202,0.391304,0.433333,0.22,0.0,0.0,0.0,0.0
False,3,0.07478,0.210061,0.391304,0.4,0.473333,0.0,0.0,0.0,0.0
False,4,0.081445,0.041116,0.565217,0.5,0.34,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
True,233,0.825913,0.000012,0.478261,0.0,0.146667,1.0,0.0,0.0,1.0
True,234,0.144495,0.000005,0.347826,0.0,0.506667,0.0,0.0,0.0,1.0
True,235,0.067982,0.000002,0.304348,0.3,0.206667,1.0,0.0,1.0,1.0
True,236,0.109704,0.000006,0.391304,0.4,0.086667,1.0,1.0,0.0,1.0


# Store results

In [14]:
# Write each dataset to its CSV file, leaving the row index out.
csv_outputs = {
    'data/dataset_mix.csv': balanced_dataset_mix,
    'data/dataset_numeric.csv': dataset_numeric,
    'data/dataset_text.csv': dataset_text,
}
for csv_path, frame in csv_outputs.items():
    frame.to_csv(csv_path, index=False)