In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from datetime import datetime
import ast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

<h1>Import Dataset</h1>

In [None]:
# Load the combined Twitter CSV into df_users; its account_type column is
# used as the classification label further down.
df_users = pd.read_csv(
    filepath_or_buffer="./data/combined_twitter_data_with_tweets_corpus.csv"
)

In [None]:
# Display the column labels of df_users (bare last expression -> rich cell output).
df_users.columns

In [None]:
# Build df_tweets by stacking every per-tweet CSV listed below.
# The paths are relative to the notebook's working directory.
filenames_tweets = [
    "data/all tweets 2017/tweets_fake_followers.csv",
    "data/all tweets 2017/tweets_genuine_accounts.csv",
    "data/all tweets 2017/tweets_social_spambots_1.csv",
    "data/all tweets 2017/tweets_social_spambots_2.csv",
    "data/all tweets 2017/tweets_social_spambots_3.csv",
    "data/all tweets 2017/tweets_traditional_spambots_1.csv",

    "data/tweets 2015/tweets_E13.csv",
    "data/tweets 2015/tweets_FSF.csv",
    "data/tweets 2015/tweets_INT.csv",
    "data/tweets 2015/tweets_TFP.csv",
    "data/tweets 2015/tweets_TWT.csv"
]

# Read every file first and concatenate once: calling pd.concat inside the
# loop re-copies all previously loaded rows on each iteration (quadratic).
# Row indices are kept as read from each file, so index labels repeat across
# files, exactly as with the incremental concat.
df_tweets = pd.concat(
    [pd.read_csv(fn, encoding='ISO-8859-1') for fn in filenames_tweets],
    axis=0,
)


In [None]:
# Discard tweets that have no user_id, then convert the remaining ids to
# Python ints so they can be matched against integer user ids.
df_tweets = (
    df_tweets
    .dropna(subset=["user_id"])
    .assign(user_id=lambda frame: frame["user_id"].apply(int))
)

<h2>Train Test Split (70-15-15)</h2>

In [None]:
# Target variable: account_type, a binary real-vs-fake classification label.
# Keep only rows labelled "real" or "fake". The .copy() makes df_users an
# independent frame, so the column assignment below does not write into a
# slice of the original (which raises SettingWithCopyWarning).
df_users = df_users[df_users['account_type'].isin(["real", "fake"])].copy()
print(df_users['account_type'].value_counts())

# Encode the label: fake -> 0, real -> 1.
df_users['account_type'] = df_users['account_type'].map({"fake": 0, "real": 1})

# Hold out 15% of the users as the test set, stratified on the label so both
# splits keep the same real/fake ratio. The frame being split is df_users
# (the previous `df` was never defined and raised a NameError).
# NOTE(review): the section heading mentions 70-15-15, but only this single
# 85/15 split is made here - confirm whether a validation split is created elsewhere.
train, test = train_test_split(
    df_users,
    test_size=0.15,
    random_state=69,
    stratify=df_users['account_type'],
)

In [None]:
# Report the number of rows in each split.
print("train size:", train.shape[0])
print("test size", test.shape[0])

In [None]:
# Label distribution within the training split.
train.loc[:, 'account_type'].value_counts()

<h2>Date Formatting</h2>

In [None]:
# takes around 10 min to run
df_users_train['created_at_formatted'] = pd.to_datetime(df_tweets['timestamp'], infer_datetime_format=True, errors='coerce')

In [None]:
df_train['created_at_date'] = df_train['created_at_formatted'].apply(lambda x: x.date())

<h2>Feature Engineering</h2>

## User tweet frequency, tags and mentions

In [None]:
def tweet_freq(df):
    """Add tweet-frequency, hashtag-count and mention-count columns to ``df``.

    ``df`` is expected to hold one row per tweet with the columns
    ``user_id``, ``text`` and ``created_at_date``.

    Columns written (``df`` is modified in place and also returned):

    * ``tweet_frequency`` - per user: number of tweets with non-null text
      divided by the number of distinct ``created_at_date`` values, i.e. how
      many tweets the user posts on the days they tweet at all. 1 means one
      tweet per active day; >1 means several tweets per active day on
      average. Every row of a user carries that user's value. It is NaN when
      the user has no valid date or the row has no ``user_id``.
    * ``number_of_tags`` - number of ``#`` characters in the tweet text.
    * ``number_of_mentions`` - number of ``@`` characters in the tweet text.

    ``text`` is converted to ``str`` in place (missing values become the
    strings ``"nan"`` / ``"None"``), after the tweet count has been taken.

    Parameters
    ----------
    df : pandas.DataFrame
        Tweet-level frame as described above.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the three columns added.
    """
    # Aggregate once per user; 'count' ignores null text, 'nunique' ignores null dates.
    per_user = df.groupby(by=["user_id"]).agg(
        tweet_count=('text', 'count'),
        date_count=('created_at_date', 'nunique'),
    )

    # Mask users with zero valid dates so the division yields NaN, not inf.
    active_days = per_user['date_count'].where(per_user['date_count'] > 0)
    tweets_per_active_day = per_user['tweet_count'] / active_days

    # The aggregate is indexed by user_id, so the lookup must go through the
    # user_id column (mapping through the `id` column never matches the user keys).
    df['tweet_frequency'] = df['user_id'].map(tweets_per_active_day)

    # Per-tweet counts of '#' and '@' characters.
    df['text'] = df['text'].apply(str)  # convert all text to string
    df['number_of_tags'] = df['text'].str.count("#")
    df['number_of_mentions'] = df['text'].str.count("@")

    return df


## Export to CSV

In [None]:
# Write both splits to disk without the DataFrame index column.
train.to_csv(path_or_buf="data/twitter_data_train.csv", index=False)
test.to_csv(path_or_buf="data/twitter_data_test.csv", index=False)