In [3]:
import ast
import re
import nltk
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from datetime import datetime
import ast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import pickle

<h1>Import Dataset</h1>

In [6]:
# Root folder holding the source CSVs. File names are appended to it by plain
# string concatenation, so the trailing "/" is significant.
# NOTE(review): both paths are absolute and machine-specific — point them at
# your own copies of the data before running the notebook.
base = "/Users/ivankoh/Library/CloudStorage/OneDrive-NationalUniversityofSingapore/1D/NUS Y3S1/BT4012/Data/"
# Pickle file that is loaded further down into the `has_face_d` mapping.
has_face_pkl_path = "/Users/ivankoh/Documents/GitHub/Fake-Twitter-Account-Detection/data/has_face.pkl"

In [9]:
# Load the combined per-user table (profile fields plus the `tweets_list` corpus).
# `base` already ends with "/", so the file name is appended without a second
# separator — the same convention used when the tweet CSVs are loaded.
df_users = pd.read_csv(base + "combined_twitter_data_with_tweets_corpus.csv")

In [10]:
# List the columns that were loaded.
df_users.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'id', 'name', 'screen_name',
       'statuses_count', 'followers_count', 'friends_count',
       'favourites_count', 'listed_count', 'url', 'lang', 'time_zone',
       'location', 'default_profile', 'default_profile_image', 'geo_enabled',
       'profile_image_url', 'profile_banner_url',
       'profile_use_background_image', 'profile_background_image_url_https',
       'profile_text_color', 'profile_image_url_https',
       'profile_sidebar_border_color', 'profile_background_tile',
       'profile_sidebar_fill_color', 'profile_background_image_url',
       'profile_background_color', 'profile_link_color', 'utc_offset',
       'protected', 'verified', 'description', 'created_at', 'updated',
       'account_type', 'tweets_list'],
      dtype='object')

In [13]:
# Build the tweets table from every per-source CSV (2017 and 2015 folders).
filenames_tweets = [
    "all tweets 2017/tweets_fake_followers.csv",
    "all tweets 2017/tweets_genuine_accounts.csv",
    "all tweets 2017/tweets_social_spambots_1.csv",
    "all tweets 2017/tweets_social_spambots_2.csv",
    "all tweets 2017/tweets_social_spambots_3.csv",
    "all tweets 2017/tweets_traditional_spambots_1.csv",

    "tweets 2015/tweets_E13.csv",
    "tweets 2015/tweets_FSF.csv",
    "tweets 2015/tweets_INT.csv",
    "tweets 2015/tweets_TFP.csv",
    "tweets 2015/tweets_TWT.csv"
]
# Prefix every relative name with the data root. A list (rather than a
# one-shot `map` iterator) keeps the paths available if the cell is inspected.
filenames_tweets = [base + name for name in filenames_tweets]

# Read each file once and concatenate in a single call: concatenating inside
# the loop re-copies the accumulated frame on every iteration.
# The row index is not reset, so index labels repeat across source files.
df_tweets = pd.concat(
    [pd.read_csv(fn, encoding='ISO-8859-1') for fn in filenames_tweets],
    axis=0,
)


  df_tweets = pd.read_csv(fn, encoding='ISO-8859-1')
  df_tweets = pd.concat([df_tweets, pd.read_csv(fn, encoding='ISO-8859-1') ], axis=0)
  df_tweets = pd.concat([df_tweets, pd.read_csv(fn, encoding='ISO-8859-1') ], axis=0)
  df_tweets = pd.concat([df_tweets, pd.read_csv(fn, encoding='ISO-8859-1') ], axis=0)
  df_tweets = pd.concat([df_tweets, pd.read_csv(fn, encoding='ISO-8859-1') ], axis=0)
  df_tweets = pd.concat([df_tweets, pd.read_csv(fn, encoding='ISO-8859-1') ], axis=0)


In [14]:
# Discard tweets with no user_id, then convert the remaining ids with int().
# NOTE(review): presumably done so the ids compare equal to `df_users['id']`
# when they are mapped later — confirm the dtype of that column.
df_tweets = (
    df_tweets
    .dropna(subset=["user_id"])
    .assign(user_id=lambda frame: frame["user_id"].apply(int))
)

## Data Cleaning

### Remove columns that are redundant
A column is treated as redundant for our problem statement when:
- It is metadata
- It has too many unique categories

In [15]:
# Columns dropped from the user table before feature engineering.
remove_list = [
    # leftover index columns from earlier CSV exports
    'Unnamed: 0.1',
    'Unnamed: 0',
    # locale / location fields
    'lang',
    'time_zone',
    'location',
    # profile image URLs and colour settings
    'profile_banner_url',
    'profile_background_image_url_https',
    'profile_text_color',
    'profile_image_url_https',
    'profile_sidebar_border_color',
    'profile_sidebar_fill_color',
    'profile_background_image_url',
    'profile_background_color',
    'profile_link_color',
    # time-related fields
    'utc_offset',
    'created_at',
    'updated',
]

In [16]:
# Remove the redundant columns listed in `remove_list`.
# Raises KeyError if a listed column is absent (e.g. when this cell is re-run).
df_users = df_users.drop(columns=remove_list)

### Replace NaN values with zeros for binary columns

In [17]:
# Binary profile flags: a missing value is replaced with 0.
# Each column is listed exactly once (the original filled
# `default_profile_image` twice).
binary_flag_columns = [
    'default_profile',
    'default_profile_image',
    'geo_enabled',
    'profile_use_background_image',
    'profile_background_tile',
]
for flag_column in binary_flag_columns:
    df_users[flag_column] = df_users[flag_column].fillna(0)

<h2>Train Test Split (85-15)</h2>

In [18]:
# Target variable: account type, framed as binary classification.
# Keep only rows labelled "real" or "fake". The .copy() makes the filtered
# frame independent, so the label re-encoding below does not write into a
# slice of the original frame (avoids SettingWithCopyWarning).
df_users = df_users[df_users['account_type'].isin(["real", "fake"])].copy()
print(df_users['account_type'].value_counts())
# Encode the labels: fake -> 0, real -> 1.
df_users['account_type'] = df_users['account_type'].map({"fake": 0, "real": 1})

# 85/15 split, stratified on the label so both parts keep the class ratio;
# the fixed random_state makes the split repeatable.
train, test = train_test_split(df_users, test_size=0.15, random_state=69, stratify=df_users['account_type'])

fake    8362
real    2751
Name: account_type, dtype: int64


In [19]:
# Report how many rows ended up in each part of the split.
print("train size:", len(train))
print("test size", len(test))

train size: 9446
test size 1667


In [20]:
# Class balance in the training part (0 = fake, 1 = real).
train['account_type'].value_counts()

0    7108
1    2338
Name: account_type, dtype: int64

<h2>Date Formatting</h2>

In [21]:
# takes around 10 min to run
# Parse the raw `timestamp` column; errors='coerce' turns values that cannot
# be parsed into NaT instead of raising.
# NOTE(review): `infer_datetime_format` is deprecated in pandas >= 2.0 —
# check the installed pandas version before removing it.
df_tweets['created_at_formatted'] = pd.to_datetime(df_tweets['timestamp'], infer_datetime_format=True, errors='coerce')

In [22]:
# Calendar date (time of day dropped) of every tweet; used to count the
# distinct days on which a user tweeted. Missing timestamps stay NaT.
df_tweets['created_at_date'] = df_tweets['created_at_formatted'].dt.date

<h2>Tweet features</h2>

In [23]:
def tweet_freq(df_users, df_tweets):
    """Add a `tweet_frequency` column to ``df_users``.

    tweet frequency = number of tweets / number of distinct days on which the
    user tweeted ("active days"). A value of 1 means one tweet per active day;
    a value above 1 means several tweets per active day on average.

    Parameters
    ----------
    df_users : DataFrame with an `id` column. Modified in place.
    df_tweets : DataFrame with `user_id`, `text` and `created_at_date` columns.

    Returns
    -------
    The same ``df_users`` object with the new column. Users that have no rows
    in ``df_tweets`` get 0.
    """
    per_user = df_tweets.groupby(by=["user_id"]).agg(
        tweet_count=('text', 'count'),            # non-null tweets per user
        date_count=('created_at_date', 'nunique'),  # distinct active days
    )

    # One vectorized division for all users instead of a `.loc` lookup per
    # user, which is very slow on a large tweets table.
    tweets_per_active_day = per_user['tweet_count'] / per_user['date_count']

    # Map the per-user value onto the user table; unmatched ids become 0.
    df_users['tweet_frequency'] = df_users['id'].map(tweets_per_active_day.to_dict())
    df_users['tweet_frequency'] = df_users['tweet_frequency'].fillna(0)
    return df_users

def tweet_tags_mention(df_users, df_tweets):
    """Add per-user totals of '#' and '@' characters found in the tweets.

    Both frames are modified in place:
    - ``df_tweets`` has `text` converted to str and gains `number_of_tags` /
      `number_of_mentions` (occurrences of '#' / '@' in each tweet).
    - ``df_users`` gains `number_of_tags` / `number_of_mentions`, the sums of
      those counts over all tweets of the user (0 when the user has no tweets).

    Note that these are totals, not per-tweet averages.

    Returns the same ``df_users`` object.
    """
    def _occurrences(symbol):
        # Build a counter for one literal character.
        def _count(body):
            return body.count(symbol)
        return _count

    # Everything is treated as text so that .count() is always available.
    df_tweets['text'] = df_tweets['text'].apply(str)
    df_tweets['number_of_tags'] = df_tweets['text'].apply(_occurrences("#"))
    df_tweets['number_of_mentions'] = df_tweets['text'].apply(_occurrences("@"))

    # Sum both counters per user in a single groupby pass.
    totals = df_tweets.groupby(by=["user_id"])[['number_of_tags', 'number_of_mentions']].sum()
    tags_by_user = totals['number_of_tags'].to_dict()
    mentions_by_user = totals['number_of_mentions'].to_dict()

    # Attach to the user table; ids without tweets fall back to 0.
    df_users['number_of_tags'] = df_users['id'].map(tags_by_user)
    df_users['number_of_mentions'] = df_users['id'].map(mentions_by_user)
    for column in ('number_of_mentions', 'number_of_tags'):
        df_users[column] = df_users[column].fillna(0)
    return df_users

In [24]:
def is_weekday(dt):
    """Return 0 when ``dt.weekday()`` is above 4 (Saturday/Sunday), else 1."""
    if dt.weekday() > 4:
        return 0
    return 1

def get_weekday(dt):
    """Return the day-of-week index reported by ``dt.weekday()``."""
    day_index = dt.weekday()
    return day_index

def get_weekend_weekday_frequency(df_tweets, df_users):
    """Add weekend and weekday tweet frequencies to ``df_users``.

    For each user, and separately for weekend and weekday tweets, the
    frequency is: number of tweets / number of distinct dates with a tweet.

    Both frames are modified in place:
    - ``df_tweets`` gains a `weekday` flag computed by ``is_weekday`` from
      `created_at_formatted` (0 = weekend, 1 = weekday).
    - ``df_users`` gains `tweet_weekend_frequency` and
      `tweet_weekday_frequency`; users without tweets in a bucket get 0.

    Returns the same ``df_users`` object.
    """
    df_tweets['weekday'] = df_tweets['created_at_formatted'].apply(is_weekday)
    grouped = df_tweets.groupby(by=["user_id", "weekday"]).agg(
        tweet_count=('text', 'count'),
        date_count=('created_at_date', 'nunique'),
    )

    # One vectorized division for every (user, flag) pair instead of two
    # `.loc` lookups per pair on the MultiIndex.
    frequency = grouped['tweet_count'] / grouped['date_count']
    weekday_flag = frequency.index.get_level_values('weekday')

    # The same steps serve both buckets; only the flag and target column differ.
    for flag, column in ((0, 'tweet_weekend_frequency'), (1, 'tweet_weekday_frequency')):
        per_user = frequency[weekday_flag == flag].droplevel('weekday').to_dict()
        df_users[column] = df_users['id'].map(per_user).fillna(0)
    return df_users

<h2>Followers To Following Ratio Features</h2>

In [25]:
def create_followers_following_ratio(df):
    """Add `following_to_followers_ratio` = friends_count / followers_count.

    A high value means the account follows many more users than follow it.
    Special cases of the division:
    - 0 / 0 (NaN) is stored as 0
    - x / 0 (+inf) is stored as 1

    ``df`` is modified in place and returned.
    """
    def _cap_infinite(value):
        # Only positive infinity is replaced; every other value passes through.
        return 1 if value == np.inf else value

    ratio = (df['friends_count'] / df['followers_count']).fillna(0)
    df['following_to_followers_ratio'] = ratio.apply(_cap_infinite)
    return df

<h2>Name Features</h2>

In [26]:
def name_features(df):
    """Add length and special-character features for `name` and `screen_name`.

    New columns (``df`` is modified in place and returned):
    - `username_length`, `screen_name_length`: length of ``str(value)``.
    - `username_spec_char_count`, `screen_name_spec_char_count`: number of
      matches of ``[^A-Za-z0-9]+``, i.e. the number of *runs* of characters
      outside A-Z, a-z, 0-9 (two adjacent special characters count once).
    """
    def _text_length(value):
        return len(str(value))

    def _special_run_count(value):
        return len(re.findall(r'[^A-Za-z0-9]+', str(value)))

    # Lengths first, then special-character counts, to keep the column order.
    df['username_length'] = df['name'].apply(_text_length)
    df['screen_name_length'] = df['screen_name'].apply(_text_length)

    df['username_spec_char_count'] = df['name'].apply(_special_run_count)
    df['screen_name_spec_char_count'] = df['screen_name'].apply(_special_run_count)
    return df

<h2>Has URL Feature</h2>

In [27]:
def has_url_feature(df):
    """Add `has_url`: 1 when `url` holds a value, 0 when it is null.

    ``df`` is modified in place and returned.
    """
    df['has_url'] = df['url'].notnull().astype(int)
    return df

<h2>Text Features</h2>

In [28]:
def clean_texts(df):
    """Add cleaned text columns for the tweet corpus and the profile description.

    New columns (``df`` is modified in place and returned):
    - `tweets_list_processed`: ``str(tweets_list)`` with every "RT" and "@"
      removed, then URLs (``http\\S+``), non-ASCII characters and punctuation.
    - `description_processed`: ``str(description)`` with non-ASCII characters
      and punctuation removed.

    A value whose cleaned text is exactly 'nan' (which is what a missing
    value turns into after ``str()``) is stored as the empty string.
    """
    def _strip_non_ascii_and_punctuation(text):
        # Cleaning steps shared by both corpora, applied in this order.
        text = re.sub(r'[^\x00-\x7f]', "", text)  # non-ASCII characters
        return re.sub(r'[^\w\s]', '', text)       # punctuation

    def _blank_if_nan(text):
        return "" if text == 'nan' else text

    def process_tweets_list(corpus):
        cleaned = []
        for tweet_list in corpus:
            # Plain substring removal: "RT" and "@" are dropped wherever they occur.
            text = str(tweet_list).replace("RT", "").replace("@", "")
            text = re.sub(r'http\S+', "", text)  # URLs
            cleaned.append(_blank_if_nan(_strip_non_ascii_and_punctuation(text)))
        return cleaned

    def process_description(corpus):
        return [
            _blank_if_nan(_strip_non_ascii_and_punctuation(str(row)))
            for row in corpus
        ]

    df["tweets_list_processed"] = process_tweets_list(df["tweets_list"])
    df["description_processed"] = process_description(df["description"])

    return df

In [29]:
class LemmatizeTokenizer(object):
    """Callable tokenizer: split text with ``word_tokenize`` and lemmatize each token.

    Passed as the ``tokenizer`` argument of the TF-IDF vectorizers below.
    """

    def __init__(self):
        # A single WordNet lemmatizer instance is reused for every call.
        self.lemmatizer = WordNetLemmatizer()

    def __call__(self, text):
        """Return the list of lemmatized tokens of ``text``."""
        lemmas = []
        for token in word_tokenize(text):
            lemmas.append(self.lemmatizer.lemmatize(token))
        return lemmas

In [30]:
def generate_nlp_features(df):
    """Fit TF-IDF vectorizers on the cleaned text columns and append the features.

    One vectorizer is fitted on `tweets_list_processed` and one on
    `description_processed`. Their dense outputs are appended as columns
    `tweets_<i>` and `description_<i>`, where ``<i>`` is the column position
    in the TF-IDF matrix (not the term itself).

    Returns
    -------
    (df, tfidf_fit_tweets, tfidf_fit_description)
        ``df`` is a new frame with a fresh 0..n-1 index; the two fitted
        vectorizers are returned so the same vocabulary can be applied to
        other data with ``nlp_transform_test``.
    """
    def _new_vectorizer():
        # Both text columns use the same configuration.
        return TfidfVectorizer(
            tokenizer=LemmatizeTokenizer(),
            lowercase=True,
            analyzer='word',
            ngram_range=(1,3),  # unigrams, bigrams and trigrams
            max_features=100,  # keep only the top terms by corpus frequency
            min_df=10,  # ignore terms found in fewer than 10 documents
            stop_words=stopwords.words('english'),  # English stop words
        )

    def _fit_and_append(frame, text_column, prefix):
        # Fit on one text column, then append its dense TF-IDF matrix.
        corpus = pd.Series(frame[text_column])
        fitted = _new_vectorizer().fit(corpus)
        features = pd.DataFrame(fitted.transform(corpus).toarray())
        features.columns = [prefix + str(position) for position in features.columns]
        combined = pd.concat(
            [frame.reset_index(drop=True), features.reset_index(drop=True)],
            axis=1,
        )
        return combined, fitted

    # tweets first, then description, so the column order stays the same
    df, tfidf_fit_tweets = _fit_and_append(df, "tweets_list_processed", "tweets_")
    df, tfidf_fit_description = _fit_and_append(df, "description_processed", "description_")

    return (df, tfidf_fit_tweets, tfidf_fit_description)

def nlp_transform_test(df, tfidf_fit_tweets, tfidf_fit_description):
    """Append TF-IDF features to ``df`` using already-fitted vectorizers.

    Nothing is fitted here: each vectorizer is only asked to ``transform``
    its text column, so the vocabulary learned elsewhere is reused.
    The output columns are named `tweets_<i>` and `description_<i>` by
    position.

    Returns a new frame with a fresh 0..n-1 index.
    """
    plan = (
        ('tweets_list_processed', tfidf_fit_tweets, "tweets_"),
        ('description_processed', tfidf_fit_description, "description_"),
    )
    for text_column, fitted, prefix in plan:
        features = pd.DataFrame(fitted.transform(df[text_column]).toarray())
        features.columns = [prefix + str(position) for position in features.columns]
        # Both sides get a fresh index so rows line up by position.
        df = pd.concat(
            [df.reset_index(drop=True), features.reset_index(drop=True)],
            axis=1,
        )

    return df

## Has Face Feature (Face Detection with MTCNN)

In [45]:
# Load the precomputed mapping that is attached to the feature frames as
# `has_face` when the features are combined.
# NOTE(review): pickle.load can execute arbitrary code from the file — only
# load a pickle that you produced yourself or otherwise trust.
with open(has_face_pkl_path, 'rb') as pkl_file:
    has_face_d = pickle.load(pkl_file)

# Normalise the keys to int so they can be looked up from the `id` column.
has_face_d = {int(user_id): flag for user_id, flag in has_face_d.items()}

<h2>Combine all Feature Generating Functions</h2>

In [32]:
# Build the training feature frame. The order matters: each step adds columns
# to the frame returned by the previous one, and `clean_texts` must run before
# `generate_nlp_features`, which reads the *_processed columns.
# The functions up to `clean_texts` modify their input in place, so `train`
# itself also receives the new columns.
result = tweet_freq(train, df_tweets)
result = tweet_tags_mention(result, df_tweets)
# note the swapped argument order: (df_tweets, df_users)
result = get_weekend_weekday_frequency(df_tweets, result)
result = create_followers_following_ratio(result)
result = name_features(result)
result = has_url_feature(result)
result = clean_texts(result)
# The TF-IDF vectorizers are fitted on the training data only; the fitted
# objects are kept so the test data can be transformed with the same vocabulary.
result, tfidf_fit_tweets, tfidf_fit_description = generate_nlp_features(result)
# Ids missing from `has_face_d` become NaN (no fill value is applied).
result['has_face'] = result['id'].map(has_face_d)



In [33]:
# Display the engineered training frame.
result

Unnamed: 0,id,name,screen_name,statuses_count,followers_count,friends_count,favourites_count,listed_count,url,default_profile,...,description_90,description_91,description_92,description_93,description_94,description_95,description_96,description_97,description_98,description_99
0,82118682,davide gazzÃ¨,davidegazze,88,19,39,9,0,,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1174756808,Carolee Moberly,MoberlycikCarol,19,7,192,0,0,,1.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,708732794,Julius Kirk,juliuskirkdoq,22,11,241,0,0,,1.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2360642101,Magan Skripko,MaganSkripko,47,6,41,0,0,,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2379608905,Martin Bruley,MartinBruley,34,4,36,0,0,,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9441,2363037596,Ula Banegas,UlaBanegas,53,6,36,0,0,,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9442,1175035327,Jasmine Finkelstein,FinkelsteinupdJ,22,8,194,0,0,,1.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9443,67631743,TMJ-Morocco Jobs,tmj_mar_jobs1,49,582,494,0,53,https://t.co/DByWt45HZj,0.0,...,0.185918,0.192114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9444,2370803990,Christy Schnicke,ChristySchnicke,64,11,43,0,0,,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Apply same feature engineering on Test data

In [34]:
# Same steps as for the training frame, in the same order.
test = tweet_freq(test, df_tweets)
test = tweet_tags_mention(test, df_tweets)
# note the swapped argument order: (df_tweets, df_users)
test = get_weekend_weekday_frequency(df_tweets, test)
test = create_followers_following_ratio(test)
test = name_features(test)
test = has_url_feature(test)
test = clean_texts(test)
# Only transform here: the vectorizers fitted on the training data are reused,
# so nothing is learned from the test rows.
test = nlp_transform_test(test, tfidf_fit_tweets, tfidf_fit_description)
# Ids missing from `has_face_d` become NaN (no fill value is applied).
test['has_face'] = test['id'].map(has_face_d)

In [35]:
# Compare the column sets of the two feature frames: print both column
# counts, then every training column that is absent from the test frame.
# The check is one-directional — columns that exist only in `test` are not
# reported.
print(len(test.columns))
print(len(result.columns))

missing_in_test = [column for column in result.columns if column not in test.columns]
for column in missing_in_test:
    print(column)

234
233


In [36]:
# Summary statistics of one tweet TF-IDF column in the test frame.
test.tweets_99.describe()

count    1667.000000
mean        0.041574
std         0.073182
min         0.000000
25%         0.000000
50%         0.000000
75%         0.057944
max         0.602594
Name: tweets_99, dtype: float64

## Export to CSV

In [37]:
# Write both feature frames without the row index.
# The paths are relative, so the files go to ./data under the current working
# directory (unlike the absolute `base` path used for the inputs).
result.to_csv("data/twitter_data_train.csv", index=False)
test.to_csv("data/twitter_data_test.csv", index=False)

In [38]:
# Look up the row with id == 1 in the training features.
result[result.id==1]

Unnamed: 0,id,name,screen_name,statuses_count,followers_count,friends_count,favourites_count,listed_count,url,default_profile,...,description_90,description_91,description_92,description_93,description_94,description_95,description_96,description_97,description_98,description_99


In [39]:
# Look up the row with id == 1 in the test features.
test[test.id==1]

Unnamed: 0,id,name,screen_name,statuses_count,followers_count,friends_count,favourites_count,listed_count,url,default_profile,...,description_91,description_92,description_93,description_94,description_95,description_96,description_97,description_98,description_99,has_face
