In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import OrderedDict

In [11]:
from datetime import datetime
from datetime import date

In [3]:
# Load the raw export; the file is decoded as ISO-8859-1 (Latin-1).
raw_df = pd.read_csv(
    './data/twitter_normal_user_data.csv',
    encoding="ISO-8859-1",
)

# Scheme 1:

In [43]:
# Work on a copy so that `raw_df` keeps the data exactly as loaded.
df1 = raw_df.copy()

In [84]:
# NOTE: the captured output below comes from execution count 84, i.e. after the
# later processing cells had run, so it already lists derived columns such as
# `account_uptime` and `tweets_per_day`. A top-to-bottom re-run will show the
# raw columns here instead.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   gender:confidence      20000 non-null  float64
 1   profile_yn             20000 non-null  int64  
 2   profile_yn:confidence  20000 non-null  float64
 3   description            20000 non-null  object 
 4   fav_number             20000 non-null  int64  
 5   link_color             20000 non-null  object 
 6   name                   20000 non-null  object 
 7   retweet_count          20000 non-null  int64  
 8   sidebar_color          20000 non-null  object 
 9   text                   20000 non-null  object 
 10  tweet_count            20000 non-null  int64  
 11  account_uptime         20000 non-null  int64  
 12  tweets_per_day         20000 non-null  float64
 13  retweets_per_day       20000 non-null  float64
 14  has_description        20000 non-null  int64  
 15  ha

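- If `gender` is not recorded, fill the value with `unknown`.
- In that case (`gender` not recorded), `gender:confidence` is taken to be 0, so missing confidences are filled with 0.
- Mark NULL values in the text columns (`description`, `tweet_coord`, `tweet_location`, `user_timezone`) with an empty string.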

In [45]:
# Assign the filled column back instead of calling `fillna(..., inplace=True)`
# on the column selection: the in-place form mutates an intermediate object,
# emits a FutureWarning on recent pandas 2.x and leaves `df1` unchanged once
# Copy-on-Write is enabled.
df1['gender:confidence'] = df1['gender:confidence'].fillna(0)

In [46]:
# Replace missing values column by column with a vectorised `fillna` rather
# than a per-element lambda: a missing `gender` becomes the explicit category
# 'unknown', and missing free-text fields become empty strings. Columns that
# are not listed here are left untouched.
df1 = df1.fillna(value={
    'gender': 'unknown',
    'description': '',
    'tweet_coord': '',
    'tweet_location': '',
    'user_timezone': '',
})

In [49]:
def split_date(ddata):
    """Flatten a 'M/D/YY H:MM' timestamp into space-separated fields.

    Parameters
    ----------
    ddata : str
        A date and a time separated by whitespace, e.g. ``'10/26/15 23:24'``.
        The date part is split on '/', the time part on ':'.

    Returns
    -------
    str
        ``'M D YYYY H MM SS'``. A two-digit year is expanded into the 2000s;
        a year that is already >= 100 is kept as is. When the time part has
        no seconds field, ``'00'`` is appended as a placeholder.

    Raises
    ------
    IndexError
        If `ddata` lacks a time part or the date has fewer than three fields.
    ValueError
        If the year field is not an integer.
    """
    tokens = ddata.split()
    date_fields = tokens[0].split('/')
    time_fields = tokens[1].split(':')

    # Only two-digit years need the century added; blindly adding 2000 would
    # turn an already complete year such as 2015 into 4015.
    year = int(date_fields[2])
    if year < 100:
        year += 2000
    date_fields[2] = str(year)

    # Pad the seconds only when they are actually missing, so the result
    # always has the same number of fields.
    if len(time_fields) < 3:
        time_fields.append("00")

    return ' '.join(date_fields + time_fields)


In [50]:
# Rewrite each timestamp column into the space-separated form produced by
# `split_date`; the columns keep their names and positions.
df1 = df1.assign(**{
    stamp_col: df1[stamp_col].apply(split_date)
    for stamp_col in ('_last_judgment_at', 'created', 'tweet_created')
})

In [85]:
# NOTE: the captured output below comes from execution count 85 and already
# contains columns derived in later cells (e.g. `gender_brand`, `has_coord`).
df1.head()

Unnamed: 0,gender:confidence,profile_yn,profile_yn:confidence,description,fav_number,link_color,name,retweet_count,sidebar_color,text,...,retweets_per_day,has_description,has_coord,has_location,has_timezone,favnum_per_day,gender_brand,gender_unknown,gender_male,gender_female
0,1.0,1,1.0,i sing my own rhythm.,0,08C2C2,sheezy0,0,FFFFFF,Robbie E Responds To Critics After Win Against...,...,0.0,1,0,1,1,0.0,0,0,1,0
1,1.0,1,1.0,I'm the author of novels filled with family dr...,68,0084B4,DavdBurnett,0,C0DEED,ÂÃÃIt felt like they were my friends and I ...,...,0.0,1,0,0,1,0.06066,0,0,1,0
2,0.6625,1,1.0,louis whining and squealing and all,7696,ABB8C2,lwtprettylaugh,1,C0DEED,i absolutely adore when louis starts the songs...,...,0.003003,1,0,1,1,23.111111,0,0,1,0
3,1.0,1,1.0,"Mobile guy. 49ers, Shazam, Google, Kleiner Pe...",202,0084B4,douggarland,0,C0DEED,Hi @JordanSpieth - Looking at the url - do you...,...,0.0,1,0,1,1,0.086733,0,0,1,0
4,1.0,1,1.0,Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...,37318,3B94D9,WilfordGemma,0,0,Watching Neighbours on Sky+ catching up with t...,...,0.0,1,0,0,0,66.639286,0,0,0,1


- Do not consider the tweet creation time (it seemed irrelevant). Drop the `tweet_created` column.
- Do not consider the time of day (hour/minute/second).
- Consider account uptime in days (last judgement date - created date, plus 1 so that the creation day itself counts as day 1).


In [51]:
def calculate_uptime(judgement_time, created_time):
    """Return the inclusive number of days from account creation to judgement.

    Both arguments are whitespace-separated strings of integers whose first
    three fields are month, day and year. Any further fields are parsed as
    integers but not used. The creation day itself counts as day 1, so two
    timestamps on the same date give 1.
    """
    judged_fields = [int(token) for token in judgement_time.split()]
    created_fields = [int(token) for token in created_time.split()]

    judged_on = date(judged_fields[2], judged_fields[0], judged_fields[1])
    created_on = date(created_fields[2], created_fields[0], created_fields[1])

    elapsed = judged_on - created_on
    # +1 makes the span inclusive of the creation day.
    return elapsed.days + 1

In [52]:
# Account age in days, computed pairwise from the judgement and creation stamps.
df1['account_uptime'] = [
    calculate_uptime(judged_at, created_at)
    for judged_at, created_at in zip(df1['_last_judgment_at'], df1['created'])
]

In [53]:
# Sanity check on the derived uptime: the minimum should be 1 because the
# creation day itself is counted.
df1['account_uptime'].describe()

count    20000.000000
mean      1142.731050
std        793.218684
min          1.000000
25%        443.000000
50%       1065.000000
75%       1757.000000
max       3371.000000
Name: account_uptime, dtype: float64

In [54]:
# The raw timestamp columns are no longer needed once `account_uptime` exists.
df1 = df1.drop(columns=['_last_judgment_at', 'created', 'tweet_created'])

In [74]:
# NOTE: the captured output below comes from execution count 74 and includes
# columns that are only created in later cells (`tweets_per_day`,
# `has_description`, ...). A top-to-bottom re-run will show fewer columns here.
df1.describe()

Unnamed: 0,gender:confidence,profile_yn:confidence,fav_number,retweet_count,tweet_count,account_uptime,tweets_per_day,retweets_per_day,has_description,has_coord,has_location,has_timezone,favnum_per_day
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,0.881524,0.993208,4379.82665,0.0795,38981.18,1142.73105,70.472606,0.000183,0.8131,0.00795,0.62665,0.6113,5.99825
std,0.194015,0.047225,12516.24516,2.653042,116972.0,793.218684,215.420807,0.005435,0.389841,0.08881,0.483706,0.487467,20.811268
min,0.0,0.6272,0.0,0.0,1.0,1.0,0.000552,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.677575,1.0,11.0,0.0,2396.0,443.0,3.062609,0.0,1.0,0.0,0.0,0.0,0.016292
50%,1.0,1.0,456.5,0.0,11461.0,1065.0,12.359685,0.0,1.0,0.0,1.0,1.0,0.495961
75%,1.0,1.0,3314.5,0.0,40057.75,1757.0,48.990045,0.0,1.0,0.0,1.0,1.0,3.59195
max,1.0,1.0,341621.0,330.0,2680199.0,3371.0,3106.907514,0.333333,1.0,1.0,1.0,1.0,622.264706


In [56]:
def calculate_activity_per_uptime(activity, uptime):
    """Return `activity` divided by `uptime` (true division)."""
    per_unit_rate = activity / uptime
    return per_unit_rate

In [71]:
# Daily activity rates. The helper is plain division, so it is applied to whole
# columns at once instead of row by row through `DataFrame.apply(axis=1)`.
# `account_uptime` is at least 1 by construction (the creation day counts as
# day 1), so the division never hits zero.
df1['tweets_per_day'] = calculate_activity_per_uptime(df1['tweet_count'], df1['account_uptime'])
df1['retweets_per_day'] = calculate_activity_per_uptime(df1['retweet_count'], df1['account_uptime'])
df1['favnum_per_day'] = calculate_activity_per_uptime(df1['fav_number'], df1['account_uptime'])

In [72]:
# Correlation matrix over the numeric features.
# NOTE: the `has_*` columns are created in a cell that appears further down in
# this notebook; that cell must be run first or this selection raises KeyError.
corr_matrix = df1.loc[:, [
    'gender:confidence',
    'profile_yn:confidence',
    'fav_number',
    'retweet_count',
    'tweet_count',
    'account_uptime',
    'tweets_per_day',
    'retweets_per_day',
    'has_description',
    'has_coord',
    'has_location',
    'has_timezone',
    'favnum_per_day',
]].corr()

In [73]:
# Rank every feature by its correlation with `gender:confidence`, strongest
# positive correlation first.
corr_matrix['gender:confidence'].sort_values(ascending=False)

gender:confidence        1.000000
profile_yn:confidence    0.308720
account_uptime           0.141707
has_description          0.111983
has_location             0.096848
has_timezone             0.072486
has_coord                0.027507
retweet_count           -0.004716
tweet_count             -0.012566
retweets_per_day        -0.015426
fav_number              -0.029313
favnum_per_day          -0.055421
tweets_per_day          -0.067363
Name: gender:confidence, dtype: float64

- For each of `description`, `tweet_coord`, `tweet_location` and `user_timezone`, add a 0/1 flag indicating whether the column has a value

In [65]:
# Presence flags: 1 when the field holds anything other than the empty string
# (the placeholder used for missing values), 0 otherwise. A vectorised
# comparison replaces the per-element lambdas; the result stays an int64 column.
df1['has_description'] = (df1['description'] != '').astype('int64')
df1['has_coord'] = (df1['tweet_coord'] != '').astype('int64')
df1['has_location'] = (df1['tweet_location'] != '').astype('int64')
df1['has_timezone'] = (df1['user_timezone'] != '').astype('int64')

- Consider removing the three columns with no direct insights

In [67]:
# Only the presence flags are kept; the raw location/timezone text is dropped.
df1 = df1.drop(columns=['tweet_coord', 'tweet_location', 'user_timezone'])

In [78]:
# Encode 'yes' as 1 and everything else as 0. The dtype guard makes the cell
# safe to re-run: without it, a second execution would compare the already
# numeric values against 'yes' and silently reset the whole column to 0.
if not pd.api.types.is_numeric_dtype(df1['profile_yn']):
    df1['profile_yn'] = (df1['profile_yn'] == 'yes').astype('int64')

In [80]:
# Class balance of the label, including the 'unknown' placeholder category.
df1['gender'].value_counts()

gender
female     6685
male       6173
brand      5928
unknown    1214
Name: count, dtype: int64

- One-hot encode the `gender` column
- Remove the original `gender` column afterwards

In [82]:
# One-hot encode `gender` with a vectorised equality test per category instead
# of four per-element lambdas. The column order (brand, unknown, male, female)
# and the int64 0/1 encoding are kept.
df1 = df1.assign(**{
    f'gender_{label}': (df1['gender'] == label).astype('int64')
    for label in ('brand', 'unknown', 'male', 'female')
})

In [83]:
# The categorical label is redundant now that the indicator columns exist.
df1 = df1.drop(columns=['gender'])

In [87]:
# Persist the processed frame. No `index` argument is passed, so pandas'
# default applies and the row index is written as the first, unnamed column.
df1.to_csv('./data/twitter_normal_user_processed_data_v1.csv')