In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import train_test_split

## Load Data

In [2]:
with tqdm() as progress:
    def _tick_and_keep(_row_index):
        """Advance the progress bar by one row and tell pandas to keep that row."""
        progress.update(1)
        return False  # falsy result => the row is NOT skipped

    # skiprows is used purely as a per-row hook: pandas evaluates the callable
    # against each row index, which gives a running row counter while the file
    # is read. No row is ever skipped.
    df = pd.read_csv('data/train.csv', skiprows=_tick_and_keep)

df.head()

26207it [00:00, 88959.42it/s] 


Unnamed: 0,created_at,default_profile,default_profile_image,description,favourites_count,followers_count,friends_count,geo_enabled,id,lang,location,profile_background_image_url,profile_image_url,screen_name,statuses_count,verified,average_tweets_per_day,account_age_days,target
0,2012-01-15 23:40:09,True,False,Cosplayer/Fitness lover. Come to me https://t....,74,7,0,False,465096524,en,unknown,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/9666745212...,reml5477,20,False,0.006,3138,1
1,2016-10-04 00:44:39,False,False,pobody’s nerfect,50443,164,590,True,783105517673648132,cy,she/her,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/1281752126...,kinlibra,6469,False,4.572,1415,0
2,2009-05-23 04:04:13,False,False,gracias por participar 🏅,9394,208,189,False,41970759,es,La diaspora,http://abs.twimg.com/images/themes/theme17/bg.gif,http://pbs.twimg.com/profile_images/1233811596...,_delaualau,30296,False,7.378,4106,0
3,2009-05-17 04:31:31,False,False,Stand Up Comedian/Actor from North Philadelphi...,46,66180,1090,True,40607946,en,"Calabasas, CA",http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/1184851104...,SpankHorton,164957,False,40.116,4112,0
4,2009-02-16 13:11:21,True,False,Assignment Editor at NBC10 and President of Ja...,1223,487,867,True,20983433,en,"Jenkintown, PA",http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/5234863934...,javelinjt,1752,False,0.417,4201,0


## Train/Val/Test Split

The data is split into Train/Val/Test sets in the following proportions:
<ul>
    <li>
        Train: 80%
    </li>
    <li>
        Val: 10%
    </li>
    <li>
        Test: 10%
    </li>
</ul>

In [3]:
# Separate the label column from the feature columns.
y = df['target']
X = df.drop(columns='target')

In [4]:
# Stage 1: stratified 80/20 split -> 80% training, 20% held out for validation + test
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

# Stage 2: halve the held-out part -> 10% validation, 10% test (of the full data)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=1,
)

## Handling Missing Values and Type Conversion

In [5]:
# Number of missing values per column in the full (unsplit) frame
df.isna().sum()

created_at                         0
default_profile                    0
default_profile_image              0
description                     5091
favourites_count                   0
followers_count                    0
friends_count                      0
geo_enabled                        0
id                                 0
lang                            5588
location                           2
profile_background_image_url    3235
profile_image_url                  1
screen_name                        0
statuses_count                     0
verified                           0
average_tweets_per_day             0
account_age_days                   0
target                             0
dtype: int64

In [6]:
# === function definitions ===
def handle_missing_values(X, image_url_fill=None): # to handle the missing values
    """Fill missing values in the sparse text columns of ``X`` in place.

    Fill values per column:
      * ``description``                  -> ``''``
      * ``lang``                         -> ``'unknown'``
      * ``location``                     -> ``'unknown'``
      * ``profile_background_image_url`` -> ``'unknown'``
      * ``profile_image_url``            -> ``image_url_fill``

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing the five columns above. It is modified in place.
    image_url_fill : str, optional
        Value used for missing ``profile_image_url`` entries. When omitted,
        the most frequent value of that column in ``X`` is used. Pass the
        value derived from the training split when filling validation/test
        data so that every split is imputed with the same value.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, to allow chaining.

    Raises
    ------
    KeyError
        If one of the expected columns is absent from ``X``.
    """
    if image_url_fill is None:
        url_modes = X['profile_image_url'].mode()
        # mode() is empty when every value is missing; indexing it would raise,
        # so fall back to the same placeholder the other URL column uses.
        image_url_fill = url_modes.iloc[0] if not url_modes.empty else 'unknown'

    fill_values = {
        'description': '',
        'lang': 'unknown',
        'location': 'unknown',
        'profile_background_image_url': 'unknown',
        'profile_image_url': image_url_fill,
    }

    for col, value in fill_values.items():
        X[col] = X[col].fillna(value)

    return X

def drop_cols(X):
    """Return a new frame without the columns that are not needed for predictions.

    Removes ``created_at`` and ``id``. ``X`` itself is left unmodified; the
    caller must use the returned frame.
    """
    unused = ['created_at', 'id']
    return X.drop(columns=unused)

def type_conversion(X, lang_categories=None):
    """Cast the columns of ``X`` to their intended dtypes, in place.

    Target dtypes:
      * bool     : default_profile, default_profile_image, geo_enabled, verified
      * int      : favourites_count, followers_count, friends_count,
                   statuses_count, account_age_days
      * str      : description, location, profile_background_image_url,
                   profile_image_url, screen_name
      * category : lang
      * float    : average_tweets_per_day

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing all of the columns above. It is modified in place.
        Missing values should be filled beforehand: ``astype(str)`` turns a
        missing value into a literal string and ``astype(int)`` raises on NaN.
    lang_categories : iterable of str, optional
        Fixed category set for ``lang``. When omitted, the categories are
        inferred from the values present in ``X``, so two frames converted
        separately can end up with different category sets. Pass the
        categories of the training split to keep the splits aligned; values
        outside the given set become missing.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, to allow chaining.

    Raises
    ------
    KeyError
        If one of the expected columns is absent from ``X``.
    """
    if lang_categories is None:
        lang_dtype = 'category'
    else:
        lang_dtype = pd.CategoricalDtype(categories=list(lang_categories))

    # (target dtype, columns) pairs; one table instead of five copy-pasted loops
    conversions = [
        (bool, ['default_profile', 'default_profile_image', 'geo_enabled', 'verified']),
        (int, ['favourites_count', 'followers_count', 'friends_count',
               'statuses_count', 'account_age_days']),
        (str, ['description', 'location', 'profile_background_image_url',
               'profile_image_url', 'screen_name']),
        (lang_dtype, ['lang']),
        (float, ['average_tweets_per_day']),
    ]

    for dtype, cols in conversions:
        for col in cols:
            X[col] = X[col].astype(dtype)

    return X

In [7]:
# 1) fill missing values (mutates each frame in place)
for split in (X_train, X_val):
    handle_missing_values(split)

# 2) drop the unused columns (returns new frames, so rebind the names)
X_train, X_val = drop_cols(X_train), drop_cols(X_val)

# 3) cast the columns to their target dtypes (mutates each frame in place)
for split in (X_train, X_val):
    type_conversion(split)

In [8]:
# Sanity check: no missing values should remain in the training features
X_train.isna().sum()

default_profile                 0
default_profile_image           0
description                     0
favourites_count                0
followers_count                 0
friends_count                   0
geo_enabled                     0
lang                            0
location                        0
profile_background_image_url    0
profile_image_url               0
screen_name                     0
statuses_count                  0
verified                        0
average_tweets_per_day          0
account_age_days                0
dtype: int64

In [9]:
# Sanity check: no missing values should remain in the validation features
X_val.isna().sum()

default_profile                 0
default_profile_image           0
description                     0
favourites_count                0
followers_count                 0
friends_count                   0
geo_enabled                     0
lang                            0
location                        0
profile_background_image_url    0
profile_image_url               0
screen_name                     0
statuses_count                  0
verified                        0
average_tweets_per_day          0
account_age_days                0
dtype: int64

## Feature Engineering

In [10]:
# ==== function definitions ====
# log transform skewed data
def normalize(X):
    """Log-transform the skewed numeric columns of ``X`` in place.

    Each column listed in ``cols`` is overwritten with ``log(value + 1)``.
    Despite the name, the code shown here applies only this log transform;
    it does not rescale values to a fixed range.
    """
    cols = ['favourites_count', 'followers_count', 'friends_count', 'statuses_count', 'average_tweets_per_day']

    for col in cols:
        # the +1 keeps zero values finite: log(0 + 1) == 0
        X[col] = np.log(X[col] + 1)