In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold

import optuna
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, make_scorer

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE




  from .autonotebook import tqdm as notebook_tqdm


## Load Data

In [2]:
with tqdm() as bar:
    # do not skip any of the rows, but update the progress bar instead
    df = pd.read_csv('data/train.csv', skiprows=lambda x: bar.update(1) and False)

df.head()

26207it [00:00, 80351.44it/s] 


Unnamed: 0,created_at,default_profile,default_profile_image,description,favourites_count,followers_count,friends_count,geo_enabled,id,lang,location,profile_background_image_url,profile_image_url,screen_name,statuses_count,verified,average_tweets_per_day,account_age_days,target
0,2012-01-15 23:40:09,True,False,Cosplayer/Fitness lover. Come to me https://t....,74,7,0,False,465096524,en,unknown,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/9666745212...,reml5477,20,False,0.006,3138,1
1,2016-10-04 00:44:39,False,False,pobody’s nerfect,50443,164,590,True,783105517673648132,cy,she/her,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/1281752126...,kinlibra,6469,False,4.572,1415,0
2,2009-05-23 04:04:13,False,False,gracias por participar 🏅,9394,208,189,False,41970759,es,La diaspora,http://abs.twimg.com/images/themes/theme17/bg.gif,http://pbs.twimg.com/profile_images/1233811596...,_delaualau,30296,False,7.378,4106,0
3,2009-05-17 04:31:31,False,False,Stand Up Comedian/Actor from North Philadelphi...,46,66180,1090,True,40607946,en,"Calabasas, CA",http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/1184851104...,SpankHorton,164957,False,40.116,4112,0
4,2009-02-16 13:11:21,True,False,Assignment Editor at NBC10 and President of Ja...,1223,487,867,True,20983433,en,"Jenkintown, PA",http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/5234863934...,javelinjt,1752,False,0.417,4201,0


## Train/Val/Test Split

The proportion of Train/Val/Test Split is:
<ul>
    <li>
        Train: 70%
    </li>
    <li>
        Val: 20%
    </li>
    <li>
        Test: 10%
    </li>
</ul>

In [3]:
X = df.drop('target', axis = 1) 
y = df['target']

In [4]:
# Initial split: 80% training, 20% validation & test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

# Second split: 10% validation, 10% test
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=1, stratify=y_temp)

## Handling Missing Values and Type Conversion

In [5]:
df.isnull().sum()

created_at                         0
default_profile                    0
default_profile_image              0
description                     5091
favourites_count                   0
followers_count                    0
friends_count                      0
geo_enabled                        0
id                                 0
lang                            5588
location                           2
profile_background_image_url    3235
profile_image_url                  1
screen_name                        0
statuses_count                     0
verified                           0
average_tweets_per_day             0
account_age_days                   0
target                             0
dtype: int64

In [6]:
# === function definitions ===
def handle_missing_values(X): # to handle the missing values
    cols = ['description', 'lang', 'location', 'profile_background_image_url', 'profile_image_url']
    values = ['', 'unknown', 'unknown', 'unknown', X['profile_image_url'].mode()[0]]

    for i in range(len(cols)):
        X[ cols[i] ] = X[ cols[i] ].fillna( values[i] )

def type_conversion(X):
    # to bool
    cols = ['default_profile','default_profile_image', 'geo_enabled', 'verified']
    for col in cols:
        X[col] = X[col].astype(bool)

    # to int
    cols = ['favourites_count','followers_count','friends_count', 'statuses_count', 'account_age_days']
    for col in cols:
        X[col] = X[col].astype(int)

    # to string
    cols = ['description', 'location','profile_background_image_url','profile_image_url','screen_name']
    for col in cols:
        X[col] = X[col].astype(str)

    # to category
    cols = ['lang'] 
    for col in cols:
        X[col] = X[col].astype('category')

    # to float
    cols = ['average_tweets_per_day']
    for col in cols:
        X[col] = X[col].astype(float)

In [7]:
# handle missing values
handle_missing_values(X_train)
handle_missing_values(X_val)

# type conversion
type_conversion(X_train)
type_conversion(X_val)

In [8]:
X_train.isnull().sum()

created_at                      0
default_profile                 0
default_profile_image           0
description                     0
favourites_count                0
followers_count                 0
friends_count                   0
geo_enabled                     0
id                              0
lang                            0
location                        0
profile_background_image_url    0
profile_image_url               0
screen_name                     0
statuses_count                  0
verified                        0
average_tweets_per_day          0
account_age_days                0
dtype: int64

In [9]:
X_val.isnull().sum()

created_at                      0
default_profile                 0
default_profile_image           0
description                     0
favourites_count                0
followers_count                 0
friends_count                   0
geo_enabled                     0
id                              0
lang                            0
location                        0
profile_background_image_url    0
profile_image_url               0
screen_name                     0
statuses_count                  0
verified                        0
average_tweets_per_day          0
account_age_days                0
dtype: int64

## Feature Engineering

In [10]:
# ==== function definitions ====
# log transform skewed data
def normalize(X):
    cols = ['favourites_count', 'followers_count', 'friends_count', 'statuses_count', 'average_tweets_per_day']

    for col in cols:
        X[col] = np.log(X[col] + 1)

# feature cross
def feature_cross(X):
    # favourites_count, followers_count, friends_count, statuses_count, average_tweets_per_day,account_age_days

    X['favourites_per_day'] = X['favourites_count']/X['account_age_days']
    X['followers_per_day'] = X['followers_count']/X['account_age_days']
    X['friends_per_day'] = X['friends_count']/X['account_age_days']

    X['followers_to_friends_ratio'] = X['followers_count']/(X['friends_count']+1)

# bucketize
def bucketize(X):
    # lang
    langs = ['en', 'unknown', 'es', 'ar']
    X['lang_bucket'] = X['lang'].apply(lambda x: x if x in langs else 'others')

    # location
    location_counts = X['location'].value_counts()
    low_val_locations = location_counts[location_counts == 1].index # bucketize when location count == 1
    X['location_bucket'] = X['location'].apply(lambda x: 'others' if x in low_val_locations else x)

# one hot
def one_hot(X):
    X = X.copy()  # avoid modifying original
    
    # one-hot for lang_bucket
    lang_dummies = pd.get_dummies(X['lang_bucket'], prefix='lang')

    
    X['hasNoBackgroundImg'] = (X['profile_background_image_url'] == 'unknown').astype(bool)
    X['isDefaultPfp'] = (X['profile_image_url'] == 'http://abs.twimg.com/sticky/default_profile_images/default_profile_normal.png').astype(bool)
    X['hasBotInScreenName'] = X['screen_name'].str.contains(r'bot', case=False, na=False)

    # location
    X['isLocationUnknown'] = (X['location_bucket'] == 'unknown').astype(bool)
    X['isLocationOthers'] = (X['location_bucket'] == 'others').astype(bool)

    # description
    X['hasHTTP'] = (X['description'].str.contains(r'http', case=False, na=False)).astype(bool)
    # Count number of '#' in each description
    X['num_hashtags'] = X['description'].str.count('#')
    # Count number of 'http' (i.e., links)
    X['num_links'] = X['description'].str.count('http')
    

    
    X = pd.concat([X, lang_dummies], axis=1)
    
    return X

# drop columns that are not needed for predictions
def drop_cols(X):
    cols = ['created_at', 'id', 
            'description', 'lang', 'location', 'profile_background_image_url', 'profile_image_url', 'screen_name', 'lang_bucket', 'location_bucket']
    return X.drop(cols, axis = 1)

In [11]:
# normalize
normalize(X_train)
normalize(X_val)

# feature cross
feature_cross(X_train)
feature_cross(X_val)

# bucketize
bucketize(X_train)
bucketize(X_val)

# one hot
X_train = one_hot(X_train)
X_val = one_hot(X_val)

# drop columns
X_train = drop_cols(X_train)
X_val = drop_cols(X_val)

In [12]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20964 entries, 12800 to 10106
Data columns (total 27 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   default_profile             20964 non-null  bool   
 1   default_profile_image       20964 non-null  bool   
 2   favourites_count            20964 non-null  float64
 3   followers_count             20964 non-null  float64
 4   friends_count               20964 non-null  float64
 5   geo_enabled                 20964 non-null  bool   
 6   statuses_count              20964 non-null  float64
 7   verified                    20964 non-null  bool   
 8   average_tweets_per_day      20964 non-null  float64
 9   account_age_days            20964 non-null  int64  
 10  favourites_per_day          20964 non-null  float64
 11  followers_per_day           20964 non-null  float64
 12  friends_per_day             20964 non-null  float64
 13  followers_to_friends_ratio  2096

## Model Training

### Logistic Regression

In [13]:
# Only show warning
optuna.logging.set_verbosity(optuna.logging.WARNING)

def objective_lr_smote(trial):
    C = trial.suggest_float('C', 1e-4, 10, log=True)
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])

    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=1)),
        ('model', LogisticRegression(C=C, solver=solver, max_iter=5000))
    ])

    score = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='precision').mean()
    return score

study_lr = optuna.create_study(direction='maximize')
study_lr.optimize(objective_lr_smote, n_trials=50)

print("Best hyperparameters:", study_lr.best_params)

Best hyperparameters: {'C': 0.00016502723211896786, 'solver': 'saga'}


In [14]:
lr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=1)),
    ('model', LogisticRegression(
        C=study_lr.best_params['C'],
        solver=study_lr.best_params['solver'],
        max_iter=5000
    ))
])

lr_pipeline.fit(X_train, y_train)

In [15]:
y_pred = lr_pipeline.predict(X_train)

In [16]:
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[11071  2867]
 [ 1510  5516]]
              precision    recall  f1-score   support

           0       0.88      0.79      0.83     13938
           1       0.66      0.79      0.72      7026

    accuracy                           0.79     20964
   macro avg       0.77      0.79      0.78     20964
weighted avg       0.81      0.79      0.80     20964



In [17]:
y_pred = lr_pipeline.predict(X_val)

In [18]:
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

[[1390  353]
 [ 192  686]]
              precision    recall  f1-score   support

           0       0.88      0.80      0.84      1743
           1       0.66      0.78      0.72       878

    accuracy                           0.79      2621
   macro avg       0.77      0.79      0.78      2621
weighted avg       0.81      0.79      0.80      2621



In [19]:
coefficients = lr_pipeline.named_steps['model'].coef_[0]

feature_names = X_train.columns
feature_importance = pd.DataFrame({
    'feature': feature_names,
    'importance': coefficients
}).sort_values(by='importance', ascending=False)

feature_importance

Unnamed: 0,feature,importance
26,lang_unknown,0.128214
0,default_profile,0.126363
19,hasHTTP,0.087085
16,hasBotInScreenName,0.075068
17,isLocationUnknown,0.061174
22,lang_ar,0.060751
21,num_links,0.05073
8,average_tweets_per_day,0.047559
13,followers_to_friends_ratio,0.031418
24,lang_es,0.014231


## XGBoost


In [20]:
# Only show warning
optuna.logging.set_verbosity(optuna.logging.WARNING)

def objective_xgb_smote(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 1.0, log=True),
    }

    model = XGBClassifier(**params, random_state = 1, n_jobs = -1, eval_metric= 'logloss')

    # Full pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=1)),
        ('clf', model)
    ])

    # Cross-validation
    # cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    score = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='precision').mean()
    
    return score

study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb_smote, n_trials=50)

print("Best hyperparameters:", study_xgb.best_params)

Best hyperparameters: {'n_estimators': 429, 'max_depth': 10, 'learning_rate': 0.035165864265865765, 'subsample': 0.8981262168149493, 'colsample_bytree': 0.866118877472432, 'reg_alpha': 0.04163759855920998, 'reg_lambda': 0.001326971181234592}


In [27]:
xgb_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=1)),
    ('model', 
        XGBClassifier(**study_xgb.best_params, random_state = 1, n_jobs = -1, eval_metric= 'logloss')
    )
])

xgb_pipeline.fit(X_train, y_train)
y_pred = xgb_pipeline.predict(X_train)
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[13860    78]
 [   57  6969]]
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     13938
           1       0.99      0.99      0.99      7026

    accuracy                           0.99     20964
   macro avg       0.99      0.99      0.99     20964
weighted avg       0.99      0.99      0.99     20964



In [28]:
y_pred = xgb_pipeline.predict(X_val)
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

[[1594  149]
 [ 149  729]]
              precision    recall  f1-score   support

           0       0.91      0.91      0.91      1743
           1       0.83      0.83      0.83       878

    accuracy                           0.89      2621
   macro avg       0.87      0.87      0.87      2621
weighted avg       0.89      0.89      0.89      2621



## LightGBM

In [21]:
# Only show warning
optuna.logging.set_verbosity(optuna.logging.WARNING)

def objective_lgbm_smote(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 1.0, log=True),
    }

    model = LGBMClassifier(**params, random_state = 1, n_jobs = -1, verbosity=-1)

    # Full pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=1)),
        ('clf', model)
    ])

    # Cross-validation
    # cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    score = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='precision').mean()
    
    return score

study_lgbm = optuna.create_study(direction='maximize')
study_lgbm.optimize(objective_lgbm_smote, n_trials=50)

print("Best hyperparameters:", study_lgbm.best_params)



Best hyperparameters: {'n_estimators': 414, 'learning_rate': 0.06083050171609545, 'num_leaves': 146, 'max_depth': 13, 'subsample': 0.534933839405757, 'colsample_bytree': 0.7273280452465938, 'reg_alpha': 0.0004990732124121463, 'reg_lambda': 0.006471481336376972}




In [25]:
lgbm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=1)),
    ('model', 
         LGBMClassifier(**study_lgbm.best_params, random_state = 1, n_jobs = -1, verbosity=-1)
    )
])

lgbm_pipeline.fit(X_train, y_train)
y_pred = lgbm_pipeline.predict(X_train)
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))



[[13927    11]
 [    3  7023]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13938
           1       1.00      1.00      1.00      7026

    accuracy                           1.00     20964
   macro avg       1.00      1.00      1.00     20964
weighted avg       1.00      1.00      1.00     20964



In [26]:
y_pred = lgbm_pipeline.predict(X_val)
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

[[1587  156]
 [ 157  721]]
              precision    recall  f1-score   support

           0       0.91      0.91      0.91      1743
           1       0.82      0.82      0.82       878

    accuracy                           0.88      2621
   macro avg       0.87      0.87      0.87      2621
weighted avg       0.88      0.88      0.88      2621





## Random Forest

In [22]:
# Only show warning
optuna.logging.set_verbosity(optuna.logging.WARNING)

def objective_rf_smote(trial):

    params = {
        'n_estimators':trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
    }

    model = RandomForestClassifier(**params, random_state = 1, n_jobs = -1)

    # Full pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=1)),
        ('clf', model)
    ])

    # Cross-validation
    # cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    score = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='precision').mean()
    
    return score

study_rf = optuna.create_study(direction='maximize')
study_rf.optimize(objective_rf_smote, n_trials=50)

print("Best hyperparameters:", study_rf.best_params)

Best hyperparameters: {'n_estimators': 335, 'max_depth': 26, 'max_features': 'log2'}


In [23]:
rf_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=1)),
    ('model', 
        RandomForestClassifier(**study_rf.best_params, random_state = 1, n_jobs = -1)
    )
])

rf_pipeline.fit(X_train, y_train)
y_pred = rf_pipeline.predict(X_train)
print(confusion_matrix(y_train, y_pred))
print(classification_report(y_train, y_pred))

[[13938     0]
 [    0  7026]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13938
           1       1.00      1.00      1.00      7026

    accuracy                           1.00     20964
   macro avg       1.00      1.00      1.00     20964
weighted avg       1.00      1.00      1.00     20964



In [24]:
y_pred = rf_pipeline.predict(X_val)
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

[[1588  155]
 [ 161  717]]
              precision    recall  f1-score   support

           0       0.91      0.91      0.91      1743
           1       0.82      0.82      0.82       878

    accuracy                           0.88      2621
   macro avg       0.87      0.86      0.86      2621
weighted avg       0.88      0.88      0.88      2621

