`demographic.csv` is a user-level dataset containing all demographic-related data, Facebook data and several phone-specific data fields (aggregated to
user level).

All variables with the prefix ‘de’ come from the user’s self-reported data, those with the prefix ‘fb’ from the user’s Facebook profile, and those with the prefix ‘ph’ from the user’s phone data.

Variables definition:
- index : unique identifier for each applicant
- flag_bad : binary dependent variable, 1 represents a bad user while 0 represents a good user.
- de_date_joined : timestamp when the user applied
- de_gender : 1 = male; 2 = female
- de_employment_type : 1 = full time; 2 = part-time ; 3 = business owner
- de_education : 1 = elementary school; 2 = senior high school ; 3 = diploma ; 4 = undergraduate ; 5 = postgraduate
- de_marital_status : 1 = single; 2 = married; 3 = divorced; 4 = widow/widower
- fb_last_updated_date : timestamp when user updated his/her FB profile
- The rest of the variables are self-explanatory.

**Task:** Create a predictive model to predict whether the user is good or bad

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Data cleaning

In [None]:
# Load the user-level dataset and preview its first rows.
df= pd.read_csv('demographic.csv')
df.head()

In [None]:
# Summarise column dtypes and non-null counts.
df.info()

In [None]:
# Parse the three date columns into proper datetimes.
fb_dob = pd.to_datetime(df['fb_dob'])
fb_updated = pd.to_datetime(df['fb_last_updated_date'])
joined = pd.to_datetime(df['de_date_joined'], dayfirst=True)

df['fb_dob'] = fb_dob
# Both timestamp columns end up in UTC so they can be subtracted later:
# tz_convert applies to tz-aware values, tz_localize to naive ones.
df['fb_last_updated_date'] = fb_updated.dt.tz_convert('utc')
df['de_date_joined'] = joined.dt.tz_localize('utc')

In [None]:
# Months between the latest Facebook profile update found in the data
# (used as the reference date) and the date the user joined.
# A month is approximated as 30 days.
reference_date = df['fb_last_updated_date'].max()
df['de_months_since_joined'] = (reference_date - df['de_date_joined']).dt.days / 30

In [None]:
# Months (30-day units) since each user's last Facebook profile update,
# measured from the most recent update in the whole dataset.
reference_date = df['fb_last_updated_date'].max()
days_since_update = (reference_date - df['fb_last_updated_date']).dt.days
# Users without an update timestamp get 0, i.e. the same value as the most
# recently updated profile.
df['fb_months_since_update'] = (days_since_update / 30).fillna(0)

In [None]:
# Remove the raw timestamp columns from the frame.
df = df.drop(columns=['fb_last_updated_date','de_date_joined'])

In [None]:
# Check the number of missing values per column
df.isna().sum()

In [None]:
# Extract ph_call_log_stats column
# `ast` is imported here because this cell runs before the later cell that
# imports it; without this the loop below raises NameError on a fresh run.
import ast

# Rows without call-log data are treated as "no calls": every statistic is 0.
mis_values = '{"percent_incoming_nighttime": 0, "percent_outgoing_daytime": 0, "duration_incoming_daytime": 0, "duration_outgoing_daytime": 0, "percent_incoming_daytime": 0, "percent_other_calls": 0, "duration_outgoing_nighttime": 0, "percent_outgoing_nighttime": 0, "total_calls": 0, "duration_incoming_nighttime": 0}'
df['ph_call_log_stats'] = df['ph_call_log_stats'].fillna(mis_values)

# Each entry is a dict literal stored as text; parse every row exactly once.
call_stats = [ast.literal_eval(raw) for raw in df['ph_call_log_stats']]

# Only the percentage fields and the total call count become features;
# the duration fields are not extracted. The order below fixes the order of
# the new columns.
call_stat_keys = (
    'percent_incoming_nighttime',
    'percent_outgoing_nighttime',
    'percent_outgoing_daytime',
    'percent_incoming_daytime',
    'percent_other_calls',
    'total_calls',
)
for key in call_stat_keys:
    df['ph_' + key] = [stats[key] for stats in call_stats]

df = df.drop(columns = ['ph_call_log_stats'])

In [None]:
# Missing country codes default to 'id'; the listed non-'id' codes are
# pooled into a single 'others' level.
pooled_codes = ['us', 'my', 'sg', 'tw', 'jp', 'kh', 'th']
country = df['ph_country_code'].fillna('id')
df['ph_country_code'] = country.replace(pooled_codes, 'others')

In [None]:
# Extract device column
import ast

# ph_other_device_info holds a dict literal as text; only the lower-cased
# 'brand' entry is kept as a feature.
device_infos = (ast.literal_eval(raw) for raw in df['ph_other_device_info'])
df['ph_device_brand'] = [info['brand'].lower() for info in device_infos]

df = df.drop(columns=['ph_other_device_info'])

In [None]:
# Encode Facebook gender with the same codes as de_gender (1 = male, 2 = female).
df['fb_gender'] = df['fb_gender'].replace({'male': 1, 'female': 2})

In [None]:
# calculate fb_age from fb_dob
# Age is taken relative to a fixed reference year (2017), using the birth
# year only.
reference_year = 2017
birth_year = df['fb_dob'].dt.year
df['fb_age'] = reference_year - birth_year
df = df.drop(columns=['fb_dob'])

In [None]:
# Drop the applicant identifier ('index') and de_accomodation_type from the frame.
df = df.drop(columns =['index','de_accomodation_type'])

In [None]:
# Columns treated as categorical (the target flag_bad included).
cat_features = [
    'flag_bad',
    'de_gender',
    'de_employment_type',
    'de_education',
    'de_marital_status',
    'ph_country_code',
    'fb_gender',
    'fb_relation',
    'ph_device_brand',
]
df = df.astype({name: 'category' for name in cat_features})

In [None]:
import seaborn as sns

# Correlation is only defined for numeric columns. df still contains
# categorical columns with string labels at this point, which
# DataFrame.corr() refuses to coerce in pandas >= 2.0, so restrict the frame
# to numeric columns explicitly (older pandas dropped them silently).
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(), cmap="coolwarm")

# Model training

In [None]:
# Separate the target (flag_bad) from the feature matrix.
target_col = 'flag_bad'
y = df[target_col]
X = df.drop(columns=[target_col])

In [None]:
# ph_app_list and fb_relation are not used as model features.
X = X.drop(columns=['ph_app_list', 'fb_relation'])

In [None]:
# Keep the six most frequent device brands; pool every other brand as 'others'.
brand_counts = X['ph_device_brand'].value_counts()
other_devices = brand_counts.index[6:].tolist()
X['ph_device_brand'] = X['ph_device_brand'].replace(other_devices, 'others')

In [None]:
# Fall back to the self-reported age where the Facebook-derived age is missing.
# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: the in-place form acts on a temporary object and does
# not update X under pandas copy-on-write.
X['fb_age'] = X['fb_age'].fillna(X['de_age'])

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Take the numeric columns from X itself (not df) so the selection always
# matches the frame that is actually being scaled.
num_features = X.select_dtypes('number').columns

# Standardise every numeric feature to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all rows before the train/validation
# split, so validation-set statistics leak into the training features;
# consider fitting on the training rows only.
X[num_features] = StandardScaler().fit_transform(X[num_features])

In [None]:
# Apply one-hot encoding
# The encoded indicator columns are appended after the remaining features,
# and the original categorical columns are removed.
encode_cols = ['de_gender', 'de_employment_type', 'ph_country_code', 'ph_device_brand', 'fb_gender']
one_hot_encoded = pd.get_dummies(X[encode_cols])
X = pd.concat([X.drop(columns=encode_cols), one_hot_encoded], axis=1)

In [None]:
# Count the remaining missing values per feature column.
X.isna().sum()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. Stratify on the target so both
# splits keep the same good/bad ratio; with an imbalanced target a plain
# random split can shift the share of bad users between the two sets.
x_train, x_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
x_train.shape

## XG Boost

In [None]:
import xgboost as xgb

# Sweep the XGBoost learning rate (eta) and record the score on the training
# and validation sets for each value.
# NOTE(review): XGBoost documents eta in the range [0, 1]; this sweep goes up
# to ~1.96 - confirm that is intended.
learning_rate_range = np.arange(0.01, 2, 0.05)
train_XG, test_XG = [], []
for lr in learning_rate_range:
    xgb_classifier = xgb.XGBClassifier(eta=lr, enable_categorical=True)
    xgb_classifier.fit(x_train, y_train)
    train_score = xgb_classifier.score(x_train, y_train)
    val_score = xgb_classifier.score(x_val, y_val)
    train_XG.append(train_score)
    test_XG.append(val_score)

In [None]:
# Plot training vs validation score against the swept learning rates.
fig, ax = plt.subplots(figsize=(20, 7))
ax.plot(learning_rate_range, train_XG, c='orange', label='Train')
ax.plot(learning_rate_range, test_XG, c='m', label='Test')
ax.set_xlabel('Learning rate')
ax.set_xticks(learning_rate_range)
ax.set_ylabel('Accuracy score')
ax.set_ylim(0.6, 1)
ax.legend(prop={'size': 12}, loc=3)
ax.set_title('Accuracy score vs. Learning rate of XGBoost', size=14)
plt.show()

In [None]:
# Refit XGBoost with a fixed learning rate and predict labels for the validation set.
# NOTE(review): eta=1.8 is above the [0, 1] range XGBoost documents for eta - confirm this choice.
xgb_classifier = xgb.XGBClassifier(eta =1.8,enable_categorical=True)
xgb_classifier.fit(x_train, y_train)
y_pred_xgb = xgb_classifier.predict(x_val)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, roc_auc_score, roc_curve

# Print the confusion matrix, then the classification report, for the
# XGBoost validation predictions.
for summarise in (confusion_matrix, classification_report):
    print(summarise(y_val, y_pred_xgb))

In [None]:
# Weighted-average F1 over both classes, plus recall with scikit-learn's
# default binary setting (positive label 1, i.e. the bad users).
f1_xgb = f1_score(y_val, y_pred_xgb, average='weighted')
recall_xgb = recall_score(y_val, y_pred_xgb)
f1_xgb, recall_xgb

In [None]:
# ROC/AUC must be computed from a continuous score. Hard 0/1 predictions
# collapse the curve to a single operating point and make the AUC ignore
# how well the model ranks users, so use the predicted probability of the
# positive class (label 1) instead.
y_score_xgb = xgb_classifier.predict_proba(x_val)[:, 1]
auc_xgb = roc_auc_score(y_val, y_score_xgb)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_val, y_score_xgb)

plt.figure(figsize=(8, 8), dpi=100)
plt.axis('scaled')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.title("AUC & ROC Curve")
plt.plot(false_positive_rate, true_positive_rate, 'g')
plt.fill_between(false_positive_rate, true_positive_rate, facecolor='lightgreen', alpha=0.7)
plt.text(0.95, 0.05, 'AUC = %0.4f' % auc_xgb, ha='right', fontsize=12, weight='bold', color='blue')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()

## LightGBM

In [None]:
from lightgbm import LGBMClassifier

# Fit LightGBM with its default settings on the training split and predict
# labels for the validation set.
lgb = LGBMClassifier()
lgb.fit(x_train, y_train)
y_pred_lgb = lgb.predict(x_val)

In [None]:
# features importance
# Bar chart of the fitted LightGBM model's feature importances, largest first.
importance = pd.DataFrame(
    {'Feature': x_train.columns, 'Feature importance': lgb.feature_importances_}
).sort_values(by='Feature importance', ascending=False)

plt.figure(figsize=(7, 4))
plt.title('Features importance', fontsize=14)
ax = sns.barplot(data=importance, x='Feature', y='Feature importance')
# Rotate the feature names so they stay readable.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# Print the confusion matrix, then the classification report, for the
# LightGBM validation predictions.
for summarise in (confusion_matrix, classification_report):
    print(summarise(y_val, y_pred_lgb))

In [None]:
# Weighted-average F1 over both classes, plus recall with scikit-learn's
# default binary setting (positive label 1, i.e. the bad users).
f1_lgb = f1_score(y_val, y_pred_lgb, average='weighted')
recall_lgb = recall_score(y_val, y_pred_lgb)
f1_lgb, recall_lgb

In [None]:
# ROC/AUC must be computed from a continuous score. Hard 0/1 predictions
# collapse the curve to a single operating point and make the AUC ignore
# how well the model ranks users, so use the predicted probability of the
# positive class (label 1) instead.
y_score_lgb = lgb.predict_proba(x_val)[:, 1]
auc_lgb = roc_auc_score(y_val, y_score_lgb)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_val, y_score_lgb)

plt.figure(figsize=(8, 8), dpi=100)
plt.axis('scaled')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.title("AUC & ROC Curve")
plt.plot(false_positive_rate, true_positive_rate, 'g')
plt.fill_between(false_positive_rate, true_positive_rate, facecolor='lightgreen', alpha=0.7)
plt.text(0.95, 0.05, 'AUC = %0.4f' % auc_lgb, ha='right', fontsize=12, weight='bold', color='blue')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()

# Upsampling

In [None]:
# Apply SMOTE
from imblearn.over_sampling import SMOTE, SMOTENC

# Oversample the training split only; the validation split is left untouched.
# NOTE(review): categorical_features=[4, 5] are positional column indices
# into x_train - confirm they still point at the categorical columns after
# the one-hot encoding step.
smoten = SMOTENC(
    categorical_features=[4, 5],
    k_neighbors=2,
    sampling_strategy='auto',
    random_state=42,
)
X_resampled, y_resampled = smoten.fit_resample(x_train, y_train)

# Check the class distribution after applying SMOTE
unique, counts = np.unique(y_resampled, return_counts=True)
class_distribution = dict(zip(unique, counts))
print(class_distribution)

## XG Boost

In [None]:
# Sweep the XGBoost learning rate (eta) on the oversampled training data and
# record the score on that data and on the validation set.
# NOTE(review): XGBoost documents eta in the range [0, 1]; this sweep goes up
# to ~1.96 - confirm that is intended.
learning_rate_range = np.arange(0.01, 2, 0.05)
train_XG, test_XG = [], []
for lr in learning_rate_range:
    xgb_classifier = xgb.XGBClassifier(eta=lr, enable_categorical=True)
    xgb_classifier.fit(X_resampled, y_resampled)
    train_score = xgb_classifier.score(X_resampled, y_resampled)
    val_score = xgb_classifier.score(x_val, y_val)
    train_XG.append(train_score)
    test_XG.append(val_score)

In [None]:
# Plot training vs validation score against the swept learning rates.
fig, ax = plt.subplots(figsize=(20, 7))
ax.plot(learning_rate_range, train_XG, c='orange', label='Train')
ax.plot(learning_rate_range, test_XG, c='m', label='Test')
ax.set_xlabel('Learning rate')
ax.set_xticks(learning_rate_range)
ax.set_ylabel('Accuracy score')
ax.set_ylim(0.6, 1)
ax.legend(prop={'size': 12}, loc=3)
ax.set_title('Accuracy score vs. Learning rate of XGBoost', size=14)
plt.show()

In [None]:
# Refit XGBoost on the oversampled training data with a fixed learning rate
# and predict labels for the validation set.
# NOTE(review): eta=1.8 is above the [0, 1] range XGBoost documents for eta - confirm this choice.
xgb_classifier = xgb.XGBClassifier(eta =1.8,enable_categorical=True)
xgb_classifier.fit(X_resampled, y_resampled)
y_pred_xgb_up = xgb_classifier.predict(x_val)

In [None]:
# Print the confusion matrix, then the classification report, for the
# XGBoost model trained on oversampled data.
for summarise in (confusion_matrix, classification_report):
    print(summarise(y_val, y_pred_xgb_up))

In [None]:
# Weighted-average F1 over both classes, plus recall with scikit-learn's
# default binary setting (positive label 1, i.e. the bad users).
f1_xgb_up = f1_score(y_val, y_pred_xgb_up, average='weighted')
recall_xgb_up = recall_score(y_val, y_pred_xgb_up)
f1_xgb_up, recall_xgb_up

In [None]:
# ROC/AUC must be computed from a continuous score. Hard 0/1 predictions
# collapse the curve to a single operating point and make the AUC ignore
# how well the model ranks users, so use the predicted probability of the
# positive class (label 1) instead.
y_score_xgb_up = xgb_classifier.predict_proba(x_val)[:, 1]
auc_xgb_up = roc_auc_score(y_val, y_score_xgb_up)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_val, y_score_xgb_up)

plt.figure(figsize=(8, 8), dpi=100)
plt.axis('scaled')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.title("AUC & ROC Curve")
plt.plot(false_positive_rate, true_positive_rate, 'g')
plt.fill_between(false_positive_rate, true_positive_rate, facecolor='lightgreen', alpha=0.7)
plt.text(0.95, 0.05, 'AUC = %0.4f' % auc_xgb_up, ha='right', fontsize=12, weight='bold', color='blue')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()

## LightGBM

In [None]:
# Fit LightGBM with its default settings on the oversampled training data
# and predict labels for the validation set.
lgb = LGBMClassifier()
lgb.fit(X_resampled, y_resampled)
y_pred_lgb_up = lgb.predict(x_val)

In [None]:
# Bar chart of the fitted LightGBM model's feature importances, largest first.
# Feature names are read from x_train's columns.
importance = pd.DataFrame(
    {'Feature': x_train.columns, 'Feature importance': lgb.feature_importances_}
).sort_values(by='Feature importance', ascending=False)

plt.figure(figsize=(7, 4))
plt.title('Features importance', fontsize=14)
ax = sns.barplot(data=importance, x='Feature', y='Feature importance')
# Rotate the feature names so they stay readable.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# Print the confusion matrix, then the classification report, for the
# LightGBM model trained on oversampled data.
for summarise in (confusion_matrix, classification_report):
    print(summarise(y_val, y_pred_lgb_up))

In [None]:
# Weighted-average F1 over both classes, plus recall with scikit-learn's
# default binary setting (positive label 1, i.e. the bad users).
f1_lgb_up = f1_score(y_val, y_pred_lgb_up, average='weighted')
recall_lgb_up = recall_score(y_val, y_pred_lgb_up)
f1_lgb_up, recall_lgb_up

In [None]:
# ROC/AUC must be computed from a continuous score. Hard 0/1 predictions
# collapse the curve to a single operating point and make the AUC ignore
# how well the model ranks users, so use the predicted probability of the
# positive class (label 1) instead.
y_score_lgb_up = lgb.predict_proba(x_val)[:, 1]
auc_lgb_up = roc_auc_score(y_val, y_score_lgb_up)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_val, y_score_lgb_up)

plt.figure(figsize=(8, 8), dpi=100)
plt.axis('scaled')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.title("AUC & ROC Curve")
plt.plot(false_positive_rate, true_positive_rate, 'g')
plt.fill_between(false_positive_rate, true_positive_rate, facecolor='lightgreen', alpha=0.7)
plt.text(0.95, 0.05, 'AUC = %0.4f' % auc_lgb_up, ha='right', fontsize=12, weight='bold', color='blue')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()

# Down sampling

In [None]:
# Recombine the training features and target into one frame so rows can be
# sampled by label.
df_down = pd.concat([x_train, y_train], axis=1)
df_down.info()

In [None]:
# Split the training rows by label, then draw as many good users
# (flag_bad == 0) as there are bad users (flag_bad == 1).
is_good = df_down['flag_bad'] == 0
is_bad = df_down['flag_bad'] == 1
good = df_down[is_good]
bad = df_down[is_bad]
n_sample = len(bad)
sample = good.sample(n=n_sample, random_state=42)

In [None]:
# Good training users that were NOT drawn into the balanced sample.
# Remove the sampled rows by index label. The previous element-wise
# isin()/dropna() mask also discarded any unsampled row that legitimately
# contained a NaN and could upcast column dtypes along the way.
# Assumes row index labels are unique (they originate from df's index) -
# TODO confirm.
not_in_sample = good.drop(index=sample.index)
not_in_sample.shape

In [None]:
# Balanced training frame: every bad user plus the equally sized sample of good users.
new_data = pd.concat([bad, sample], axis=0)

In [None]:
# Features and target of the undersampled (balanced) training set.
target_col = 'flag_bad'
y_und = new_data[target_col]
x_und = new_data.drop(columns=[target_col])

In [None]:
# Move the good users left out of the balanced sample into the validation set.
# NOTE: this overwrites x_val / y_val, so every evaluation after this cell
# uses a different validation set from the one used before it.
val = pd.concat([x_val, y_val], axis=1)
# Append only rows that are not already present, so re-running this cell does
# not add the same users to the validation set a second time.
extra_rows = not_in_sample[~not_in_sample.index.isin(val.index)]
val = pd.concat([val, extra_rows], axis=0)
x_val = val.drop(columns=['flag_bad'])
y_val = val['flag_bad']

In [None]:
# Shapes of the original training split (the undersampled set is x_und / y_und).
x_train.shape, y_train.shape

In [None]:
# Shapes of the current validation features and target.
x_val.shape, y_val.shape

## XG Boost

In [None]:
# Sweep the XGBoost learning rate (eta) on the undersampled training data and
# record the score on that data and on the validation set.
# NOTE(review): XGBoost documents eta in the range [0, 1]; this sweep goes up
# to ~1.96 - confirm that is intended.
learning_rate_range = np.arange(0.01, 2, 0.05)
train_XG, test_XG = [], []
for lr in learning_rate_range:
    xgb_classifier = xgb.XGBClassifier(eta=lr, enable_categorical=True)
    xgb_classifier.fit(x_und, y_und)
    train_score = xgb_classifier.score(x_und, y_und)
    val_score = xgb_classifier.score(x_val, y_val)
    train_XG.append(train_score)
    test_XG.append(val_score)

In [None]:
# Plot training vs validation score against the swept learning rates.
fig, ax = plt.subplots(figsize=(20, 7))
ax.plot(learning_rate_range, train_XG, c='orange', label='Train')
ax.plot(learning_rate_range, test_XG, c='m', label='Test')
ax.set_xlabel('Learning rate')
ax.set_xticks(learning_rate_range)
ax.set_ylabel('Accuracy score')
ax.set_ylim(0.6, 1)
ax.legend(prop={'size': 12}, loc=3)
ax.set_title('Accuracy score vs. Learning rate of XGBoost', size=14)
plt.show()

In [None]:
# Refit XGBoost on the undersampled training data with a fixed learning rate
# and predict labels for the validation set.
xgb_classifier = xgb.XGBClassifier(eta =0.05,enable_categorical=True)
xgb_classifier.fit(x_und, y_und)
y_pred_xgb_down = xgb_classifier.predict(x_val)

In [None]:
# Print the confusion matrix, then the classification report, for the
# XGBoost model trained on undersampled data.
for summarise in (confusion_matrix, classification_report):
    print(summarise(y_val, y_pred_xgb_down))

In [None]:
# Weighted-average F1 over both classes, plus recall with scikit-learn's
# default binary setting (positive label 1, i.e. the bad users).
f1_xgb_down = f1_score(y_val, y_pred_xgb_down, average='weighted')
recall_xgb_down = recall_score(y_val, y_pred_xgb_down)
f1_xgb_down, recall_xgb_down

In [None]:
# ROC/AUC must be computed from a continuous score. Hard 0/1 predictions
# collapse the curve to a single operating point and make the AUC ignore
# how well the model ranks users, so use the predicted probability of the
# positive class (label 1) instead.
y_score_xgb_down = xgb_classifier.predict_proba(x_val)[:, 1]
auc_xgb_down = roc_auc_score(y_val, y_score_xgb_down)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_val, y_score_xgb_down)

plt.figure(figsize=(8, 8), dpi=100)
plt.axis('scaled')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.title("AUC & ROC Curve")
plt.plot(false_positive_rate, true_positive_rate, 'g')
plt.fill_between(false_positive_rate, true_positive_rate, facecolor='lightgreen', alpha=0.7)
plt.text(0.95, 0.05, 'AUC = %0.4f' % auc_xgb_down, ha='right', fontsize=12, weight='bold', color='blue')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()

## LightGBM

In [None]:
# Fit LightGBM with its default settings on the undersampled training data
# and predict labels for the validation set.
lgb = LGBMClassifier()
lgb.fit(x_und, y_und)
y_pred_lgb_down = lgb.predict(x_val)

In [None]:
# Bar chart of the fitted LightGBM model's feature importances, largest first.
importance = pd.DataFrame(
    {'Feature': x_und.columns, 'Feature importance': lgb.feature_importances_}
).sort_values(by='Feature importance', ascending=False)

plt.figure(figsize=(7, 4))
plt.title('Features importance', fontsize=14)
ax = sns.barplot(data=importance, x='Feature', y='Feature importance')
# Rotate the feature names so they stay readable.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# Print the confusion matrix, then the classification report, for the
# LightGBM model trained on undersampled data.
for summarise in (confusion_matrix, classification_report):
    print(summarise(y_val, y_pred_lgb_down))

In [None]:
# Weighted-average F1 over both classes, plus recall with scikit-learn's
# default binary setting (positive label 1, i.e. the bad users).
f1_lgb_down = f1_score(y_val, y_pred_lgb_down, average='weighted')
recall_lgb_down = recall_score(y_val, y_pred_lgb_down)
f1_lgb_down, recall_lgb_down

In [None]:
# ROC/AUC must be computed from a continuous score. Hard 0/1 predictions
# collapse the curve to a single operating point and make the AUC ignore
# how well the model ranks users, so use the predicted probability of the
# positive class (label 1) instead.
y_score_lgb_down = lgb.predict_proba(x_val)[:, 1]
auc_lgb_down = roc_auc_score(y_val, y_score_lgb_down)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_val, y_score_lgb_down)

plt.figure(figsize=(8, 8), dpi=100)
plt.axis('scaled')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.title("AUC & ROC Curve")
plt.plot(false_positive_rate, true_positive_rate, 'g')
plt.fill_between(false_positive_rate, true_positive_rate, facecolor='lightgreen', alpha=0.7)
plt.text(0.95, 0.05, 'AUC = %0.4f' % auc_lgb_down, ha='right', fontsize=12, weight='bold', color='blue')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()

# Conclusions