In [1]:
#importing libraries
import numpy as np
import pandas as pd
pd.options.display.max_columns = 999

from math import sqrt
import lightgbm as lgb
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
#from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_squared_error,roc_auc_score,precision_score

In [2]:
DATASET_LOCAL_PATH = "C:/Users/affiqazrin/Desktop/mmlspark/Data_FinalProject_READY4.csv"
df = pd.read_csv(DATASET_LOCAL_PATH)
    
ALL_COLS = ["age", #numerical
            "job", #categorical
            "marital", #categorical
            "education", #categorical
            "default", #categorical
            "housing", #categorical, binary
            "loan", #categorical, binary
            "contact", #categorical
            "day", #categorical
            "month", #categorical
            "duration", #numerical
            "campaign", #categorical
            "pdays", #numerical
            "previous", #numerical
            "poutcome", #categorical
            "deposit", #categorical, binary
           ]
    
NUMERICAL_COLS = ["age", #numerical
                  "duration", #numerical
                  "pdays", #numerical
                  "previous", #numerical
                 ]
    
CATEGORICAL_COLS = ["job", #categorical
                    "marital", #categorical
                    "education", #categorical
                    "default", #categorical
                    "housing", #categorical, binary
                    "loan", #categorical, binary
                    "contact", #categorical
                    "day", #categorical
                    "month", #categorical
                    "campaign", #categorical
                    "poutcome" #categorical
                   ]

TARGET_COL = ["deposit" #categorical, binary
             ]
    
le = LabelEncoder()
#TARGET_COL2 = le.fit_transform(df[TARGET_COL])
TARGET_COL2 = df[TARGET_COL].apply(LabelEncoder().fit_transform)
    
ohe = OneHotEncoder(handle_unknown='ignore')
CATEGORICAL_COLS2 = pd.DataFrame(ohe.fit_transform(df[CATEGORICAL_COLS]).toarray())
    
DATA_PROCESSED = pd.concat([df[NUMERICAL_COLS], CATEGORICAL_COLS2], axis=1)
mask = DATA_PROCESSED + TARGET_COL2
    
# Correlation_plot(model[mask], response_column)    
X = DATA_PROCESSED.values
y = TARGET_COL2.values.ravel()
    
sm = SMOTE(random_state=12)
X_resampled, y_resampled = sm.fit_sample(X, y)
    
X_train, X_test, y_train, y_test = train_test_split(X_resampled,
                                                        y_resampled,
                                                        test_size = 0.3,
                                                        random_state = 0)
    
print('Size of resampled data:')
print(' train shape... ', X_train.shape, y_train.shape)
print(' test shape.... ', X_test.shape, y_test.shape)
    
DATA_PROCESSED_HEADER=list(DATA_PROCESSED.columns.values)
n_features = len(DATA_PROCESSED_HEADER)
n_ALL_COLS = len(ALL_COLS)
n_NUMERICAL_COLS = len(NUMERICAL_COLS)
n_CATEGORICAL_COLS = len(CATEGORICAL_COLS)

Size of resampled data:
 train shape...  (15219, 94) (15219,)
 test shape....  (6523, 94) (6523,)


In [3]:
#converting the dataset into proper LGB format 
d_train=lgb.Dataset(X_train, label=y_train)

In [4]:
#Specifying the parameters
params={}
params['learning_rate']=0.03
params['boosting_type']='gbdt' #GradientBoostingDecisionTree
params['objective']='binary' #Binary target feature
params['metric']='binary_logloss' #metric for binary classification
params['max_depth']=10

In [5]:
#train the model 
clf=lgb.train(params,d_train,100) #train the model on 100 epocs

[LightGBM] [Info] Number of positive: 7567, number of negative: 7652
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14317
[LightGBM] [Info] Number of data points in the train set: 15219, number of used features: 70
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.497207 -> initscore=-0.011170
[LightGBM] [Info] Start training from score -0.011170


In [6]:
#prediction on the test set
y_pred=clf.predict(X_test)

In [7]:
#rounding the values
y_pred=y_pred.round(0)

#converting from float to integer
y_pred=y_pred.astype(int)

In [8]:
print("ROC-AUC: ", roc_auc_score(y_pred,y_test))
print("RMSE: ", sqrt(mean_squared_error(y_pred,y_test)))
print("Precision Score: ", precision_score(y_pred,y_test))

ROC-AUC:  0.9478667588295318
RMSE:  0.22897568333714344
Precision Score:  0.9322033898305084


https://medium.com/@nitin9809/lightgbm-binary-classification-multi-class-classification-regression-using-python-4f22032b36a2