# Machine Learning (Decision Tree and Gradient Boosting with CatBoost and LightGBM)

Create the machine learning objects, train the models, and predict the output.

In [1]:
import numpy as np
import pandas as pd

# Load the cleaned datasets.
train = pd.read_csv("train4.csv", low_memory=False)
pred = pd.read_csv("test4.csv", low_memory=False)

# Make sure that all variables have float values, then fill missing values
# with the sentinel -999.0.
# DataFrame.astype has no `inplace` parameter (passing one raises TypeError on
# current pandas); it returns a new frame, so the result is chained and rebound.
train = train.astype(float).fillna(-999.0)
pred = pred.astype(float).fillna(-999.0)

# Generate the X, y, and X_pred arrays.
# The last column of `train` is taken as the response y; every column before
# it is taken as a feature.
train_array = train.values
X = train_array[:, :-1]
y = train_array[:, -1]

# Rows to score with the fitted models.
# NOTE(review): assumes test4.csv carries the same feature columns as X and no
# response column -- confirm against the data preparation step.
X_pred = pred.values

In [2]:
from sklearn.model_selection import train_test_split, cross_val_score

# Hold out 33% of the rows for validation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=88,
)

In [9]:
from sklearn.tree import DecisionTreeClassifier

# Build a decision tree limited to depth 5 (fixed seed) and fit it on the
# training split.
tree = DecisionTreeClassifier(random_state=88, max_depth=5)
tree.fit(X_train, y_train)

# Accuracy on the split the tree was fitted on versus the held-out split.
train_accuracy = tree.score(X_train, y_train)
test_accuracy = tree.score(X_test, y_test)
print("Accuracy score (training data): {0:.3f}".format(train_accuracy))
print("Accuracy score (test data): {0:.3f}".format(test_accuracy))
print()

# Mean accuracy over 10 cross-validation folds.
# NOTE(review): the folds are drawn from the held-out split (X_test, y_test),
# not from the training split -- confirm this is intended.
cv_accuracy = cross_val_score(tree, X_test, y_test, scoring="accuracy", cv=10)
print("Cross validation - Accuracy score: {0:.3f}".format(np.mean(cv_accuracy)))


Accuracy score (training data): 0.985
Accuracy score (test data): 0.984

Cross validation - Accuracy score: 0.984


In [10]:
# Score the prediction set with the fitted decision tree.
y_predicted = tree.predict(X_pred)

# Tally the predicted classes: label 0.0 is reported as N, every other label
# is reported as Y.
count_0 = np.count_nonzero(y_predicted == 0.0)
count_1 = len(y_predicted) - count_0
print("Prediction result")
print("Counts of N:", count_0)
print("Counts of Y:", count_1)

Prediction result
Counts of N: 199989
Counts of Y: 11


In [4]:
# Write the predictions to a CSV file for submission.
# Submission should contain only two columns: CUSTOMER_ID and RESPONDERS.
# `y_predicted` is whatever the most recently executed prediction cell produced.

# The id file is read without a header row; its first column supplies CUSTOMER_ID.
customer_id_df = pd.read_csv("pred_customer_id.csv", low_memory=False, header=None)
customer_id = customer_id_df.values

# np.ravel flattens the predictions so a column-shaped (n, 1) array is accepted
# as a single DataFrame column too.
submission = pd.DataFrame(
    {"CUSTOMER_ID": customer_id[:, 0], "RESPONDERS": np.ravel(y_predicted)},
    columns=["CUSTOMER_ID", "RESPONDERS"],
)

# Map the numeric labels 0.0 / 1.0 to "N" / "Y". The result is assigned back to
# the column: calling replace(..., inplace=True) on a `.loc` selection acts on a
# temporary object and does not reliably update `submission` itself.
submission["RESPONDERS"] = submission["RESPONDERS"].replace(
    to_replace=[0.0, 1.0], value=["N", "Y"]
)

# NOTE(review): the output name is spelled "submisson.csv"; it is kept as-is
# because the zip cell that follows reads exactly this name.
submission.to_csv("submisson.csv", index=False)

In [5]:
# Package the CSV written by the submission cell into submission.zip.
# The archived file is named "submisson.csv" (sic), matching the name that the
# submission cell writes.
from zipfile import ZipFile

# The with-statement closes the archive on exit, so no explicit close() call
# is needed afterwards.
with ZipFile("submission.zip", "w") as myzip:
    myzip.write("submisson.csv")

In [5]:
from catboost import CatBoostClassifier

# Create the CatBoost classification object.
parameters = {
    "iterations": 200,
    "learning_rate": 0.02,
    "depth": 6,
    "l2_leaf_reg": 3,
}
cb = CatBoostClassifier(**parameters)

# Train the model on the training split.
cb.fit(X_train, y_train)

# Compare the accuracy scores of the training and held-out splits.
print("Accuracy score (training data): {0:.3f}".format(cb.score(X_train, y_train)))
print("Accuracy score (testing data): {0:.3f}".format(cb.score(X_test, y_test)))
print()

# Mean accuracy over 10 cross-validation folds.
# The mean is formatted straight to three decimals; rounding it to four
# decimals first (double rounding) could shift the printed third decimal.
# NOTE(review): the folds are drawn from the held-out split (X_test, y_test),
# not from the training split -- confirm this is intended.
cv_scores = cross_val_score(cb, X_test, y_test, scoring="accuracy", cv=10)
print("Cross validation score: {0:.3f}".format(np.mean(cv_scores)))

Accuracy score (training data): 0.985
Accuracy score (testing data): 0.984

Cross validation score: 0.985


CatBoostClassifier is the slowest of the three classifiers used here on large datasets.

In [6]:
# Score the prediction set with the fitted CatBoost model.
y_predicted = cb.predict(X_pred)

# Tally the predicted classes: label 0.0 is reported as N, every other label
# is reported as Y.
is_zero = np.asarray(y_predicted) == 0.0
count_0 = np.count_nonzero(is_zero)
count_1 = len(y_predicted) - count_0
print("Prediction result")
print("Counts of N:", count_0)
print("Counts of Y:", count_1)

Prediction result
Counts of N: 200000
Counts of Y: 0


In [3]:
from lightgbm import LGBMClassifier

# Create the LightGBM gradient boosting classifier.
param = {
    "num_leaves": 255,
    "max_depth": 8,
    "learning_rate": 0.05,
    "max_bin": 255,
}
lgbm = LGBMClassifier(**param)

# Train the model on the training split.
lgbm.fit(X_train, y_train)

# Compare the accuracy scores of the training and held-out splits.
print("Accuracy score (training data): {0:.3f}".format(lgbm.score(X_train, y_train)))
print("Accuracy score (testing data): {0:.3f}".format(lgbm.score(X_test, y_test)))
print()

# Mean accuracy over 10 cross-validation folds.
# The mean is formatted straight to three decimals; rounding it to four
# decimals first (double rounding) could shift the printed third decimal.
# NOTE(review): the folds are drawn from the held-out split (X_test, y_test),
# not from the training split -- confirm this is intended.
cv_scores = cross_val_score(lgbm, X_test, y_test, scoring="accuracy", cv=10)
print("Cross validation score: {0:.3f}".format(np.mean(cv_scores)))

Accuracy score (training data): 0.985
Accuracy score (testing data): 0.984

Cross validation score: 0.985


LightGBM is the fastest of the three on large datasets.

In [4]:
# Score the prediction set with the fitted LightGBM model.
# The classifier is bound to the name `lgbm` in the training cell; the former
# reference to `lgb` raised NameError on a fresh kernel.
y_predicted = lgbm.predict(X_pred)

# Tally the predicted classes: label 0.0 is reported as N, every other label
# is reported as Y. A vectorised count replaces the per-element Python loop.
count_0 = np.count_nonzero(y_predicted == 0.0)
count_1 = len(y_predicted) - count_0
print("Prediction result")
print("Counts of N:", count_0)
print("Counts of Y:", count_1)

Prediction result
Counts of N: 199993
Counts of Y: 7
