To make sure the code can be executed, please run it on Google Colab.



# Step 0. Set up Colab environment

In [0]:
# Install (or upgrade) the Kaggle command-line client quietly, then create the
# ~/.kaggle directory that a later cell copies kaggle.json into.
!pip install -U -q kaggle
!mkdir -p ~/.kaggle

In [0]:
from google.colab import files
# Upload a file from the local machine into the Colab working directory.
# A later cell copies kaggle.json from there, so that is presumably the file to upload here.
files.upload()

In [0]:
!cp kaggle.json ~/.kaggle/ 
!kaggle competitions download -c ntu-qbs-assignment-2

In [0]:
# Extract every downloaded archive into /content/data (one CSV per archive).
!unzip /content/test_numeric.csv.zip -d /content/data
!unzip /content/test_date.csv.zip -d /content/data
!unzip /content/test_categorical.csv.zip -d /content/data
!unzip /content/train_numeric.csv.zip -d /content/data
!unzip /content/train_date.csv.zip -d /content/data
!unzip /content/train_categorical.csv.zip -d /content/data
!unzip /content/sample.csv.zip -d /content/data

# Delete the archives once extracted to free disk space.
# NOTE(review): each rm runs even if the matching unzip failed.
!rm /content/test_numeric.csv.zip
!rm /content/test_date.csv.zip
!rm /content/test_categorical.csv.zip
!rm /content/train_numeric.csv.zip
!rm /content/train_date.csv.zip
!rm /content/train_categorical.csv.zip
!rm /content/sample.csv.zip

In [0]:
!mkdir output

In [None]:
# Step 1. Set up Python environment

In [0]:
from zipfile import ZipFile
import pandas as pd
import numpy as np # linear algebra
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns

# Step 2. Read data

***Date data***

In [0]:
# Load the date features of both splits, then store them as float32.
train_date = pd.read_csv('/content/data/train_date.csv')
train_date = train_date.astype('float32')

test_date = pd.read_csv('/content/data/test_date.csv')
test_date = test_date.astype('float32')

In [0]:
# The row identifier is not a feature. errors="ignore" makes the cell idempotent:
# re-running it after "Id" is already gone no longer raises KeyError.
train_date = train_date.drop(columns="Id", errors="ignore")
test_date = test_date.drop(columns="Id", errors="ignore")

***Numerical data***

In [0]:
# Load the numeric features of both splits, then store them as float32.
test_numeric = pd.read_csv('/content/data/test_numeric.csv')
test_numeric = test_numeric.astype('float32')

train_numeric = pd.read_csv('/content/data/train_numeric.csv')
train_numeric = train_numeric.astype('float32')

In [0]:
# The row identifier is not a feature. errors="ignore" makes the cell idempotent:
# re-running it after "Id" is already gone no longer raises KeyError.
train_numeric = train_numeric.drop(columns="Id", errors="ignore")
test_numeric = test_numeric.drop(columns="Id", errors="ignore")

***Reduce memory***

A function that reduces RAM usage by converting each column to the smallest data type that can hold its values.

In [0]:
def _downcast_numeric(values):
    """Return the numeric Series ``values`` cast to a smaller dtype where possible.

    Missing values are replaced by ``min - 1`` (a sentinel below every real
    value) because NumPy integer dtypes cannot hold NaN. Columns that are empty
    or entirely missing are returned unchanged.
    """
    low, high = values.min(), values.max()

    # Empty / all-missing column: there is no value range to size a dtype on.
    if pd.isna(low) or pd.isna(high):
        return values

    # +/-inf cannot be stored as an integer; keep the column floating point.
    if not (np.isfinite(low) and np.isfinite(high)):
        return values.astype(np.float32)

    if values.isna().any():
        # The sentinel becomes the new minimum, so the range must be updated
        # before choosing between signed and unsigned types.
        low = low - 1
        values = values.fillna(low)

    if pd.api.types.is_integer_dtype(values.dtype):
        is_whole = True
    else:
        # Element-wise test: a sum of differences would let +0.5 and -0.5 cancel.
        is_whole = bool((values % 1 == 0).all())

    if is_whole:
        low_int, high_int = int(low), int(high)
        if low_int >= 0:
            candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
        else:
            candidates = (np.int8, np.int16, np.int32, np.int64)
        for candidate in candidates:
            bounds = np.iinfo(candidate)
            if bounds.min <= low_int and high_int <= bounds.max:
                return values.astype(candidate)
        # Integer-valued but outside every integer range: leave untouched.
        return values

    # Non-integer data: float32 (float16 would overflow above 65504 and keeps
    # only about three significant digits).
    float32_limit = np.finfo(np.float32).max
    if -float32_limit <= low and high <= float32_limit:
        return values.astype(np.float32)
    return values


def reduce_mem_usage(props):
    """Shrink the memory footprint of a DataFrame by downcasting its columns.

    * object (string) columns become ``category``;
    * numeric columns have NaN replaced by ``column_min - 1`` and are cast to
      the smallest integer type that holds them when every value is a whole
      number, otherwise to ``float32``;
    * any other dtype (bool, category, datetime, ...) is left as it is, so the
      function can safely be applied twice to the same frame.

    Progress and the before/after memory usage are printed.

    Args:
        props: the DataFrame to shrink. It is modified in place.

    Returns:
        The same DataFrame object, with its columns converted.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")

    for col in props.columns:
        dtype_before = props[col].dtype

        print("******************************")
        print("Column: ",col)
        print("dtype before: ",dtype_before)

        if dtype_before == object:
            props[col] = props[col].astype('category')
        elif (pd.api.types.is_numeric_dtype(dtype_before)
              and not pd.api.types.is_bool_dtype(dtype_before)):
            props[col] = _downcast_numeric(props[col])

        print("dtype after: ",props[col].dtype)
        print("******************************")

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")

    return props

***Categorical data***

In [0]:
def _load_categorical(path, columns, chunk_size=400):
    """Read ``columns`` of the CSV at ``path`` a slice of columns at a time.

    Each slice is passed through reduce_mem_usage as soon as it is read, so the
    un-shrunk data for the whole file is never held in memory at once. The
    slices are joined once at the end and keep their column names.
    """
    pieces = []
    for start in range(0, len(columns), chunk_size):
        wanted = list(columns[start:start + chunk_size])
        piece = pd.read_csv(path, usecols=wanted)[wanted]
        pieces.append(reduce_mem_usage(piece))
    return pd.concat(pieces, axis=1)


# Read only the header row to learn the column names.
column_name = pd.read_csv("/content/data/train_categorical.csv", nrows=0).columns

# Both splits are read with the training column list, so they end up with the
# same, named columns (assumes the test file contains every training column).
train_categorical = _load_categorical("/content/data/train_categorical.csv", column_name)
test_categorical = _load_categorical("/content/data/test_categorical.csv", column_name)

# Step 3. EDA

### See if there are NA

In [0]:
# Number of missing entries in each categorical column.
train_categorical.isna().sum()

### The distribution of 1, 0

This tells us how to set the class weights in the model.

In [0]:
# Binary label of every training row.
train_y = train_numeric.loc[:,"Response"].astype("int16")
x_train_label_to_draw = pd.DataFrame(dict(target=train_y))

# Bar chart of the share of each class. The estimator turns the number of rows
# in each bar into a percentage of all rows.
pic_0_1 = sns.catplot(x="target", y="target", data=x_train_label_to_draw, kind="bar",
                      height=4, aspect=1, estimator=lambda x: len(x) / len(x_train_label_to_draw) * 100)

pic_0_1.set(ylabel="Percent")
pic_0_1.fig.suptitle("0 and 1 proportion")

# Print the same shares as numbers; a class that never occurs is reported as 0 %.
class_share = x_train_label_to_draw["target"].value_counts(normalize=True)
for label in (0, 1):
    print("{}: {:.2f} %".format(label, class_share.get(label, 0.0) * 100))

# Step 4. Feature engineer

### Numeric



***Use XGboost to find out the important columns***

In [0]:
from xgboost import XGBClassifier
from xgboost import plot_importance

In [0]:
# Locate the label column by name instead of a hard-coded position; every column
# before it is treated as a feature (the label is expected to be the last column).
response_pos = train_numeric.columns.get_loc("Response")

X = train_numeric.iloc[:, :response_pos]
col_list = X.columns

y = train_numeric.iloc[:, response_pos]

X = X.values
y = np.nan_to_num(y)

clf = XGBClassifier(base_score=0.005) # use model to find important columns
clf.fit(X, y)

# Histogram of the non-zero importances, to help choose a threshold that keeps
# a manageable number of features.
fig, ax = plt.subplots(nrows=1, ncols=1)
plt.hist(clf.feature_importances_[clf.feature_importances_ > 0])
fig.savefig("/content/output/xgboost_hist.png") ## to see the important feature pic
plt.close(fig)

# Per-feature importance chart. The label calls below set the font sizes;
# NOTE(review): plot_importance presumably replaces the placeholder label texts — confirm.
fig, ax = plt.subplots(figsize=(20,50))
plt.xlabel('xlabel', fontsize=18)
plt.ylabel('ylabel', fontsize=18)
plt.xticks(size=12)
plt.yticks(size=12)
myplt = plot_importance(clf, ax=ax)
fig.savefig("/content/output/xgboost_importantfeatures.png") ## to see the important feature pic
plt.close(fig)

# Positions of the features whose importance exceeds the threshold ...
important_indices = np.where(clf.feature_importances_ > 0.005)[0] ## to find out the important columns

# ... and the matching column names, in their original order.
important_columns = list(col_list[important_indices])

In [0]:
important_columns

***Normalise and Fill numeric NAN***

In [0]:
# Keep only the columns flagged as important, in both splits.
selected_train_numeric = train_numeric[important_columns]
selected_test_numeric = test_numeric[important_columns]

def normalise_and_fill(train_part, test_part):
    """Standardise both frames with the training statistics and fill missing values.

    Each column is shifted by the training mean and divided by the training
    standard deviation (both computed ignoring NaN). Missing values are then
    set to 0, which is the training mean on the standardised scale.

    Args:
        train_part: DataFrame of training features.
        test_part: DataFrame of test features with the same columns, in the
            same order, as ``train_part``.

    Returns:
        A ``(train_part, test_part)`` tuple of new, standardised DataFrames
        without missing values.
    """
    mean_nu = np.nanmean(train_part, axis=0)
    std_nu = np.array(np.nanstd(train_part, axis=0))

    # Constant (std == 0) or all-missing (std is NaN) columns would otherwise
    # divide by zero / NaN; dividing by 1 leaves them centred at 0.
    std_nu[~(std_nu > 0)] = 1

    train_part = ((train_part - mean_nu) / std_nu).fillna(0)
    test_part = ((test_part - mean_nu) / std_nu).fillna(0)

    return train_part, test_part


# Standardise the selected numeric features and replace their missing values.
normalised_pair = normalise_and_fill(selected_train_numeric, selected_test_numeric)
selected_train_numeric, selected_test_numeric = normalised_pair

# Show the result for the training split first, then the test split.
for normalised_frame in (selected_train_numeric, selected_test_numeric):
    print(normalised_frame)

### Date

***Drop similar columns***

I drop the similar columns because they are redundant when training the model and could harm the predictions through multicollinearity.

In [0]:
def getunique(df):
    """Return the column names of ``df`` with adjacent duplicate columns collapsed.

    The columns are scanned from left to right. A column is kept when its
    content differs from the most recently kept column, so every run of
    neighbouring identical columns is represented by its first member.
    Identical columns that are not next to each other are all kept.
    The first column is always kept.
    """
    names = df.columns.values
    kept_names = [names[0]]

    # Content of the column that opened the current run of duplicates.
    run_head = df.iloc[:, 0]

    for position in range(1, df.shape[1]):
        candidate = df.iloc[:, position]
        if run_head.equals(candidate):
            continue
        kept_names.append(names[position])
        run_head = candidate

    return kept_names

In [0]:
# Names of the date columns to keep, as selected by getunique on the training split.
uniqueColumnNames = getunique(train_date)

In [0]:
print(len(uniqueColumnNames))

***Fill NA in the Date***

In [0]:
# Keep the same de-duplicated date columns in both splits and replace missing values with 0.
selected_train_date = train_date[uniqueColumnNames].fillna(0)
selected_test_date = test_date[uniqueColumnNames].fillna(0)

In [0]:
# Feature matrices: numeric columns followed by date columns, as NumPy arrays.
train_x = pd.concat([selected_train_numeric, selected_train_date], axis=1).to_numpy()
test_x = pd.concat([selected_test_numeric, selected_test_date], axis=1).to_numpy()

# Label vector for the training rows.
train_y = train_numeric["Response"].astype("int16").to_numpy()

In [0]:
selected_train_date.shape

# Step 5. K-Fold Validation

A function for building up the model, so that when we need to build the model, we don't need to do the redundant work.

In [0]:
from keras import models, layers, regularizers

def build_model(input_dim=None): # a function to build model
    """Build and compile the fully connected binary classifier.

    Args:
        input_dim: number of input features. When omitted, the width of the
            global ``train_x`` is used, which is what ``build_model()`` did
            before this parameter existed.

    Returns:
        A compiled Sequential model with four L2-regularised ReLU layers
        (80, 40, 30 and 25 units, dropout after the first two) and a single
        sigmoid output unit, trained with RMSprop on binary cross-entropy.
    """
    if input_dim is None:
        input_dim = len(train_x[0])

    model = models.Sequential()

    model.add(layers.Dense(80, activation="relu", kernel_regularizer = regularizers.l2(0.001), input_shape=(input_dim,) ) )
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(40, activation="relu", kernel_regularizer = regularizers.l2(0.001)))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(30, activation="relu", kernel_regularizer = regularizers.l2(0.001)))
    model.add(layers.Dense(25, activation="relu", kernel_regularizer = regularizers.l2(0.001)))
    # NOTE(review): l1_l2 receives a single positional value, so presumably only
    # its first factor is 0.001 and the other keeps the library default — confirm.
    model.add(layers.Dense(1, activation="sigmoid", kernel_regularizer = regularizers.l1_l2(0.001)))

    model.compile(optimizer= "rmsprop", loss= "binary_crossentropy", metrics=["accuracy"])

    return model

K-fold Validation

In [0]:
num_portion = 3
# Rows per validation fold. The len(train_x) % num_portion leftover rows at the
# end are never used for validation; they are part of every training portion.
num_val_sameples = len(train_x) // num_portion
num_epochs = 10

# four lists to record loss, val_loss, acc, val_acc (one entry per fold,
# each entry holding one value per epoch)
all_loss = []
all_val_loss = []
all_acc = []
all_val_acc = []

# set class_weights, so we can avoid that the model tends to predict data as 0
class_weights = {0:0.05, 1:10}

for i in range(num_portion): # go through every part of validation
    print("processing fold {} ...".format(i+1))

    # Contiguous slice held out for validation in this fold.
    val_start = i * num_val_sameples
    val_stop = (i + 1) * num_val_sameples

    val_data = train_x[val_start:val_stop]
    val_label = train_y[val_start:val_stop]

    # Everything before and after the held-out slice is used for training.
    prartial_x_train = np.concatenate([train_x[:val_start], train_x[val_stop:]], axis=0)
    prartial_x_train_label = np.concatenate([train_y[:val_start], train_y[val_stop:]], axis=0)

    # A fresh model per fold, so no weights carry over between folds.
    model = build_model()

    history = model.fit(prartial_x_train, prartial_x_train_label, validation_data=(val_data, val_label), epochs=num_epochs,
                        batch_size=1024, verbose=0, class_weight=class_weights)

    loss = history.history["loss"]
    val_loss = history.history["val_loss"]
    acc = history.history["accuracy"]
    val_acc = history.history["val_accuracy"]

    all_loss.append(loss)
    all_val_loss.append(val_loss)
    all_acc.append(acc)
    all_val_acc.append(val_acc)

    print()

print("< ====== K-fold Validation ends ====== >")

Check the performance and decide the threshold used to turn the predicted probabilities into True or False

In [0]:
from sklearn.metrics import roc_auc_score

# `model` is the network left over from the last K-fold iteration; it is scored
# here on the full training set, which includes rows it was trained on.
train_predict = model.predict(train_x)
# ROC AUC is used (presumably in place of accuracy) because the classes are extremely imbalanced.
roc_auc_score(train_y, train_predict) ## to see the roc auc instead of auc, because there is extremely imbalanced

In [0]:
# Smallest and largest predicted value on the training set.
print(train_predict.min())
print(train_predict.max())

In [0]:
# Decision threshold: the midpoint between the smallest and largest predicted value.
standard = (train_predict.max() + train_predict.min())/2
standard

In [0]:
# change the probability into "True" / "False": rows whose predicted value
# (first and only output column) is strictly above the threshold are "True".
pred = np.where(train_predict[:, 0] > standard, "True", "False")

# Number of rows labelled "True".
num = int(np.count_nonzero(pred == "True"))
num

Plot the loss and accuracy

In [0]:
# Average each metric over the folds, one value per epoch.
average_all_loss = []
average_all_val_loss = []
average_all_acc = []
average_all_val_acc = []

for epoch in range(num_epochs):
    average_all_loss.append(np.mean([fold_loss[epoch] for fold_loss in all_loss]))
    average_all_val_loss.append(np.mean([fold_val_loss[epoch] for fold_val_loss in all_val_loss]))
    average_all_acc.append(np.mean([fold_acc[epoch] for fold_acc in all_acc]))
    average_all_val_acc.append(np.mean([fold_val_acc[epoch] for fold_val_acc in all_val_acc]))

In [0]:
# Fold-averaged training loss (blue dots) and validation loss (orange line) per epoch.
plt.plot(range(len(average_all_loss)), average_all_loss, 'bo', label = "training loss")
# '-' instead of 'b': the format string must not name a colour when color= is
# also given, otherwise matplotlib warns that the colour is defined twice.
plt.plot(range(len(average_all_val_loss)), average_all_val_loss, '-', color='orange', label = "validation loss")

plt.xlabel("Epochs")
plt.ylabel("loss")
plt.legend()
plt.show()

In [0]:
# Start from an empty figure, then draw the fold-averaged training accuracy
# (blue dots) and validation accuracy (orange line) per epoch.
plt.clf()
plt.plot(range(len(average_all_acc)), average_all_acc, 'bo', label = "training acc")
# '-' instead of 'b': the format string must not name a colour when color= is
# also given, otherwise matplotlib warns that the colour is defined twice.
plt.plot(range(len(average_all_val_acc)), average_all_val_acc, '-', color='orange', label = "validation acc")

plt.xlabel("Epochs")
plt.ylabel("accuracy")
plt.legend()
plt.show()

# Step 6. Build models to predict test data

Rebuild the model by using the hyper-parameters got from K-fold validation

In [0]:
# Fresh, untrained network from the same builder used during K-fold validation.
final_model = build_model()

class_weights = {0:0.05, 1:10} # set class_weights, so we can avoid that the model tends to predict data as 0

# Train on the whole training set (no validation data is passed here).
# NOTE(review): epochs=5 here, while the K-fold cell uses num_epochs = 10 — confirm this is intended.
final_model.fit(train_x, train_y, epochs=5, batch_size=1024, verbose=0, class_weight=class_weights)

# Model output (sigmoid values) for every test row.
answer = final_model.predict(test_x)

See the predictions and their min and max

In [0]:
answer

In [0]:
# Smallest and largest predicted value on the test set.
print(answer.min())
print(answer.max())

In [0]:
# Decision threshold: the midpoint between the smallest and largest predicted
# value, computed the same way as for the training predictions. Without the
# division the threshold is max + min, which no prediction can exceed when the
# outputs are non-negative, so every row would be labelled "False".
standard = (answer.max() + answer.min())/2
standard

Transform the predicted probabilities into Booleans

In [0]:
# change the probability into "True" / "False": rows whose predicted value
# (first and only output column) is strictly above the threshold are "True".
output = np.where(answer[:, 0] > standard, "True", "False")

See how many predictions are "True"

In [0]:
# Number of test rows labelled "True".
num = int(np.count_nonzero(output == "True"))
num

In [0]:
output.shape

# Step 7. Output the outcome to csv

In [0]:
# Example submission file; it is used as the template for the output.
sub_try = pd.read_csv('/content/data/sample.csv')  # read the ID of example output

# Wrap the predicted labels in a one-column frame and put them into the template.
output = pd.DataFrame(columns=["Response"], data=output)
sub_try.Response = output # set its Target as what we get from the prediction

# Write the submission without the row index.
sub_try.to_csv('/content/output/answer.csv', index = False) # output the outcome as csv document