In [5]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, roc_curve, auc
import gc
import scipy.ndimage
from fastai.tabular.all import *
from google.colab import drive

Importing train.csv and test.csv

In [6]:
train = pd.read_csv("/content/drive/MyDrive/STAT4450_contest2_data/train.csv")
test = pd.read_csv("/content/drive/MyDrive/STAT4450_contest2_data/test.csv")

Removing fake results from test dataframe

In [7]:
test_codes = test["ID_code"].values
test.drop(["ID_code"], axis=1, inplace=True)
test = test.values

unique_count = np.zeros_like(test)  # same col & row size
for i in tqdm(range(test.shape[1])):
    _, index_, count_ = np.unique(test[:, i], return_counts=True, return_index=True)
    unique_count[index_[count_ == 1], i] += 1

real_idx = np.argwhere(np.sum(unique_count, axis=1) > 0)[:, 0]
synth_idx = np.argwhere(np.sum(unique_count, axis=1) == 0)[:, 0]

test_length = test.shape[0]
test_synth = test[synth_idx, :]
test = test[real_idx, :]

# 2. Append real test to training
target = np.array(list(train["target"].values))
train_length = train.shape[0]
train.drop(["ID_code"], axis=1, inplace=True)
train.drop(["target"], axis=1, inplace=True)
full = pd.DataFrame(np.concatenate([train.values, test]), columns=train.columns)
synth = pd.DataFrame(test_synth, columns=train.columns)
print(full.shape)

# Reverse features with negative correlation
features = [x for x in full.columns if x.startswith("var")]
for var in features:
    if np.corrcoef(target, train[var])[1][0] < 0:
        full[var] = full[var] * -1

# Getting Counts, Density, Deviation
sigma_fac = 0.001
sigma_base = 4
eps = 0.00000001

100%|██████████| 200/200 [00:07<00:00, 26.94it/s]


(300000, 200)


Defining a function to get feature counts

In [8]:
def get_features(data):
    # Get counts
    features_count = np.zeros((data.shape[0], len(features)))
    features_density = np.zeros((data.shape[0], len(features)))
    features_deviation = np.zeros((data.shape[0], len(features)))

    sigmas = []
    for i, var in enumerate(tqdm(features)):
        data_int = (data[var].values * 10000).round().astype(int)
        low = data_int.min()
        data_int -= low
        high = data_int.max() + 1
        counts_data = np.bincount(data_int, minlength=high).astype(float)

        # Geometric mean of twice sigma_base and a sigma_scaled which is scaled to the length of array
        sigma_scaled = counts_data.shape[0] * sigma_fac
        sigma = np.power(sigma_base * sigma_base * sigma_scaled, 1 / 3)
        sigmas.append(sigma)
        counts_data_smooth = scipy.ndimage.filters.gaussian_filter1d(counts_data, sigma)
        deviation = counts_data / (counts_data_smooth + eps)
        indices = data_int
        features_count[:, i] = counts_data[indices]
        features_density[:, i] = counts_data_smooth[indices]
        features_deviation[:, i] = deviation[indices]

    features_count_names = [var + "_count" for var in features]
    features_density_names = [var + "_density" for var in features]
    features_deviation_names = [var + "_deviation" for var in features]

    data_count = pd.DataFrame(columns=features_count_names, data=features_count)
    data_count.index = data.index
    data_density = pd.DataFrame(columns=features_density_names, data=features_density)
    data_density.index = data.index
    data_deviation = pd.DataFrame(
        columns=features_deviation_names, data=features_deviation
    )
    data_deviation.index = data.index
    data = pd.concat([data, data_count, data_density, data_deviation], axis=1)

    features_count = features_count_names
    features_density = features_density_names
    features_deviation = features_deviation_names

    return data, features_count, features_density, features_deviation

Getting the features and standardizing features

In [9]:
full, features_count, features_density, features_deviation = get_features(full)
(
    synth,
    fake_features_count,
    fake_features_density,
    fake_features_deviation,
) = get_features(synth)
print(full.shape)
print(f"test_synth: {synth.shape}")

# Standardizing the features
features_to_scale = [features, features_count]

scaler = StandardScaler()
features_to_scale_flatten = [var for sublist in features_to_scale for var in sublist]
scaler.fit(full[features_to_scale_flatten])
features_scaled = scaler.transform(full[features_to_scale_flatten])
full[features_to_scale_flatten] = features_scaled
print(full.shape)

# Split back into train and test
train = full.iloc[:train_length, :]
test = full.iloc[train_length:, :]
del full

gc.collect()
print(train.shape, test.shape)

  counts_data_smooth = scipy.ndimage.filters.gaussian_filter1d(counts_data, sigma)
100%|██████████| 200/200 [00:15<00:00, 13.06it/s]
  counts_data_smooth = scipy.ndimage.filters.gaussian_filter1d(counts_data, sigma)
100%|██████████| 200/200 [00:10<00:00, 19.99it/s]


(300000, 800)
test_synth: (100000, 800)
(300000, 800)
(200000, 800) (100000, 800)


Light Gradient Boosting Machine (LGBM), 70-30 Split

In [10]:
x_train, x_val, y_train, y_val = train_test_split(
    train, target, test_size=0.3, random_state=123
)

train_data = lgb.Dataset(x_train, label=y_train)
val_data = lgb.Dataset(x_val, label=y_val)


# Param notes:
# 1. base
# 2. reg_lambda to 0.1 improve by 3%, num_leaves = 16
# 3. reg_lambda to 0.5
# 4. num_leaves to 64
# 5. num_leaves to 128, max_bin to 512
# 6. learning_rate to 0.05, num_rounds to 500
# 7. feature_fraction to 0.9
# 8*. Kazuki Onodera's submission params
# 9*. 8* combined with 7, leaving 1600 rounds
# 10*. 

# num_rounds = 500
# params = {
#     "boost_from_average": "false",
#     "boost": "gbdt",
#     "feature_fraction": 0.9,
#     "learning_rate": 0.05,
#     "max_depth": -1,
#     "metric": "binary_logloss",
#     "num_leaves": 64,
#     "num_threads": 4, #Only 4 Colab Pro cores
#     "tree_learner": "serial",
#     "objective": "binary",
#     "reg_alpha": 2, #Reduces features
#     "reg_lambda": 0.5, 
#     "verbosity": 1,
#     "max_bin": 512,
# }

# 8,9* model params below:
num_rounds = 1600
params = {
    'bagging_freq': 5,
    'bagging_fraction': 1.0,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.9,
    'learning_rate': 0.05,
    'max_depth': -1,
    'metric':'binary_logloss',
    'min_data_in_leaf': 30,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 64,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary',
    'verbosity': 1,
    "max_bin": 512,
    }

In [None]:
# train the model
model = lgb.train(params, train_data, num_rounds, valid_sets=[val_data])

# make predictions on the validation data
# y_pred = model.predict(x_val)

# print the accuracy
# accuracy = accuracy_score(y_val, y_pred.round().astype(int))
# print(accuracy)

# Make final prediction for test
sub = pd.DataFrame({"ID_code": test_codes})

pred_real = model.predict(test)
pred_synth = model.predict(synth)

preds_all = np.zeros(test_length)
preds_all[real_idx] = pred_real
preds_all[synth_idx] = pred_synth
sub["target"] = preds_all
sub.to_csv("/content/drive/MyDrive/STAT4450_contest2_data/lgbsubmission10.csv", index=False)
print(sub.head(20))

LGBM, 5-Fold CV

In [7]:
# ============================================================================
# 5-Fold Cross Validation
# ============================================================================

kf = KFold(n_splits=5, shuffle=True, random_state=123)

accuracies = []
models = []
i = 0

av_preds_all = np.zeros(test_length)

# Loop through each fold
for train_index, val_index in kf.split(train):
    # Split the data into train and validation sets for this fold
    # label=pd.DataFrame(target).iloc[train_index],
    X_train = lgb.Dataset(train.iloc[train_index], label=target[train_index])
    X_val = lgb.Dataset(train.iloc[val_index], label=target[val_index])
    y_val = target[val_index]

    models.append(lgb.train(params, X_train, num_rounds, valid_sets=[X_val]))

    # Make predictions on the validation set for this fold
    y_pred = models[i].predict(train.iloc[val_index])

    # Make predictions on the test set for this fold
    # Instead of simply grabbing the highest scoring model, here we are trying to average all the models
    pred_real = models[i].predict(test)
    pred_synth = models[i].predict(synth)

    # Make predictions on the real and synthetic sets seperately then concatenate for final pred
    av_preds = np.zeros(test_length)
    av_preds[real_idx] = pred_real
    av_preds[synth_idx] = pred_synth

    av_preds_all += av_preds

    # Calculate the validation metric for this fold
    # Add the validation result to the list
    accuracy = accuracy_score(y_val, y_pred.round().astype(int))
    print(f"Accuracy on fold {i + 1}: {accuracy}")
    accuracies.append(accuracy)
    i += 1

# Calculate the average validation score across all folds
print(f"Mean accuracies: {np.mean(accuracies)}")
print(f"List of accuracies: {accuracies}")

# Select the best model to use in our final prediction
model = models[accuracies.index(max(accuracies))]

# Average the predictions made from av_preds_all
av_preds_all /= kf.get_n_splits()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[1422]	valid_0's binary_logloss: 0.205211
[1423]	valid_0's binary_logloss: 0.205222
[1424]	valid_0's binary_logloss: 0.205229
[1425]	valid_0's binary_logloss: 0.205245
[1426]	valid_0's binary_logloss: 0.205259
[1427]	valid_0's binary_logloss: 0.20527
[1428]	valid_0's binary_logloss: 0.205299
[1429]	valid_0's binary_logloss: 0.2053
[1430]	valid_0's binary_logloss: 0.205327
[1431]	valid_0's binary_logloss: 0.205333
[1432]	valid_0's binary_logloss: 0.205344
[1433]	valid_0's binary_logloss: 0.205365
[1434]	valid_0's binary_logloss: 0.205381
[1435]	valid_0's binary_logloss: 0.205402
[1436]	valid_0's binary_logloss: 0.205401
[1437]	valid_0's binary_logloss: 0.205408
[1438]	valid_0's binary_logloss: 0.205419
[1439]	valid_0's binary_logloss: 0.205431
[1440]	valid_0's binary_logloss: 0.205455
[1441]	valid_0's binary_logloss: 0.205469
[1442]	valid_0's binary_logloss: 0.205486
[1443]	valid_0's binary_logloss: 0.205499
[1444]	valid_0

In [8]:
# ============================================================================
# Final predictions on test set
# ============================================================================

# Get the original id codes
sub = pd.DataFrame({"ID_code": test_codes})
av_sub = pd.DataFrame({"ID_code": test_codes})

pred_real = model.predict(test)
pred_synth = model.predict(synth)

# Make predictions on the real and synthetic sets seperately then concatenate for final pred
preds_all = np.zeros(test_length)
preds_all[real_idx] = pred_real
preds_all[synth_idx] = pred_synth

# Prediction on best result from 5-fold cv
sub["target"] = preds_all
sub.to_csv("submission.csv", index=False)
print(f"Best result sub: {sub.head(20)}")

# Prediction on average result from 5-fold cv
av_sub["target"] = av_preds_all
av_sub.to_csv("av_submission0.csv", index=False)
print(f"Average result sub: {av_sub.head(20)}")

j = 0
for i in model:
  model.save_model('/content/drive/MyDrive/STAT4450_contest2_data/models/fivecv_lgbm' + str(j) + '.txt')
  j = j + 1

Best result sub:     ID_code    target
0    test_0  1.000000
1    test_1  1.000000
2    test_2  1.000000
3    test_3  0.213761
4    test_4  1.000000
5    test_5  1.000000
6    test_6  1.000000
7    test_7  0.027681
8    test_8  1.000000
9    test_9  1.000000
10  test_10  1.000000
11  test_11  0.006097
12  test_12  1.000000
13  test_13  1.000000
14  test_14  1.000000
15  test_15  0.017834
16  test_16  0.294565
17  test_17  0.011540
18  test_18  0.026875
19  test_19  1.000000
Average result sub:     ID_code    target
0    test_0  1.000000
1    test_1  1.000000
2    test_2  1.000000
3    test_3  0.134592
4    test_4  1.000000
5    test_5  1.000000
6    test_6  1.000000
7    test_7  0.033468
8    test_8  1.000000
9    test_9  1.000000
10  test_10  1.000000
11  test_11  0.012267
12  test_12  1.000000
13  test_13  1.000000
14  test_14  1.000000
15  test_15  0.015017
16  test_16  0.322596
17  test_17  0.012207
18  test_18  0.059600
19  test_19  1.000000


Convolutional Neural Network, fastai

In [11]:
# note, lgbm model is called "model"
val_idx = x_val.index.tolist()
train_df = pd.concat([pd.DataFrame(train),pd.DataFrame(target)],axis=1)
train_df.rename(columns={0:'target'},inplace=True)
y_block = CategoryBlock()
dls = TabularDataLoaders.from_df(train_df, y_names='target',y_block=y_block,valid_idx=val_idx, bs=64)
learn = tabular_learner(dls,[200,100], metrics=[accuracy, RocAucBinary()])
learn.fit_one_cycle(3)

epoch,train_loss,valid_loss,accuracy,roc_auc_score,time
0,0.226951,0.229047,0.914467,0.868228,00:37
1,0.22481,0.221603,0.917317,0.877395,00:37
2,0.170307,0.233712,0.913167,0.872641,00:38


In [29]:
df_test = pd.DataFrame(test)
df_synth = pd.DataFrame(synth)
dl = learn.dls.test_dl(df_test)
dl2 = learn.dls.test_dl(df_synth)
preds1 = learn.get_preds(dl=dl)
preds2 = learn.get_preds(dl=dl2)

preds_all = np.zeros(test_length)
preds_all[real_idx] = preds1[0][:,1].tolist()
preds_all[synth_idx] = preds2[0][:,1].tolist()

# Prediction on best result from 5-fold cv
sub = pd.DataFrame({"ID_code": test_codes})
sub["target"] = preds_all
sub.to_csv("/content/drive/MyDrive/STAT4450_contest2_data/nnsubmission0.csv", index=False)
print(sub.head(20))

    ID_code    target
0    test_0  1.000000
1    test_1  1.000000
2    test_2  1.000000
3    test_3  0.110570
4    test_4  1.000000
5    test_5  1.000000
6    test_6  1.000000
7    test_7  0.111808
8    test_8  1.000000
9    test_9  1.000000
10  test_10  1.000000
11  test_11  0.009460
12  test_12  1.000000
13  test_13  1.000000
14  test_14  1.000000
15  test_15  0.000953
16  test_16  0.195976
17  test_17  0.008303
18  test_18  0.246055
19  test_19  1.000000


Combining LGBM and CNN

In [66]:
from sklearn.linear_model import LogisticRegression

# combining the predictions to make a new feature matrix
df_val = pd.DataFrame(x_val)
df_val.rename(columns={0:'target'},inplace=True)
dl = learn.dls.test_dl(df_val)
lgb_preds = model.predict(x_val)
cnn_preds = learn.get_preds(dl=dl)[0][:,0]
combined_preds = np.hstack((lgb_preds.reshape((-1, 1)), cnn_preds.reshape((-1, 1))))

# meta model training
# meta_model = LogisticRegression()
# meta_model.fit(combined_preds,y_val)
train_data = lgb.Dataset(combined_preds, label=y_val)
meta_model = lgb.train(params, train_data, num_rounds)

# preprocessing final test data
df_test = pd.concat([pd.DataFrame(test),pd.DataFrame(synth)],axis=0)
df_test.rename(columns={0:'target'},inplace=True)
dl = learn.dls.test_dl(df_test)

# Getting predictions to make final predictions
lgb_test_preds_real = model.predict(test)
lgb_test_preds_synth = model.predict(synth)
lgb_test_preds_both = np.concatenate([lgb_test_preds_real,lgb_test_preds_synth])
cnn_test_preds_both = learn.get_preds(dl=dl)[0][:,0]
test_combined_preds = np.hstack((lgb_test_preds_both.reshape(-1, 1), cnn_test_preds_both.reshape(-1, 1)))
final_preds = meta_model.predict(test_combined_preds)

# Make final prediction for test
sub = pd.DataFrame({"ID_code": test_codes})

preds_all = np.zeros(test_length)
preds_all[real_idx] = final_preds[real_idx]
preds_all[synth_idx] = final_preds[synth_idx]

sub["target"] = preds_all
sub.to_csv("/content/drive/MyDrive/STAT4450_contest2_data/stacksubmission1.csv", index=False)
print(sub.head(20))

ValueError: ignored

Neural Network from Scratch

In [None]:
from re import X
# Predicting with a neural net
df_train = pd.concat([pd.DataFrame(train),pd.DataFrame(target)], axis=1)
df_train.rename(columns={0:'target'},inplace=True)
splits = RandomSplitter()(range_of(df_train))
y_names = 'target'
cont_names = df_train.columns.tolist()[-len(df_train.columns.tolist())]
procs = [FillMissing, Normalize]
y_block = CategoryBlock()

# Building TabularPandas
to = TabularPandas(df_train.reset_index(), procs=procs, cont_names=cont_names, y_names=y_names, y_block=y_block, splits=splits)

## creating a dataloader
dls = to.dataloaders()

learn = tabular_learner(dls,[200,100],metrics=accuracy)

# Training the NN
learn.fit_one_cycle(10)
learn.lr_find()
result = learn.validate()
print(result)

In [None]:
# TODO: Make predictions for submit to Kaggle
df_test = pd.concat([pd.DataFrame(test),pd.DataFrame(synth)],axis=0)
dl = learn.dls.test_dl(df_test)
preds = learn.get_preds(dl=dl)
result = learn.validate()


In [None]:
target = pd.DataFrame(preds[0][:,0].tolist())
ID_codes = pd.DataFrame(pd.read_csv("/content/drive/MyDrive/STAT4450_contest2_data/test.csv")['ID_code'])
submission0 = pd.concat([ID_codes,target],axis=1)
submission0.columns = list(["ID_code","target"])
submission0.to_csv("/content/drive/MyDrive/STAT4450_contest2_data/submission0.csv", index=False)