In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_curve, auc
import gc
import scipy.ndimage
from fastai.tabular.all import *
from google.colab import drive

Importing train.csv and test.csv

In [None]:
# Mount Google Drive so the /content/drive paths below exist on a fresh Colab
# runtime (the notebook imports `drive` but otherwise never mounts it).
# Calling this when the drive is already mounted is harmless.
drive.mount("/content/drive")

# Raw competition files; later cells rebind `train` and `test`.
train = pd.read_csv("/content/drive/MyDrive/STAT4450_contest2_data/train.csv")
test = pd.read_csv("/content/drive/MyDrive/STAT4450_contest2_data/test.csv")

Separating synthetic (fake) rows from the real rows of the test dataframe

In [None]:
# NOTE: this cell rebinds `train` and `test`, so it expects the freshly loaded
# frames from the loading cell (re-running it on its own will fail).

# 1. Split the test rows into "real" and "synthetic"
test_codes = test["ID_code"].values
test = test.drop(columns=["ID_code"]).values

# unique_count[r, c] becomes 1 when the value at (r, c) occurs exactly once in
# column c. np.unique returns the first-occurrence row of every distinct value,
# which for a count of 1 is the only row holding that value.
unique_count = np.zeros_like(test)  # same col & row size
for i in tqdm(range(test.shape[1])):
    _, index_, count_ = np.unique(test[:, i], return_counts=True, return_index=True)
    unique_count[index_[count_ == 1], i] += 1

# Rows with at least one column-unique value are treated as real; rows with
# none are treated as synthetic.
unique_per_row = np.sum(unique_count, axis=1)
real_idx = np.argwhere(unique_per_row > 0)[:, 0]
synth_idx = np.argwhere(unique_per_row == 0)[:, 0]

test_length = test.shape[0]
test_synth = test[synth_idx, :]
test = test[real_idx, :]

# 2. Append real test to training
target = train["target"].to_numpy().copy()
train_length = train.shape[0]
train = train.drop(columns=["ID_code", "target"])
full = pd.DataFrame(np.concatenate([train.values, test]), columns=train.columns)
synth = pd.DataFrame(test_synth, columns=train.columns)
print(full.shape)

# 3. Reverse features with negative correlation to the target.
# The flip is applied to `synth` as well: the same model scores both frames,
# so every feature must have the same orientation in both.
features = [x for x in full.columns if x.startswith("var")]
for var in features:
    if np.corrcoef(target, train[var])[1][0] < 0:
        full[var] = full[var] * -1
        synth[var] = synth[var] * -1

# Constants read by get_features (counts, density, deviation)
sigma_fac = 0.001  # scales the smoothing width with the number of histogram bins
sigma_base = 4  # base smoothing width
eps = 0.00000001  # guards the count / density division against zero

 75%|███████▌  | 150/200 [00:05<00:01, 30.82it/s]

Defining a function that builds the per-feature count, density and deviation features

In [None]:
def get_features(
    data,
    feature_names=None,
    sigma_factor=None,
    base_sigma=None,
    epsilon=None,
    show_progress=True,
):
    """Append value-count, smoothed-density and deviation columns for each feature.

    For every feature the values are quantised to 4 decimal places and
    histogrammed. Three values are then looked up for each row:

    * ``<var>_count``     - how many rows share the row's quantised value,
    * ``<var>_density``   - the Gaussian-smoothed histogram at that value,
    * ``<var>_deviation`` - ``count / (density + epsilon)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column listed in ``feature_names``.
    feature_names : sequence of str, optional
        Columns to process. Defaults to the notebook-level ``features`` list.
    sigma_factor : float, optional
        Multiplier applied to the histogram length. Defaults to the
        notebook-level ``sigma_fac``.
    base_sigma : float, optional
        Base smoothing width. Defaults to the notebook-level ``sigma_base``.
    epsilon : float, optional
        Added to the density before dividing. Defaults to the notebook-level
        ``eps``.
    show_progress : bool, default True
        Wrap the feature loop in a ``tqdm`` progress bar.

    Returns
    -------
    tuple
        ``(data_with_new_columns, count_names, density_names, deviation_names)``
        where the first item is a new DataFrame (the input is not modified)
        and the remaining items are lists of the generated column names.
    """
    # Fall back to the notebook globals so existing `get_features(df)` calls
    # keep working unchanged.
    names = list(features if feature_names is None else feature_names)
    sigma_factor = sigma_fac if sigma_factor is None else sigma_factor
    base_sigma = sigma_base if base_sigma is None else base_sigma
    epsilon = eps if epsilon is None else epsilon

    n_rows = data.shape[0]
    count_values = np.zeros((n_rows, len(names)))
    density_values = np.zeros((n_rows, len(names)))
    deviation_values = np.zeros((n_rows, len(names)))

    iterator = tqdm(names) if show_progress else names
    for i, var in enumerate(iterator):
        # Quantise to 4 decimals and shift so the smallest value lands in bin 0.
        data_int = (data[var].values * 10000).round().astype(int)
        data_int -= data_int.min()
        counts_data = np.bincount(data_int, minlength=data_int.max() + 1).astype(float)

        # Cube root of base_sigma^2 * sigma_scaled, i.e. the geometric mean of
        # base_sigma (taken twice) and a sigma scaled to the histogram length.
        sigma_scaled = counts_data.shape[0] * sigma_factor
        sigma = np.power(base_sigma * base_sigma * sigma_scaled, 1 / 3)

        # `scipy.ndimage.filters` is a deprecated namespace; call the function
        # from `scipy.ndimage` directly.
        counts_data_smooth = scipy.ndimage.gaussian_filter1d(counts_data, sigma)
        deviation = counts_data / (counts_data_smooth + epsilon)

        # Look the per-bin statistics back up for every row.
        count_values[:, i] = counts_data[data_int]
        density_values[:, i] = counts_data_smooth[data_int]
        deviation_values[:, i] = deviation[data_int]

    features_count_names = [var + "_count" for var in names]
    features_density_names = [var + "_density" for var in names]
    features_deviation_names = [var + "_deviation" for var in names]

    # Reuse the input index so the column-wise concat aligns row for row.
    data_count = pd.DataFrame(
        count_values, columns=features_count_names, index=data.index
    )
    data_density = pd.DataFrame(
        density_values, columns=features_density_names, index=data.index
    )
    data_deviation = pd.DataFrame(
        deviation_values, columns=features_deviation_names, index=data.index
    )
    data = pd.concat([data, data_count, data_density, data_deviation], axis=1)

    return data, features_count_names, features_density_names, features_deviation_names

Building the engineered features and standardizing the raw and count features

In [5]:
# NOTE: not idempotent - get_features appends columns to its input and `full`
# is deleted at the end, so re-run the preprocessing cell before re-running this.
full, features_count, features_density, features_deviation = get_features(full)
(
    synth,
    fake_features_count,
    fake_features_density,
    fake_features_deviation,
) = get_features(synth)
print(full.shape)
print(f"test_synth: {synth.shape}")

# Standardizing the features (raw values and counts only; density and
# deviation columns are left on their original scale)
features_to_scale = [features, features_count]

scaler = StandardScaler()
features_to_scale_flatten = [var for sublist in features_to_scale for var in sublist]
scaler.fit(full[features_to_scale_flatten])
features_scaled = scaler.transform(full[features_to_scale_flatten])
full[features_to_scale_flatten] = features_scaled
# Apply the same fitted transform to the synthetic rows. They are scored by
# the model trained on the scaled frame, so they must share its scale.
# get_features derives its column names from `features`, so both frames use
# identical names.
synth[features_to_scale_flatten] = scaler.transform(synth[features_to_scale_flatten])
print(full.shape)

# Split back into train and test (the first `train_length` rows are train)
train = full.iloc[:train_length, :]
test = full.iloc[train_length:, :]
del full

gc.collect()
print(train.shape, test.shape)

  counts_data_smooth = scipy.ndimage.filters.gaussian_filter1d(counts_data, sigma)
100%|██████████| 200/200 [00:12<00:00, 16.21it/s]
  counts_data_smooth = scipy.ndimage.filters.gaussian_filter1d(counts_data, sigma)
100%|██████████| 200/200 [00:08<00:00, 23.78it/s]


(300000, 800)
test_synth: (100000, 800)
(300000, 800)
(200000, 800) (100000, 800)


Light Gradient Boosting Machine (LGBM) 

In [None]:
# Hold out 30% of the training rows for validation (fixed seed for repeatability).
x_train, x_val, y_train, y_val = train_test_split(
    train, target, test_size=0.3, random_state=123
)

train_data = lgb.Dataset(x_train, label=y_train)
# `reference=train_data` makes the validation set reuse the training set's
# feature binning, as LightGBM expects for validation data.
val_data = lgb.Dataset(x_val, label=y_val, reference=train_data)


# Param notes:
# 1. base
# 2. reg_lambda to 0.1 improve by 3%, num_leaves = 16
# 3. reg_lambda to 0.5
# 4. num_leaves to 64
# 5. num_leaves to 128, max_bin to 512
# 6. learning_rate to 0.05, num_rounds to 500
# 7. feature_fraction to 0.9
# 8*. Kazuki Onodera's submission params

# Earlier configuration (notes 1-7), kept for reference only:
# num_rounds = 500
# params = {
#     "boost_from_average": "false",
#     "boost": "gbdt",
#     "feature_fraction": 0.9,
#     "learning_rate": 0.05,
#     "max_depth": -1,
#     "metric": "binary_logloss",
#     "num_leaves": 64,
#     "num_threads": 4, #Only 4 Colab Pro cores
#     "tree_learner": "serial",
#     "objective": "binary",
#     "reg_alpha": 2, #Reduces features
#     "reg_lambda": 0.5, 
#     "verbosity": 1,
#     "max_bin": 512,
# }

# 8* model params below:
num_rounds = 1600
params = {
    'bagging_freq': 5,
    'bagging_fraction': 1.0,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 1.0,
    'learning_rate': 0.05,
    'max_depth': -1,
    'metric':'binary_logloss',
    'min_data_in_leaf': 30,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 64,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary',
    'verbosity': -1
    }

# train the model; the validation log-loss is reported for `val_data`
model = lgb.train(params, train_data, num_rounds, valid_sets=[val_data])

# make predictions on the validation data
# y_pred = model.predict(x_val)

# print the accuracy
# accuracy = accuracy_score(y_val, y_pred.round().astype(int))
# print(accuracy)

# Make final prediction for test
sub = pd.DataFrame({"ID_code": test_codes})

pred_real = model.predict(test)
pred_synth = model.predict(synth)

# Scatter both prediction sets back to their original row positions so they
# line up with `test_codes`.
preds_all = np.zeros(test_length)
preds_all[real_idx] = pred_real
preds_all[synth_idx] = pred_synth
sub["target"] = preds_all
sub.to_csv("/content/drive/MyDrive/STAT4450_contest2_data/lgbsubmission7.csv", index=False)
print(sub.head(20))

[1]	valid_0's binary_logloss: 0.689868
[2]	valid_0's binary_logloss: 0.686627
[3]	valid_0's binary_logloss: 0.683414
[4]	valid_0's binary_logloss: 0.680231
[5]	valid_0's binary_logloss: 0.677076
[6]	valid_0's binary_logloss: 0.673958
[7]	valid_0's binary_logloss: 0.670867
[8]	valid_0's binary_logloss: 0.667803
[9]	valid_0's binary_logloss: 0.664772
[10]	valid_0's binary_logloss: 0.66177
[11]	valid_0's binary_logloss: 0.658795
[12]	valid_0's binary_logloss: 0.655857
[13]	valid_0's binary_logloss: 0.652941
[14]	valid_0's binary_logloss: 0.650054
[15]	valid_0's binary_logloss: 0.647197
[16]	valid_0's binary_logloss: 0.644369
[17]	valid_0's binary_logloss: 0.641564
[18]	valid_0's binary_logloss: 0.638785
[19]	valid_0's binary_logloss: 0.63603
[20]	valid_0's binary_logloss: 0.633303
[21]	valid_0's binary_logloss: 0.630602
[22]	valid_0's binary_logloss: 0.627925
[23]	valid_0's binary_logloss: 0.625274
[24]	valid_0's binary_logloss: 0.622646
[25]	valid_0's binary_logloss: 0.620044
[26]	valid_

Neural Network (fastai tabular learner)

In [None]:
# Predicting with a neural net
# Attach the labels to the engineered training features; `target` holds one
# label per training row in the same row order as `train`.
df_train = train.reset_index(drop=True).assign(target=target)

# Seeded random train/validation split so the reported metric is repeatable.
splits = RandomSplitter(seed=123)(range_of(df_train))
y_names = 'target'
# Every column except the label is a continuous input. (The previous
# `columns[-len(columns)]` expression picked only the first column name.)
cont_names = [col for col in df_train.columns if col != y_names]
procs = [FillMissing, Normalize]
y_block = CategoryBlock()

# Building TabularPandas
to = TabularPandas(df_train, procs=procs, cont_names=cont_names, y_names=y_names, y_block=y_block, splits=splits)

## creating a dataloader
dls = to.dataloaders()

learn = tabular_learner(dls,[200,100],metrics=accuracy)

# Learning-rate range test: run it before training, where it can inform the
# choice of learning rate.
learn.lr_find()

# Training the NN
learn.fit_one_cycle(10)
result = learn.validate()
print(result)

In [None]:
# Score every test row (real and synthetic) with the neural net.
# `test` holds the real rows and `synth` the synthetic ones, so stacking them
# gives real-then-synthetic order. `real_idx` / `synth_idx` give each row's
# original position, so the argsort of their concatenation is the permutation
# that restores the original test-file order. Predictions then line up with
# the ID codes.
df_test = pd.concat([test, synth], axis=0, ignore_index=True)
original_order = np.argsort(np.concatenate([real_idx, synth_idx]))
df_test = df_test.iloc[original_order].reset_index(drop=True)
assert len(df_test) == test_length, "every test row must be scored exactly once"

dl = learn.dls.test_dl(df_test)
preds = learn.get_preds(dl=dl)


In [None]:
# `preds[0]` holds one probability column per class, ordered by the category
# vocab (0, 1). The submission needs P(target == 1), i.e. column 1, not column 0.
# A fresh name is used so the training labels in `target` are not overwritten.
nn_target_probs = preds[0][:, 1].tolist()

# Reuse the ID codes captured during preprocessing instead of re-reading test.csv.
# Assumes the rows of `preds` are in the original test-file order - verify
# against the cell that builds the prediction frame.
ID_codes = pd.DataFrame({"ID_code": test_codes})
assert len(nn_target_probs) == len(ID_codes), "one prediction per ID_code expected"

submission0 = ID_codes.assign(target=nn_target_probs)
submission0.to_csv("/content/drive/MyDrive/STAT4450_contest2_data/submission0.csv", index=False)