In [224]:
!conda info


     active environment : kaggle-pgoct21
    active env location : C:\ProgramData\Anaconda3\envs\kaggle-pgoct21
            shell level : 1
       user config file : C:\Users\globetrekker\.condarc
 populated config files : C:\Users\globetrekker\.condarc
          conda version : 4.10.1
    conda-build version : 3.21.4
         python version : 3.8.8.final.0
       virtual packages : __win=0=0
                          __archspec=1=x86_64
       base environment : C:\ProgramData\Anaconda3  (writable)
      conda av data dir : C:\ProgramData\Anaconda3\etc\conda
  conda av metadata url : https://repo.anaconda.com/pkgs/main
           channel URLs : https://repo.anaconda.com/pkgs/main/win-64
                          https://repo.anaconda.com/pkgs/main/noarch
                          https://repo.anaconda.com/pkgs/r/win-64
                          https://repo.anaconda.com/pkgs/r/noarch
                          https://repo.anaconda.com/pkgs/msys2/win-64
                          https

In [225]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gc, time, copy
import utility as ut

In [226]:
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score

In [1]:
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

ModuleNotFoundError: No module named 'lightgbm'

In [228]:
# Raw competition data, read from the Kaggle input directory.
df_test = pd.read_csv('../input/tabular-playground-series-oct-2021/test.csv')
df_train = pd.read_csv('../input/tabular-playground-series-oct-2021/train.csv')

# Footprint of the training frame in MiB (one entry per column/index, then
# summed), recorded before any dtype changes so it can be compared later.
memory_usage = df_train.memory_usage(deep=True).div(1024 ** 2)
start_mem = memory_usage.sum()

#### Reduce memory usage (drop ids, downcast dtypes)

In [229]:
# Keep the test ids (needed for the submission file), then strip the id
# column from both frames so it is not used as a feature.
ids = df_test['id']
for frame in (df_train, df_test):
    frame.drop(columns='id', inplace=True)

In [230]:
# Downcast the feature columns to smaller dtypes to shrink the frames in memory.
feature_cols = df_test.columns.tolist()

# Float columns are treated as continuous, everything else as categorical.
# Testing "is a float dtype" (rather than exactly float64) keeps this cell
# idempotent: on a re-run the already-downcast float32 columns are still
# classified as continuous instead of being cast to uint8.
continuous_features = [
    col for col in feature_cols if pd.api.types.is_float_dtype(df_train[col])
]
categorical_features = [
    col for col in feature_cols if col not in continuous_features
]

# NOTE(review): uint8 assumes the non-float features only hold values in
# 0..255 - confirm against the data before reusing this on another dataset.
df_train[continuous_features] = df_train[continuous_features].astype('float32')
df_train[categorical_features] = df_train[categorical_features].astype('uint8')

df_test[continuous_features] = df_test[continuous_features].astype('float32')
df_test[categorical_features] = df_test[categorical_features].astype('uint8')

# Re-measure the training frame. The original read an undefined `df_in`,
# which only worked because of leftover kernel state.
memory_usage = df_train.memory_usage(deep=True) / 1024 ** 2
end_mem = memory_usage.sum()

In [231]:
# Report the training-frame footprint (MiB) before and after downcasting.
print(f'Start mem: {start_mem}')
print(f'End mem: {end_mem}')

Start mem: 2189.6363525390625
End mem: 970.840576171875


#### Initial scores

In [232]:
# Feature matrix uses exactly the columns available at test time.
cols = list(df_test.columns)
X = df_train[cols]
y = df_train['target']

# 80/20 hold-out split. The seed is fixed so the split, and therefore every
# score reported below, is reproducible across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [233]:
%%time

# Baseline: 5-fold cross-validated ROC AUC on the untouched training features.
# Model parameters come from earlier hyperparameter tuning.
model = LGBMClassifier(n_estimators=400, random_state=5, verbose=-1)
fold_aucs = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=5)
score = fold_aucs.mean()
print(score)
# 0.8550085230381302

0.8550085230381302
Wall time: 14min 34s


#### Get important features

In [234]:
# Rank features with the project helper.
# NOTE(review): presumably returns one row per feature with 'Specs' (name)
# and 'Score' columns - confirm against utility.get_feature_importances.
featureScores = ut.get_feature_importances(df_train, 'target', 'classification', 5)

# 'Abs_score' is the negated score, so an ascending sort puts the
# highest-scoring features first.
featureScores['Abs_score'] = -featureScores['Score']
ranked_scores = featureScores.sort_values(by='Abs_score')
important_features = ranked_scores['Specs'].head(15).tolist()

print(important_features)

['f22', 'f179', 'f69', 'f156', 'f58', 'f214', 'f78', 'f136', 'f8', 'f43', 'f247', 'f200', 'f77', 'f3', 'f134']


#### KMeans clustering

In [235]:
# Number of KMeans clusters; also fixes how many distance columns are generated.
n_clusters = 10

In [236]:
%%time

# Cluster the training rows using only the top-ranked features.
kmeans = KMeans(random_state=0, n_clusters=n_clusters)
cluster_inputs = X_train[important_features]
kmeans.fit(cluster_inputs)

Wall time: 29.4 s


KMeans(n_clusters=10, random_state=0)

In [237]:
# kmeans.cluster_centers_

In [238]:
# kmeans.labels_

#### Get score by adding label

In [239]:
%%time

# Append the hard cluster assignment as an extra feature and re-score.
cluster_labels = kmeans.predict(X_train[important_features])
X_train['cluster'] = cluster_labels
fold_aucs = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=5)
score = fold_aucs.mean()
print(score)
# 0.8550200668172574

0.8550200668172574
Wall time: 18min 33s


#### Get score by adding cluster distances

In [240]:
%%time

# Reset to the original feature set, discarding columns added by earlier cells.
# (The former `X_test = X_test[cols]` line was removed: `X_test` is never
# defined in this notebook, so it raised NameError on a fresh kernel.)
X_train = X_train[cols]
cluster_cols = [f"cluster{i+1}" for i in range(n_clusters)]

# One column per cluster holding each row's distance to that cluster centre.
train_cluster_distances = kmeans.transform(X_train[important_features])
X_train_cluster_distances = pd.DataFrame(
    train_cluster_distances, columns=cluster_cols, index=X_train.index
)
X_train = X_train.join(X_train_cluster_distances)

Wall time: 1.82 s


In [241]:
%%time

# 5-fold ROC AUC with the cluster-distance columns included.
fold_aucs = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=5)
score = fold_aucs.mean()
print(score)
# 0.8549983723571334

0.8549983723571334
Wall time: 19min 17s


#### Get score by adding distance ratios

In [242]:
%%time

# Reset to the original feature set, discarding columns added by earlier cells.
# (The former `X_test = X_test[cols]` line was removed: `X_test` is never
# defined in this notebook, so it raised NameError on a fresh kernel.)
X_train = X_train[cols]

cluster_cols = [f"cluster{i+1}" for i in range(n_clusters)]

train_cluster_distances = kmeans.transform(X_train[important_features])
X_train_cluster_distances = pd.DataFrame(
    train_cluster_distances, columns=cluster_cols, index=X_train.index
)

# Ratio of distances for every ordered pair of distinct clusters, named
# "<numerator>_<denominator>". Building the frame in one go avoids inserting
# 90 columns one at a time into an existing DataFrame.
ratio_features = pd.DataFrame({
    f"{num}_{den}": X_train_cluster_distances[num] / X_train_cluster_distances[den]
    for num in cluster_cols
    for den in cluster_cols
    if num != den
})
new_cols = list(ratio_features.columns)

X_train = X_train.join(ratio_features)

Wall time: 5.47 s


In [243]:
%%time

# 5-fold ROC AUC with the distance-ratio columns included.
fold_aucs = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=5)
score = fold_aucs.mean()
print(score)
# 0.8547104364915882

0.8547104364915882
Wall time: 22min 7s


#### Remove some less important features

In [249]:
%%time

# Reset to the original feature set, discarding columns added by earlier cells.
# (The former `X_test = X_test[cols]` line was removed: `X_test` is never
# defined in this notebook, so it raised NameError on a fresh kernel.)
X_train = X_train[cols]

# The 60 lowest-ranked features: the tail of the ascending 'Abs_score' ordering.
less_important_features = list(featureScores.sort_values(by='Abs_score').tail(60)['Specs'])

# Build the experiment frame without touching X_train: add the cluster label,
# then drop the weak features. assign/drop return new frames, so no deepcopy
# or in-place mutation is needed.
X_temp = (
    X_train
    .assign(cluster=kmeans.predict(X_train[important_features]))
    .drop(columns=less_important_features)
)

score = cross_val_score(model, X_temp, y_train, cv=5, scoring='roc_auc').mean()
print(score)

0.8549385595453453
Wall time: 13min 7s


#### Get score by adding cluster label and distance ratios

In [None]:
%%time

# Reset to the original feature set, discarding columns added by earlier cells.
# (The former `X_test = X_test[cols]` line was removed: `X_test` is never
# defined in this notebook, so it raised NameError on a fresh kernel.)
X_train = X_train[cols]

cluster_inputs = X_train[important_features]
cluster_cols = [f"cluster{i+1}" for i in range(n_clusters)]

train_cluster_distances = kmeans.transform(cluster_inputs)
X_train_cluster_distances = pd.DataFrame(
    train_cluster_distances, columns=cluster_cols, index=X_train.index
)

# Ratio of distances for every ordered pair of distinct clusters, named
# "<numerator>_<denominator>".
ratio_features = pd.DataFrame({
    f"{num}_{den}": X_train_cluster_distances[num] / X_train_cluster_distances[den]
    for num in cluster_cols
    for den in cluster_cols
    if num != den
})
new_cols = list(ratio_features.columns)

# Column order matches the original cell: base features, 'cluster', ratios.
# assign() returns a new frame instead of writing into a slice in place.
X_train = X_train.assign(cluster=kmeans.predict(cluster_inputs)).join(ratio_features)

In [None]:
%%time

# 5-fold ROC AUC with both the cluster label and the distance ratios.
fold_aucs = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=5)
score = fold_aucs.mean()
print(score)

#### Final X_val modification

In [250]:
%%time

def _add_cluster_features(frame, fitted_kmeans, feature_names, distance_cols):
    """Return a copy of ``frame`` with the cluster label and distance ratios added.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows to augment; must contain every column in ``feature_names``.
    fitted_kmeans : object
        Already-fitted clusterer exposing ``predict`` and ``transform``.
    feature_names : list of str
        Columns passed to the clusterer.
    distance_cols : list of str
        One name per cluster, used to label the distance columns.

    Returns
    -------
    pandas.DataFrame
        The original columns, then ``'cluster'``, then one
        ``"<numerator>_<denominator>"`` ratio column for every ordered pair of
        distinct clusters.
    """
    cluster_inputs = frame[feature_names]
    distances = pd.DataFrame(
        fitted_kmeans.transform(cluster_inputs),
        columns=distance_cols,
        index=frame.index,
    )
    ratios = pd.DataFrame({
        f"{num}_{den}": distances[num] / distances[den]
        for num in distance_cols
        for den in distance_cols
        if num != den
    })
    return frame.assign(cluster=fitted_kmeans.predict(cluster_inputs)).join(ratios)


# Reset both splits to the original feature set before adding the final features.
X_train = X_train[cols]
X_val = X_val[cols]

# Fit the clusterer on the training split only; validation rows are only
# transformed, never used for fitting.
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(X_train[important_features])

cluster_cols = [f"cluster{i+1}" for i in range(n_clusters)]

# The same helper is applied to both splits so their columns are identical.
X_train = _add_cluster_features(X_train, kmeans, important_features, cluster_cols)
X_val = _add_cluster_features(X_val, kmeans, important_features, cluster_cols)

Wall time: 17.8 s


In [None]:
%%time

# Fit on the augmented training split and score on the hold-out split.
model = LGBMClassifier(verbose=-1, random_state=5, n_estimators=400)
model.fit(X_train, y_train)

# ROC AUC is a ranking metric, so it needs a continuous score. Use the
# positive-class probability; the hard 0/1 labels from predict() collapse the
# ranking to two values and understate the AUC.
# NOTE(review): column 1 assumes a binary target whose positive class sorts
# last in model.classes_ - confirm.
preds = model.predict_proba(X_val)[:, 1]
score = roc_auc_score(y_val, preds)
print(score)
# The earlier 0.7673037557447102 was computed from hard predict() labels.

#### Final test set modification

In [254]:
%%time

# Restrict the test frame to the base features, then add the same
# cluster-derived columns that were built for the train/validation splits.
df_test = df_test[cols]
df_test['cluster'] = kmeans.predict(df_test[important_features])

# Distance from every test row to each cluster centre.
cluster_cols = [f"cluster{i+1}" for i in range(n_clusters)]
cluster_distances_test = kmeans.transform(df_test[important_features])
df_test_cluster_distances = pd.DataFrame(
    cluster_distances_test, index=df_test.index, columns=cluster_cols
)

# Distance ratio for every ordered pair of distinct clusters.
ordered_pairs = [
    (num, den) for num in cluster_cols for den in cluster_cols if num != den
]
new_cols = []
for num, den in ordered_pairs:
    ratio_name = num + '_' + den
    df_test_cluster_distances[ratio_name] = (
        df_test_cluster_distances[num] / df_test_cluster_distances[den]
    )
    new_cols.append(ratio_name)

df_test = df_test.join(df_test_cluster_distances[new_cols])

Wall time: 2.42 s


#### Final test prediction

In [256]:
# The notebook optimises ROC AUC throughout, so submit the positive-class
# probability rather than hard 0/1 labels, which throw away the ranking.
# NOTE(review): column 1 assumes a binary target whose positive class sorts
# last in model.classes_ - confirm.
preds = model.predict_proba(df_test)[:, 1]

# UTC timestamp in the file name so repeated runs never overwrite each other.
timestamp = time.time()
timestamp_str = time.strftime("%Y%m%d_%H%M%S", time.gmtime(timestamp))
filename = 'output_' + timestamp_str + '.csv'
print(filename)

output = pd.DataFrame({'id': ids, 'target': preds})
output.to_csv(filename, index=False)