In [2]:
# Mount Google Drive at /content/drive (Colab only); the data directory used
# by the notebook lives underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#Importing libraries
import os
import warnings

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

In [4]:
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

In [5]:
# Directory prefix for train.csv / test.csv.
# Uses the mounted Drive folder when it exists and otherwise falls back to the
# current working directory (empty prefix), so the notebook runs both on Colab
# and locally without editing this cell.
DRIVE_DATA_DIR = "/content/drive/MyDrive/Pred Project/"
path = DRIVE_DATA_DIR if os.path.isdir(DRIVE_DATA_DIR) else ""

In [6]:
# Read the train and test CSV files found under `path`
train_csv = path + "train.csv"
test_csv = path + "test.csv"
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [7]:
# Remove the ID_code identifier column from the training frame.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run once the column is already gone.
df_train = df_train.drop(columns=["ID_code"], errors="ignore")

In [8]:
# Split the test rows into "real" and "synthetic" ones.
# A row counts as real when at least one of its values occurs exactly once in
# its column across the whole test set; rows without any such value are
# treated as synthetic.
test = df_test.drop(columns=["ID_code"])
unique_count = np.zeros_like(test)
for col_pos in range(test.shape[1]):
    column_values = test.iloc[:, col_pos]
    _, first_rows, occurrences = np.unique(
        column_values, return_index=True, return_counts=True)
    # first_rows[i] is the row where the i-th distinct value first appears;
    # for a value seen once that is its only row
    singleton_rows = first_rows[occurrences == 1]
    unique_count[singleton_rows, col_pos] += 1
unique_per_row = unique_count.sum(axis=1)
real_samples = np.flatnonzero(unique_per_row > 0)
synth_samples = np.flatnonzero(unique_per_row == 0)
print(f"Number of real samples in test set is {len(real_samples)}")
print(f"Number of synthetic samples in test set is {len(synth_samples)}")

Number of real samples in test set is 100000
Number of synthetic samples in test set is 100000


In [9]:
# Build the value-count ("magic") features for each feature, using only the
# train data and the real samples from the test set

In [10]:
# Build the count-based features for every raw `var*` column, computed over
# the training rows plus the real test rows only.
# For each feature five columns are added:
#   <f>vc    - number of rows sharing the row's value, capped at 10 (uint8)
#   <f>sum   - the value where it occurs more than once, else value * 0
#   <f>sum2  - the value where it occurs more than twice, else value * 0
#   <f>sum3  - the value where it occurs more than four times, else value * 0
#   <f>plus_ - the value plus the variance of its own value-group
features = [col for col in df_train.columns if col.startswith('var')]
# Remember the train row count before df_train is rebound below
n_train = df_train.shape[0]
# The index is deliberately not reset: the test rows keep their original labels
df_all = pd.concat([df_train, df_test.iloc[real_samples]])
for feature in features:
    values = df_all[feature]
    occurrences = values.map(values.value_counts(dropna=True))
    # Vectorised equivalent of min(10, x): counts below 10 are kept, everything
    # else (including a missing count) becomes 10
    capped = occurrences.where(occurrences < 10, 10).astype(np.uint8)
    df_all[feature + 'vc'] = capped
    for suffix, threshold in (('sum', 1), ('sum2', 2), ('sum3', 4)):
        # 0/1 mask multiplied into the value, replacing a per-row Python lambda
        keep = (capped > threshold).astype(np.uint8)
        df_all[feature + suffix] = (values * keep).astype(np.float32)
    # Grouping a column by itself puts identical values in each group, so with
    # pandas' default sample variance this is 0 for repeated values and NaN for
    # values that occur once
    own_value_var = df_all.groupby([feature])[feature].transform("var")
    df_all[feature + "plus_"] = values + own_value_var
df_train = df_all.iloc[:n_train]
df_test_real = df_all.iloc[n_train:]

In [11]:
# Report the frame shapes. `.shape` is a tuple, so it has to be converted with
# str() before it can be concatenated to the message.
print("Training set shape after creating magic features: " + str(df_train.shape))
print("Test set shape after creating magic features: " + str(df_test_real.shape))

Training set shape after creating magic features: (200000, 1202)
Test set shape after creating magic features: (100000, 1202)


In [12]:
# Separate the label column from the training frame.
# The guard makes the cell a no-op when it is re-run after the column has
# already been removed, and the rebinding drop avoids mutating df_train in
# place (it is an iloc slice of df_all).
if "target" in df_train.columns:
    train_y = df_train["target"]
    df_train = df_train.drop(columns=["target"])

In [13]:
# Shuffled, stratified 5-fold splitter.
# NOTE(review): the training cell rebinds `folds` (random_state=42) before any
# split is drawn, so this instance appears to be unused.
folds = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=4242,
)

In [14]:
# LightGBM parameters taken from the hyperparameter-tuning files

In [15]:
# Parameter set handed to lgb.train for every per-feature model.
param = dict(
    # Tree shape and learning rate
    learning_rate=0.07293475148427585,
    max_bin=100,
    max_depth=48,
    num_leaves=2,
    min_data_in_leaf=144,
    # Regularisation and sampling
    reg_alpha=0.0001,
    # NOTE(review): no bagging frequency is set in this dict - confirm that
    # 'subsample' has any effect on its own.
    subsample=0.38127455432520496,
    reg_lambda=0.0001,
    min_split_gain=0.001,
    min_sum_hessian_in_leaf=1.0,
    boost_from_average=True,
    # NOTE(review): confirm 'is_unbalance' is honoured by the 'multiclass'
    # objective selected below.
    is_unbalance='true',
    boosting_type='gbdt',
    # NOTE(review): the training cell also passes num_boost_round=1000 to
    # lgb.train - confirm which of the two values LightGBM applies.
    n_estimators=5000,
    # Two-class problem expressed as 'multiclass'; the training cell reads
    # column 1 of the resulting prediction matrix.
    num_class=2,
    objective='multiclass',
    metric='multi_logloss',
    # Runtime
    device='cpu',
    n_jobs=-1,
    verbose=-1,
)

In [16]:
# Rows to predict on: the rows of df_test_real whose index labels are listed
# in real_samples, in that order
real_test = df_test_real.loc[real_samples]

In [21]:
# Train one LightGBM model per (fold, feature) pair and average the class-1
# probabilities of all of them into a single prediction per row of real_test.
k = 5
folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
n_models = len(features) * k
predictions = np.zeros(len(real_test))
for fold_, (trn_, val_) in enumerate(folds.split(train_y, train_y)):
    y_trn = train_y.iloc[trn_]
    y_val = train_y.iloc[val_]
    for feature in features:
        # Each model sees only one raw feature and its five derived columns
        cols = [feature] + [feature + suffix
                            for suffix in ("plus_", "vc", "sum", "sum2", "sum3")]
        feature_frame = df_train[cols]
        X_tr = lgb.Dataset(feature_frame.iloc[trn_], y_trn)
        X_va = lgb.Dataset(feature_frame.iloc[val_], y_val)
        model = lgb.train(param, X_tr, valid_sets=X_va, num_boost_round=1000,
                          verbose_eval=None, early_stopping_rounds=10)
        # Column 1 of the prediction matrix is the probability of class 1;
        # dividing by the model count turns the running sum into a mean
        predictions += model.predict(real_test[cols])[:, 1] / n_models

In [18]:
# Creating a submission file
# Every test row starts with a target of 0; the real rows (positions listed in
# real_samples) then receive their averaged prediction, so synthetic rows stay
# at 0.
# real_samples is left untouched as an integer array so that earlier cells can
# still be re-run, and the IDs are taken from df_test itself rather than being
# rebuilt as "test_<n>" strings.
target = np.zeros(len(df_test))
target[real_samples] = predictions
finalsub = pd.DataFrame({"ID_code": df_test["ID_code"].to_numpy(),
                         "target": target})
finalsub.to_csv("submission.csv", index=False)