## Set up packages and Kaggle connection

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingClassifier
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import cv2
import os
from tqdm import tqdm
import xgboost as xgb
from lightgbm import LGBMClassifier


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"jacobbraun","key":"392939438edcd0495f527be30174d4ca"}'}

In [4]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

!kaggle competitions download -c santander-customer-transaction-prediction

Downloading santander-customer-transaction-prediction.zip to /content
 97% 243M/250M [00:05<00:00, 33.4MB/s]
100% 250M/250M [00:05<00:00, 47.4MB/s]


In [5]:
!unzip santander-customer-transaction-prediction

Archive:  santander-customer-transaction-prediction.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


## Load data and identify fake rows

In [6]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [7]:
# Generate new features to check if each value is unique within each original
# column

col_names = [f'var_{i}' for i in range(200)]
for column in tqdm(col_names):
  c = test[column].value_counts()
  u = c.index[c == 1]
  test[column + '_u'] = test[column].isin(u)


  
100%|██████████| 200/200 [00:03<00:00, 59.84it/s]


In [8]:
# Add a column checking if at least one feature is unique for each row
test['unique'] = test[[column + '_u' for column in col_names]].any(axis=1)

In [9]:
# Separate out real test data and fake test data
test_real = test.loc[test.unique, ['ID_code'] + col_names]
test_fake = test.loc[~test.unique, ['ID_code'] + col_names]

In [10]:
len(test_real), len(test_fake)

(100000, 100000)

In [11]:
# Combine all the 'real' data from the training and testing set so we can see if
# there are any fakes once they're combined

realTrTe = pd.concat([train, test_real], axis = 0)

In [12]:
# Generate another set of binary features to check if each value is unique 
# within each original feature column
for column in tqdm(col_names):
  c = realTrTe[column].value_counts()
  u = c.index[c == 1]
  realTrTe[column + '_unique'] = realTrTe[column].isin(u)*1
  test_fake[column + '_unique'] = 0

  
  import sys
100%|██████████| 200/200 [00:06<00:00, 30.64it/s]


In [13]:
# From the combined set, isolate the real test values
test_real = realTrTe[realTrTe['ID_code'].str.contains('test')].copy()
test_real.drop(['target'], axis=1, inplace=True)

# Create a 'train' df with the new unique identifying features
# Create a 'test' df, combining the real and fake testing features
train = realTrTe[realTrTe['ID_code'].str.contains('train')].copy()
test = pd.concat([test_real, test_fake], axis=0)

### Create training, validation, and testing splits:

In [14]:
# Split the training data into X and Y dataframes
X = train.iloc[:, 2:]
Y = train['target']


# Create the testing dataset for prediction
X_test = test.drop('ID_code', axis=1)

# scaler = StandardScaler()

# X = scaler.fit_transform(X)
# X_test = scaler.fit_transform(X)


In [26]:
# initializing all the base model objects with default parameters
model_1 = LogisticRegression(class_weight='balanced',
                             solver='newton-cg',
                             verbose=1)
model_2 = xgb.XGBClassifier(max_depth=25,
                            verbose=1)
model_3 = RandomForestClassifier(n_estimators=250,
                                 min_samples_split=20)
model_4 = LGBMClassifier(learning_rate=0.04)

# putting all base model objects in one list
all_models = [('lr', model_1), ('xgb', model_2), ('rf', model_3), ('lgbm', model_4)]
 
# create meta model
final_lr = LogisticRegression(class_weight='balanced',
                              solver='newton-cg')

# stacked model
stack = StackingClassifier(estimators=all_models,
                           final_estimator=final_lr,
                           cv=None,
                           stack_method='predict_proba',
                           n_jobs=-1,
                           passthrough=True, # Train final model on predictions and base data
                           verbose=1)
 

In [None]:
stack.fit(X, Y)

In [None]:
y_pred = stack.predict_proba(X_test)[:,1]

In [None]:
submission = pd.DataFrame({"ID_code": test.iloc[:,0]})
submission["target"] = y_pred
submission.to_csv("submission415v2.csv", index=False)
!cp '/content/submission415v2.csv' '/content/drive/MyDrive/Python Data/'