## Set up packages and Kaggle connection

In [None]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import cv2
import os
from tqdm import tqdm
import xgboost as xgb


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from google.colab import files

# Upload kaggle.json through the Colab file picker. The call returns a dict
# mapping each uploaded file name to its raw bytes; bind it to a name instead
# of leaving it as the cell's last expression, otherwise the API key inside the
# file is echoed into the notebook's saved output.
uploaded = files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<redacted>","key":"<redacted>"}'}

In [None]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

!kaggle competitions download -c santander-customer-transaction-prediction

Downloading santander-customer-transaction-prediction.zip to /content
 96% 241M/250M [00:01<00:00, 188MB/s]
100% 250M/250M [00:01<00:00, 197MB/s]


In [None]:
!unzip santander-customer-transaction-prediction

Archive:  santander-customer-transaction-prediction.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


## Load data and identify fake rows

In [None]:
# Read the extracted training and testing tables into DataFrames.
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

In [None]:
# For every original feature, flag the test rows whose value occurs exactly
# once within that column. Each flag is a boolean column named '<feature>_u'.

col_names = [f'var_{i}' for i in range(200)]

# Collect all flags first and attach them with a single concat: inserting 200
# columns one at a time fragments the frame, which pandas reports as a
# PerformanceWarning.
unique_flags = {}
for column in tqdm(col_names):
  counts = test[column].value_counts()
  singletons = counts.index[counts == 1]
  unique_flags[column + '_u'] = test[column].isin(singletons)

# Drop flags left over from a previous run so re-executing the cell does not
# produce duplicate columns.
test = pd.concat(
    [test.drop(columns=list(unique_flags), errors='ignore'),
     pd.DataFrame(unique_flags)],
    axis=1)


  
100%|██████████| 200/200 [00:03<00:00, 60.02it/s]


In [None]:
# Mark a row as 'unique' when at least one of its per-feature '_u' flags is set.
flag_columns = [name + '_u' for name in col_names]
test['unique'] = test[flag_columns].any(axis=1)

In [None]:
# Separate out real test data and fake test data: rows with at least one
# unique feature value are kept as real, the remaining rows as fake.
# Use bracket access for the 'unique' column, since the name matches a Series
# method and attribute access is easy to misread or break.
is_real = test['unique']
kept_columns = ['ID_code'] + col_names

# .copy() makes both results independent frames, so adding columns to them
# later does not raise SettingWithCopyWarning.
test_real = test.loc[is_real, kept_columns].copy()
test_fake = test.loc[~is_real, kept_columns].copy()

In [None]:
# Row counts of the real and fake test partitions.
test_real.shape[0], test_fake.shape[0]

(100000, 100000)

In [None]:
# Stack the training rows on top of the real test rows so value uniqueness can
# be recomputed over the combined data. test_real carries no 'target' column,
# so its rows get NaN there after the concat.

realTrTe = pd.concat(objs=[train, test_real], axis='index')

In [None]:
# Generate another set of binary features: for each original feature column,
# '<feature>_unique' is 1 where the value occurs exactly once across the
# combined train + real-test rows, else 0. Fake test rows get 0 everywhere.
#
# The flags are collected and attached with one concat per frame. Assigning
# 200 columns one by one fragments the frame (PerformanceWarning), and
# assigning into test_fake while it is still a slice of `test` raises
# SettingWithCopyWarning.
real_flags = {}
for column in tqdm(col_names):
  counts = realTrTe[column].value_counts()
  singletons = counts.index[counts == 1]
  # Store plain arrays; they are re-attached positionally below because the
  # combined frame's index labels are not unique (train and test labels overlap).
  real_flags[column + '_unique'] = (realTrTe[column].isin(singletons) * 1).to_numpy()

flag_names = list(real_flags)

# Dropping any existing flag columns first keeps the cell idempotent on re-run.
realTrTe = pd.concat(
    [realTrTe.drop(columns=flag_names, errors='ignore'),
     pd.DataFrame(real_flags, index=realTrTe.index)],
    axis=1)
test_fake = pd.concat(
    [test_fake.drop(columns=flag_names, errors='ignore'),
     pd.DataFrame(0, index=test_fake.index, columns=flag_names)],
    axis=1)

  
  import sys
100%|██████████| 200/200 [00:06<00:00, 30.40it/s]


In [None]:
# Split the combined frame back apart using the ID_code prefix.
id_codes = realTrTe['ID_code']

# 'train' rows keep every column, including the new '_unique' features.
train = realTrTe[id_codes.str.contains('train')].copy()

# Real test rows: same features, minus the 'target' column.
test_real = realTrTe[id_codes.str.contains('test')].drop(columns=['target'])

# Full 'test' df: real rows followed by the fake rows.
test = pd.concat([test_real, test_fake], axis=0)

In [None]:
# Preview the first rows of the training frame, including the '_unique' columns.
train.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190_unique,var_191_unique,var_192_unique,var_193_unique,var_194_unique,var_195_unique,var_196_unique,var_197_unique,var_198_unique,var_199_unique
0,train_0,0.0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,0,0,0,0,0,0,0,0,0,0
1,train_1,0.0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,0,0,0,0,0,0,0,0,0,0
2,train_2,0.0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,0,0,0,0,0,0,0,0,0,0
3,train_3,0.0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,0,0,0,0,0,0,0,0,0,0
4,train_4,0.0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,0,0,1,1,1,0,0,0,0,0


In [None]:
# Preview the first rows of the recombined test frame (real rows come first).
test.head()

Unnamed: 0,ID_code,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190_unique,var_191_unique,var_192_unique,var_193_unique,var_194_unique,var_195_unique,var_196_unique,var_197_unique,var_198_unique,var_199_unique
3,test_3,8.5374,-1.3222,12.022,6.5749,8.8458,3.1744,4.9397,20.566,3.3755,...,0,0,0,0,0,0,0,0,0,1
7,test_7,17.3035,-2.4212,13.3989,8.3998,11.0777,9.6449,5.9596,17.8477,-4.8068,...,0,0,0,0,0,0,0,0,1,0
11,test_11,10.6137,-2.1898,8.909,3.8014,13.8602,-5.9802,5.5515,15.4716,-0.1714,...,0,0,0,0,0,0,1,0,0,1
15,test_15,14.8595,-4.5378,13.6483,5.648,9.9144,1.519,5.0358,13.4524,-2.5419,...,0,0,1,0,0,0,0,0,0,1
16,test_16,14.1732,-5.149,9.7591,3.7316,10.37,-21.9202,7.713,18.8749,0.468,...,0,0,0,0,0,0,0,0,0,0


### Create training, validation, and testing splits:

In [None]:
# Split the training data into X (features) and Y (labels). The non-feature
# columns are removed by name, so the split does not depend on 'ID_code' and
# 'target' happening to be the first two columns.
X = train.drop(columns=['ID_code', 'target'])
Y = train['target']

# Generate training and validation sets: 20% held out, stratified on the
# target, with a fixed seed so the split is reproducible.
X_train, X_val, Y_train, Y_val = train_test_split(X,
                                                  Y,
                                                  test_size=.2,
                                                  random_state=4,
                                                  stratify=Y)

# Create the testing dataset for prediction, restricted to the training
# feature columns and in the same order.
X_test = test[X.columns]
