In [8]:
import pandas as pd
import numpy as np
import gc
import random
random.seed(2018)

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize

import lightgbm as lgb
import xgboost as xgb

from IPython.display import display # Allows the use of display() for DataFrames

import warnings
warnings.filterwarnings('ignore')

In [43]:
def removeConstCols(X_train, X_test):
    colsToRemove = []
    for col in X_train.columns:
        if X_train[col].std() == 0: 
            colsToRemove.append(col)

    # remove constant columns in the training set
    X_train.drop(colsToRemove, axis=1, inplace=True)

    # remove constant columns in the test set
    X_test.drop(colsToRemove, axis=1, inplace=True) 

    print("Removed `{}` Constant Columns\n".format(len(colsToRemove)))
    print(colsToRemove)
    
    return X_train, X_test


def removeDupCols(X_train, X_test):
    colsToRemove = []
    colsScaned = []
    dupList = {}

    columns = X_train.columns

    for i in range(len(columns)-1):
        v = X_train[columns[i]].values
        dupCols = []
        for j in range(i+1,len(columns)):
            if np.array_equal(v, X_train[columns[j]].values):
                colsToRemove.append(columns[j])
                if columns[j] not in colsScaned:
                    dupCols.append(columns[j]) 
                    colsScaned.append(columns[j])
                    dupList[columns[i]] = dupCols

    # remove duplicate columns in the training set
    X_train.drop(colsToRemove, axis=1, inplace=True) 

    # remove duplicate columns in the testing set
    X_test.drop(colsToRemove, axis=1, inplace=True)

    print("Removed `{}` Duplicate Columns\n".format(len(dupList)))
    print(dupList)
    
def dropSparseCols(train, test):
    flist = [x for x in train.columns if not x in ['ID','target']]
    for f in flist:
        if len(np.unique(train[f]))<2:
            train.drop(f, axis=1, inplace=True)
            test.drop(f, axis=1, inplace=True)
    return train, test



In [37]:
train_df = pd.read_csv('./Training Data/train.csv')
test_df = pd.read_csv('./Training Data/test.csv')

In [22]:
train_df.head()

Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


In [23]:
train_df.describe()

Unnamed: 0,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
count,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,...,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0
mean,5944923.0,14654.93,1390.895,26722.45,4530.164,26409.96,30708.11,16865.22,4669.208,2569407.0,...,467605.7,444623.9,805621.9,781296.6,143.529939,121380.9,35734.51,312374.1,92199.6,227910.0
std,8234312.0,389329.8,64283.02,569965.2,235912.4,1514730.0,577059.0,751275.6,187944.9,9610183.0,...,4068038.0,4428889.0,4513246.0,6839451.0,9584.318507,4720709.0,1614622.0,4318501.0,1635993.0,1811139.0
min,30000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,600000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2260000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,8000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,600000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,40000000.0,20000000.0,4000000.0,20000000.0,14800000.0,100000000.0,20708000.0,40000000.0,10400000.0,319612000.0,...,76000000.0,123588000.0,130000000.0,144400000.0,640000.0,301312000.0,106420000.0,140000000.0,61768000.0,43200000.0


In [38]:
X_train = train_df.drop(["ID", "target"], axis=1)
y_train = np.log1p(train_df["target"].values)

X_test = test_df.drop(["ID"], axis=1)

print("Train set size: {}".format(X_train.shape))
print("Test set size: {}".format(X_test.shape))

Train set size: (4459, 4991)
Test set size: (49342, 4991)


In [39]:
X_train, X_test = removeConstCols(X_train, X_test)
print("Train set size: {}".format(X_train.shape))
print("Test set size: {}".format(X_test.shape))

Removed `256` Constant Columns

['d5308d8bc', 'c330f1a67', 'eeac16933', '7df8788e8', '5b91580ee', '6f29fbbc7', '46dafc868', 'ae41a98b6', 'f416800e9', '6d07828ca', '7ac332a1d', '70ee7950a', '833b35a7c', '2f9969eab', '8b1372217', '68322788b', '2288ac1a6', 'dc7f76962', '467044c26', '39ebfbfd9', '9a5ff8c23', 'f6fac27c8', '664e2800e', 'ae28689a2', 'd87dcac58', '4065efbb6', 'f944d9d43', 'c2c4491d5', 'a4346e2e2', '1af366d4f', 'cfff5b7c8', 'da215e99e', '5acd26139', '9be9c6cef', '1210d0271', '21b0a54cb', 'da35e792b', '754c502dd', '0b346adbd', '0f196b049', 'b603ed95d', '2a50e001c', '1e81432e7', '10350ea43', '3c7c7e24c', '7585fce2a', '64d036163', 'f25d9935c', 'd98484125', '95c85e227', '9a5273600', '746cdb817', '6377a6293', '7d944fb0c', '87eb21c50', '5ea313a8c', '0987a65a1', '2fb7c2443', 'f5dde409b', '1ae50d4c3', '2b21cd7d8', '0db8a9272', '804d8b55b', '76f135fa6', '7d7182143', 'f88e61ae6', '378ed28e0', 'ca4ba131e', '1352ddae5', '2b601ad67', '6e42ff7c7', '22196a84c', '0e410eb3d', '992e6d1d3', '90a7

In [41]:
removeDupCols(X_train, X_test)
'''
cols = X_train.columns
cols2 = X_train.T.drop_duplicates().T.columns

print list(set(cols) - set(cols2))
'''

#df2.T.drop_duplicates().T
print("Train set size: {}".format(X_train.shape))
print("Test set size: {}".format(X_test.shape))

Removed `4` Duplicate Columns

{'8d57e2749': ['acc5b709d', 'f333a5f60'], '34ceb0081': ['d60ddde1b'], '168b3e5bc': ['f8d75792f'], 'a765da8bc': ['912836770']}
Train set size: (4459, 4730)
Test set size: (49342, 4730)


In [45]:
X_train, X_test = dropSparseCols(X_train, X_test)

print("Train set size: {}".format(X_train.shape))
print("Test set size: {}".format(X_test.shape))

Train set size: (4459, 4730)
Test set size: (49342, 4730)


In [46]:
def add_SumZeros(train, test, features):
    flist = [x for x in train.columns if not x in ['ID','target']]
    if 'SumZeros' in features:
        train.insert(1, 'SumZeros', (train[flist] == 0).astype(int).sum(axis=1))
        test.insert(1, 'SumZeros', (test[flist] == 0).astype(int).sum(axis=1))
    flist = [x for x in train.columns if not x in ['ID','target']]

    return train, test

X_train, X_test = add_SumZeros(X_train, X_test, ['SumZeros'])

print("Train set size: {}".format(X_train.shape))
print("Test set size: {}".format(X_test.shape))

In [47]:
def add_SumValues(train, test, features):
    flist = [x for x in train.columns if not x in ['ID','target']]
    if 'SumValues' in features:
        train.insert(1, 'SumValues', (train[flist] != 0).astype(int).sum(axis=1))
        test.insert(1, 'SumValues', (test[flist] != 0).astype(int).sum(axis=1))
    flist = [x for x in train.columns if not x in ['ID','target']]

    return train, test

X_train, X_test = add_SumValues(X_train, X_test, ['SumValues'])

print("Train set size: {}".format(X_train.shape))
print("Test set size: {}".format(X_test.shape))

Train set size: (4459, 4732)
Test set size: (49342, 4732)


In [48]:
def add_OtherAgg(train, test, features):
    flist = [x for x in train.columns if not x in ['ID','target','SumZeros','SumValues']]
    if 'OtherAgg' in features:
        train['Mean']   = train[flist].mean(axis=1)
        train['Median'] = train[flist].median(axis=1)
        train['Mode']   = train[flist].mode(axis=1)
        train['Max']    = train[flist].max(axis=1)
        train['Var']    = train[flist].var(axis=1)
        train['Std']    = train[flist].std(axis=1)
        
        test['Mean']   = test[flist].mean(axis=1)
        test['Median'] = test[flist].median(axis=1)
        test['Mode']   = test[flist].mode(axis=1)
        test['Max']    = test[flist].max(axis=1)
        test['Var']    = test[flist].var(axis=1)
        test['Std']    = test[flist].std(axis=1)
    flist = [x for x in train.columns if not x in ['ID','target','SumZeros','SumValues']]

    return train, test

X_train, X_test = add_OtherAgg(X_train, X_test, ['OtherAgg'])

print("Train set size: {}".format(X_train.shape))
print("Test set size: {}".format(X_test.shape))

Train set size: (4459, 4738)
Test set size: (49342, 4738)


In [49]:
flist = [x for x in X_train.columns if not x in ['ID','target']]

flist_kmeans = []
for ncl in range(2,11):
    cls = KMeans(n_clusters=ncl)
    cls.fit_predict(X_train[flist].values)
    X_train['kmeans_cluster_'+str(ncl)] = cls.predict(X_train[flist].values)
    X_test['kmeans_cluster_'+str(ncl)] = cls.predict(X_test[flist].values)
    flist_kmeans.append('kmeans_cluster_'+str(ncl))
print(flist_kmeans)

print("Train set size: {}".format(X_train.shape))
print("Test set size: {}".format(X_test.shape))

['kmeans_cluster_2', 'kmeans_cluster_3', 'kmeans_cluster_4', 'kmeans_cluster_5', 'kmeans_cluster_6', 'kmeans_cluster_7', 'kmeans_cluster_8', 'kmeans_cluster_9', 'kmeans_cluster_10']


In [50]:
flist = [x for x in X_train.columns if not x in ['ID','target']]

n_components = 20
flist_pca = []
pca = PCA(n_components=n_components)
x_train_projected = pca.fit_transform(normalize(X_train[flist], axis=0))
x_test_projected = pca.transform(normalize(X_test[flist], axis=0))
for npca in range(0, n_components):
    X_train.insert(1, 'PCA_'+str(npca+1), x_train_projected[:, npca])
    X_test.insert(1, 'PCA_'+str(npca+1), x_test_projected[:, npca])
    flist_pca.append('PCA_'+str(npca+1))
print(flist_pca)

print("Train set size: {}".format(X_train.shape))
print("Test set size: {}".format(X_test.shape))

['PCA_1', 'PCA_2', 'PCA_3', 'PCA_4', 'PCA_5', 'PCA_6', 'PCA_7', 'PCA_8', 'PCA_9', 'PCA_10', 'PCA_11', 'PCA_12', 'PCA_13', 'PCA_14', 'PCA_15', 'PCA_16', 'PCA_17', 'PCA_18', 'PCA_19', 'PCA_20']
Train set size: (4459, 4767)
Test set size: (49342, 4767)
