In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from scipy import stats
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline 
from sklearn.model_selection import KFold
from sklearn.utils import resample
from sklearn.metrics import roc_auc_score
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier

In [58]:
# Read both competition files, then keep the test ids aside for the submission file.
train, test = [pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')]
test_id = test['id']


In [59]:
# Remove every column whose name contains 'calc' from both frames.
calc_variables = [name for name in train.columns if 'calc' in name]
train = train.drop(calc_variables, axis=1)
test = test.drop(calc_variables, axis=1)

In [60]:
# Column groups, derived from the naming convention ('cat' / 'bin' in the name)
# and, for the continuous group, from the float64 dtype.
cat_variables = [col for col in train.columns if 'cat' in col]
bin_variables = [col for col in train.columns if 'bin' in col]
continuous_variables = list(train.select_dtypes(include=['float64']).columns)

In [61]:
# Split the rows by class label
df_majority = train[train['target'] == 0]
df_minority = train[train['target'] == 1]

# Undersample the majority class down to the size of the minority class (1:1 ratio)
n_samples = len(df_minority)
df_majority_downsampled = resample(
    df_majority,
    replace=False,          # sample without replacement
    n_samples=n_samples,    # to match minority class
    random_state=123,       # reproducible results
)

# The balanced training set: sampled majority rows followed by all minority rows.
# Rows are still grouped by class here and keep their original index labels.
train = pd.concat([df_majority_downsampled, df_minority])


In [62]:
def report_missing(df, label):
    """Print a per-column report of entries equal to -1 and return those columns.

    The value -1 is treated as the missing-value marker. For every column that
    contains at least one -1, the count and its share of all rows are printed,
    followed by a one-line total.

    df    : DataFrame to inspect
    label : text inserted into the section header (e.g. 'train' or 'test')

    Returns the list of column names that contain at least one -1.
    """
    print('-----------{} data--------------------'.format(label))

    columns_with_missing = []
    n_rows = df.shape[0]

    for col in df.columns:
        # Boolean sum counts the -1 entries without materialising a filtered frame
        missings = (df[col] == -1).sum()
        if missings > 0:
            columns_with_missing.append(col)
            missings_perc = missings / n_rows

            print('Variable {} has {} records ({:.2%}) with missing values'.format(col, missings, missings_perc))

    print('In total, there are {} variables with missing values'.format(len(columns_with_missing)))
    return columns_with_missing


# Same report for both frames; vars_with_missing ends up holding the test result,
# exactly as the two copy-pasted loops it replaces did.
for label, frame in (('train', train), ('test', test)):
    vars_with_missing = report_missing(frame, label)



-----------train data--------------------
Variable ps_ind_02_cat has 44 records (0.10%) with missing values
Variable ps_ind_04_cat has 36 records (0.08%) with missing values
Variable ps_ind_05_cat has 712 records (1.64%) with missing values
Variable ps_reg_03 has 6995 records (16.12%) with missing values
Variable ps_car_01_cat has 37 records (0.09%) with missing values
Variable ps_car_02_cat has 1 records (0.00%) with missing values
Variable ps_car_03_cat has 28382 records (65.41%) with missing values
Variable ps_car_05_cat has 18181 records (41.90%) with missing values
Variable ps_car_07_cat has 1315 records (3.03%) with missing values
Variable ps_car_09_cat has 73 records (0.17%) with missing values
Variable ps_car_11 has 1 records (0.00%) with missing values
Variable ps_car_12 has 1 records (0.00%) with missing values
Variable ps_car_14 has 3243 records (7.47%) with missing values
In total, there are 13 variables with missing values
-----------test data--------------------
Variable 

In [63]:
# Dropping the variables with too many missing values
vars_to_drop = ['ps_car_03_cat', 'ps_car_05_cat']
train = train.drop(vars_to_drop, axis=1)
test = test.drop(vars_to_drop, axis=1)

# Keep the categorical column list in sync with the frames
for dropped in vars_to_drop:
    cat_variables.remove(dropped)

In [64]:
# Impute the remaining -1 (missing) markers:
#   ps_reg_03, ps_car_12, ps_car_14 (continuous) -> mean of the observed values
#   ps_car_11 (ordinal)                          -> mode of the observed values
#
# The fill values are learned on train only and applied to BOTH frames, so train
# and test go through the same transformation (this also covers ps_car_12 in
# test, which was previously left untouched).
#
# Columns are reassigned explicitly instead of calling
# `df[col].replace(..., inplace=True)`: the chained in-place form operates on a
# temporary Series and does not write back under pandas Copy-on-Write.
mean_imputed = ['ps_reg_03', 'ps_car_12', 'ps_car_14']
mode_imputed = ['ps_car_11']

fill_values = {}
for col in mean_imputed:
    fill_values[col] = train.loc[train[col] != -1, col].mean()
for col in mode_imputed:
    # Series.mode() is sorted, so .iloc[0] is the smallest mode on ties
    fill_values[col] = train.loc[train[col] != -1, col].mode().iloc[0]

for col, fill_value in fill_values.items():
    train[col] = train[col].replace(to_replace=-1, value=fill_value)
    test[col] = test[col].replace(to_replace=-1, value=fill_value)

In [65]:
# Cardinality of each categorical column, for train and then test.
# (The train section used to be printed under a 'test data' header.)
for label, frame in (('train', train), ('test', test)):
    print('-----------{} data--------------------'.format(label))
    for f in cat_variables:
        # nunique() ignores NaN, matching value_counts().shape[0]
        dist_values = frame[f].nunique()
        print('Variable {} has {} distinct values'.format(f, dist_values))

-----------test data--------------------
Variable ps_ind_02_cat has 5 distinct values
Variable ps_ind_04_cat has 3 distinct values
Variable ps_ind_05_cat has 8 distinct values
Variable ps_car_01_cat has 13 distinct values
Variable ps_car_02_cat has 3 distinct values
Variable ps_car_04_cat has 10 distinct values
Variable ps_car_06_cat has 18 distinct values
Variable ps_car_07_cat has 3 distinct values
Variable ps_car_08_cat has 2 distinct values
Variable ps_car_09_cat has 6 distinct values
Variable ps_car_10_cat has 3 distinct values
Variable ps_car_11_cat has 104 distinct values
-----------test data--------------------
Variable ps_ind_02_cat has 5 distinct values
Variable ps_ind_04_cat has 3 distinct values
Variable ps_ind_05_cat has 8 distinct values
Variable ps_car_01_cat has 13 distinct values
Variable ps_car_02_cat has 3 distinct values
Variable ps_car_04_cat has 10 distinct values
Variable ps_car_06_cat has 18 distinct values
Variable ps_car_07_cat has 3 distinct values
Variable p

In [9]:
# Script by https://www.kaggle.com/ogrellier
# Code: https://www.kaggle.com/ogrellier/python-target-encoding-for-categorical-features
def add_noise(series, noise_level):
    """Scale each element of `series` by (1 + noise_level * z), with z ~ N(0, 1).

    One standard-normal draw per element is taken from numpy's global random
    state; a noise_level of 0 leaves the values unchanged.
    """
    jitter = np.random.randn(len(series))
    return series * (1 + noise_level * jitter)

def target_encode(trn_series=None, 
                  tst_series=None, 
                  target=None, 
                  min_samples_leaf=1, 
                  smoothing=1,
                  noise_level=0):
    """
    Replace a categorical feature by a smoothed per-category mean of the target.

    Smoothing follows the paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf

    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series (same name as trn_series)
    target : target data as a pd.Series, aligned with trn_series
    min_samples_leaf (int) : category count at which category mean and prior weigh equally
    smoothing (int) : controls how fast the weight moves from the prior to the category mean
    noise_level : relative magnitude of the multiplicative noise applied by add_noise

    Returns a tuple (encoded_train, encoded_test) of pd.Series named
    '<feature>_mean', each carrying the index of its input series. Categories
    without a computed average are filled with the global target mean.
    """ 
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name

    feature_name = trn_series.name
    target_name = target.name
    encoded_name = feature_name + '_mean'

    # Per-category target mean and number of observations
    per_category = (
        pd.concat([trn_series, target], axis=1)
        .groupby(by=feature_name)[target_name]
        .agg(["mean", "count"])
    )

    # Sigmoid weight in (0, 1): the larger the count, the more the category mean is trusted
    weight = 1 / (1 + np.exp(-(per_category["count"] - min_samples_leaf) / smoothing))

    # Global target mean, used as prior and as fallback
    prior = target.mean()

    # Blend prior and category mean according to the weight
    per_category[target_name] = prior * (1 - weight) + per_category["mean"] * weight
    lookup = (
        per_category
        .drop(["mean", "count"], axis=1)
        .reset_index()
        .rename(columns={'index': target_name, target_name: 'average'})
    )

    def _encode(series):
        # Left-join the lookup table onto the feature values
        joined = pd.merge(
            series.to_frame(series.name),
            lookup,
            on=series.name,
            how='left')
        encoded = joined['average'].rename(encoded_name).fillna(prior)
        # pd.merge does not keep the index so restore it
        encoded.index = series.index
        return encoded

    ft_trn_series = _encode(trn_series)
    ft_tst_series = _encode(tst_series)

    # Noise is drawn for train first, then test
    noisy_trn = add_noise(ft_trn_series, noise_level)
    noisy_tst = add_noise(ft_tst_series, noise_level)
    return noisy_trn, noisy_tst

In [10]:
# Target-encode the high-cardinality 'ps_car_11_cat' column
train_encoded, test_encoded = target_encode(
    trn_series=train["ps_car_11_cat"],
    tst_series=test["ps_car_11_cat"],
    target=train.target,
    min_samples_leaf=100,
    smoothing=10,
    noise_level=0.01,
)

# Swap the raw categorical column for its encoded counterpart in both frames
train['ps_car_11_cat_te'] = train_encoded
test['ps_car_11_cat_te'] = test_encoded
train = train.drop('ps_car_11_cat', axis=1)
test = test.drop('ps_car_11_cat', axis=1)

# The column no longer exists, so it must not be dummified later
cat_variables.remove('ps_car_11_cat')

In [66]:
# One-hot encode the remaining categorical columns (first level dropped).
v = cat_variables

print('-----------train data--------------------')
print('Before dummification we have {} variables in train'.format(train.shape[1]))
train = pd.get_dummies(train, columns=v, drop_first=True)
print('After dummification we have {} variables in train'.format(train.shape[1]))

print('-----------test data--------------------')
print('Before dummification we have {} variables in test'.format(test.shape[1]))
test = pd.get_dummies(test, columns=v, drop_first=True)
# get_dummies only creates columns for levels present in the frame it is given.
# Align test to train's feature columns (everything except 'target') so both
# frames expose the same columns in the same order: levels missing from test
# become all-zero columns, levels unseen in train are dropped.
# NOTE(review): this does not correct a differing dropped baseline level if
# train and test disagree on the first level of a column.
feature_columns = [col for col in train.columns if col != 'target']
test = test.reindex(columns=feature_columns, fill_value=0)
print('After dummification we have {} variables in test'.format(test.shape[1]))

-----------train data--------------------
Before dummification we have 37 variables in train
After dummification we have 191 variables in train
-----------test data--------------------
Before dummification we have 36 variables in train
After dummification we have 190 variables in train


In [67]:
# Degree-2 polynomial and interaction terms of the continuous variables.
# The transformer is fitted once on train and reused for test, so both frames
# get the same generated columns under the same names.
v = continuous_variables
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)

print('-----------train data--------------------')
train_poly = poly.fit_transform(train[v])
# get_feature_names() was replaced by get_feature_names_out() in newer
# scikit-learn releases; support both so the cell runs on either.
if hasattr(poly, 'get_feature_names_out'):
    poly_names = list(poly.get_feature_names_out(v))
else:
    poly_names = list(poly.get_feature_names(v))
interactions = pd.DataFrame(data=train_poly, columns=poly_names)
interactions.drop(v, axis=1, inplace=True)  # Remove the original columns
# Concat the interaction variables to the train data
# (index is reset so the positional rows of `interactions` line up)
print('Before creating interactions we have {} variables in train'.format(train.shape[1]))
train = pd.concat([train.reset_index(drop=True), interactions], axis=1)
print('After creating interactions we have {} variables in train'.format(train.shape[1]))

print('-----------test data--------------------')
interactions = pd.DataFrame(data=poly.transform(test[v]), columns=poly_names)
interactions.drop(v, axis=1, inplace=True)  # Remove the original columns
# Concat the interaction variables to the test data
print('Before creating interactions we have {} variables in test'.format(test.shape[1]))
test = pd.concat([test.reset_index(drop=True), interactions], axis=1)
print('After creating interactions we have {} variables in test'.format(test.shape[1]))

-----------train data--------------------
Before creating interactions we have 191 variables in train
After creating interactions we have 219 variables in train
-----------test data--------------------
Before creating interactions we have 190 variables in train
After creating interactions we have 218 variables in train


In [13]:
# Rank all features by random-forest importance
y_train = train['target']
X_train = train.drop(['id', 'target'], axis=1)
feat_labels = X_train.columns

rf = RandomForestClassifier(n_estimators=500, random_state=0, n_jobs=-1)
rf.fit(X_train, y_train)

importances = rf.feature_importances_
# Feature positions ordered from most to least important
indices = np.argsort(importances)[::-1]

for rank, feat_idx in enumerate(indices, start=1):
    print("%2d) %-*s %f" % (rank, 30, feat_labels[feat_idx], importances[feat_idx]))

 1) ps_car_11_cat_te               0.023226
 2) ps_car_13^2                    0.017592
 3) ps_reg_03 ps_car_13            0.017555
 4) ps_car_13 ps_car_15            0.017550
 5) ps_car_13                      0.017532
 6) ps_car_12 ps_car_13            0.017513
 7) ps_car_13 ps_car_14            0.017302
 8) ps_reg_01 ps_car_13            0.016725
 9) ps_reg_03 ps_car_14            0.016053
10) ps_car_14 ps_car_15            0.015640
11) ps_reg_03 ps_car_12            0.015299
12) ps_reg_03 ps_car_15            0.015265
13) ps_reg_02 ps_car_13            0.015057
14) ps_reg_01 ps_reg_03            0.014630
15) ps_reg_01 ps_car_14            0.014524
16) ps_car_13 ps_calc_02           0.014299
17) ps_car_13 ps_calc_03           0.014296
18) ps_car_13 ps_calc_01           0.014212
19) ps_car_14 ps_calc_02           0.013749
20) ps_reg_03                      0.013686
21) ps_reg_03^2                    0.013617
22) ps_reg_03 ps_calc_02           0.013608
23) ps_car_14 ps_calc_01        

In [14]:
# Keep the features whose forest importance is at or above the median
sfm = SelectFromModel(rf, threshold='median', prefit=True)

n_before = X_train.shape[1]
print('Number of features before selection: {}'.format(n_before))

n_features = sfm.transform(X_train).shape[1]
print('Number of features after selection: {}'.format(n_features))

support_mask = sfm.get_support()
selected_vars = feat_labels[support_mask].tolist()

Number of features before selection: 162
Number of features after selection: 81


In [15]:
# Restrict both frames to the selected features.
# 'id' is kept alongside them: the scaling step further down still drops 'id'
# from both frames and would raise a KeyError if it were removed here.
train = train[['id'] + selected_vars + ['target']]
test = test[['id'] + selected_vars]

In [68]:
# Mix the two classes (train is still grouped by class after the downsampling
# concat). The seed is fixed because the K-fold splits used later are contiguous
# slices of this order, so an unseeded shuffle would make the CV scores
# irreproducible between runs.
train = shuffle(train, random_state=0).reset_index(drop=True)

In [69]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """Unnormalised Gini score of `pred` as a ranking of `actual`.

    Rows are ordered by descending `pred` (ties keep their original order),
    the cumulative share of `actual` is accumulated over that ordering and
    compared against the (n + 1) / 2 baseline, then divided by n.

    actual : array-like of true values
    pred : array-like of scores, same length as `actual`
    cmpcol, sortcol : unused; kept for interface compatibility

    Returns a float. If `actual` sums to zero the division yields nan/inf.
    Raises AssertionError if the lengths differ.
    """
    assert len(actual) == len(pred)
    n_obs = len(actual)
    # Columns: actual, pred, original position (used as the tie-breaker).
    # Builtin `float` replaces the `np.float` alias removed in NumPy 1.24.
    table = np.asarray(np.c_[actual, pred, np.arange(n_obs)], dtype=float)
    # lexsort uses the LAST key as primary: descending pred, then ascending position
    order = np.lexsort((table[:, 2], -1 * table[:, 1]))
    table = table[order]
    total_losses = table[:, 0].sum()
    gini_sum = table[:, 0].cumsum().sum() / total_losses

    gini_sum -= (n_obs + 1) / 2.
    return gini_sum / n_obs
def gini_normalized(a, p):
    """Gini of predictions `p` against actuals `a`, divided by the Gini of a perfect ranking (`a` scored by itself)."""
    model_gini = gini(a, p)
    perfect_gini = gini(a, a)
    return model_gini / perfect_gini


In [72]:
# Separate features and label, then standardise the features.
# The scaler is fitted on train only and reused for test.
feature_frame = train.drop(['id', 'target'], axis=1)
Y_train = train['target']

scaler = StandardScaler()
X_train = scaler.fit_transform(feature_frame)
X_test = scaler.transform(test.drop('id', axis=1))

In [20]:
# Coarse sweep over k for KNN, scored by 5-fold CV (AUC and normalised Gini).
n_neighbors = [300,500,600,700,800,1000,1200]
# No random_state: KFold only uses it when shuffle=True, and recent scikit-learn
# raises a ValueError when it is set without shuffling. The folds are contiguous
# slices of the already shuffled training data, exactly as before.
kf = KFold(n_splits=5)
gini_results = {}
auc_results = {}

for n_neighbor in n_neighbors:
    knn = KNeighborsClassifier(n_neighbors=n_neighbor, n_jobs=-1)
    for train_index, test_index in kf.split(X_train):
        # kf.split yields POSITIONS, so index the label Series positionally
        y_fit = Y_train.iloc[train_index]
        y_val = Y_train.iloc[test_index]
        knn.fit(X_train[train_index], y_fit)
        # Probability of the positive class on the held-out fold
        predict = knn.predict_proba(X_train[test_index])[:, 1]
        auc = roc_auc_score(y_val, predict)
        gini_score = gini_normalized(y_val, predict)
        gini_results.setdefault(n_neighbor, []).append(gini_score)
        auc_results.setdefault(n_neighbor, []).append(auc)
            

In [21]:
# Per-k summary statistics of the fold AUCs (one column per n_neighbors value)
pd.DataFrame(auc_results).describe()
# In the recorded run, n_neighbors=700 has the highest mean AUC of this sweep

Unnamed: 0,300,500,600,700,800,1000,1200
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,0.594381,0.595479,0.595496,0.595976,0.595706,0.595146,0.594967
std,0.005818,0.005686,0.006162,0.006202,0.006847,0.006581,0.006477
min,0.589047,0.589348,0.587704,0.587593,0.586601,0.586943,0.588274
25%,0.589243,0.590963,0.592607,0.593492,0.592937,0.5925,0.59115
50%,0.59261,0.594572,0.59403,0.594816,0.594068,0.592506,0.591676
75%,0.599393,0.599885,0.599712,0.600614,0.60124,0.601169,0.60077
max,0.601612,0.602629,0.603427,0.603362,0.603686,0.60261,0.602963


In [23]:
# Finer sweep over k between 700 and 800, same 5-fold protocol as the coarse sweep.
n_neighbors = [700,720,740,760,780,800]
# No random_state: KFold only uses it when shuffle=True, and recent scikit-learn
# raises a ValueError when it is set without shuffling. The folds are contiguous
# slices of the already shuffled training data, exactly as before.
kf = KFold(n_splits=5)
gini_results = {}
auc_results = {}

for n_neighbor in n_neighbors:
    knn = KNeighborsClassifier(n_neighbors=n_neighbor, n_jobs=-1)
    for train_index, test_index in kf.split(X_train):
        # kf.split yields POSITIONS, so index the label Series positionally
        y_fit = Y_train.iloc[train_index]
        y_val = Y_train.iloc[test_index]
        knn.fit(X_train[train_index], y_fit)
        # Probability of the positive class on the held-out fold
        predict = knn.predict_proba(X_train[test_index])[:, 1]
        auc = roc_auc_score(y_val, predict)
        gini_score = gini_normalized(y_val, predict)
        gini_results.setdefault(n_neighbor, []).append(gini_score)
        auc_results.setdefault(n_neighbor, []).append(auc)
            

In [24]:
# Per-k summary statistics of the fold AUCs (one column per n_neighbors value)
pd.DataFrame(auc_results).describe()
# 700 is retained: in the recorded run 740 is only ~2e-6 higher in mean AUC,
# far below the fold-to-fold std (~0.006)

Unnamed: 0,700,720,740,760,780,800
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,0.595976,0.595962,0.595978,0.595893,0.595818,0.595706
std,0.006202,0.006602,0.006577,0.006729,0.006647,0.006847
min,0.587593,0.587054,0.587152,0.586476,0.586759,0.586601
25%,0.593492,0.593564,0.593496,0.5938,0.593415,0.592937
50%,0.594816,0.594393,0.594396,0.594611,0.594451,0.594068
75%,0.600614,0.601,0.601124,0.600779,0.600696,0.60124
max,0.603362,0.603799,0.603724,0.603801,0.60377,0.603686


#### train['target'].shape

In [77]:
# Fit the final model on the full (scaled) training set and score the test set
final_params = dict(n_neighbors=700, n_jobs=-1, algorithm='kd_tree')
knn = KNeighborsClassifier(**final_params)
knn.fit(X_train, train['target'])

# Column 1 of predict_proba holds the positive-class probability
test_proba = knn.predict_proba(X_test)
predict = test_proba[:, 1]

In [78]:
# Submission file: one row per test id with its predicted probability
sub = pd.DataFrame({'id': test_id, 'target': predict})
sub.to_csv('knn_submit.csv', float_format='%.6f', index=False)

In [253]:
# Sanity check: the test frame and the prediction vector should have the same number of rows
test.shape,predict.shape

((892816, 58), (892816,))

In [80]:
# Build a small metadata table describing every column of train:
# its role (target / id / input), measurement level and dtype.
data = []
for f in train.columns:
    # Defining the role
    if f == 'target':
        role = 'target'
    elif f == 'id':
        role = 'id'
    else:
        role = 'input'

    # Defining the data type
    dtype = train[f].dtype

    # Defining the level (name-based rules first, then dtype-based)
    if 'bin' in f or f == 'target':
        level = 'binary'
    elif 'cat' in f or f == 'id':
        level = 'nominal'
    elif pd.api.types.is_float_dtype(dtype):
        level = 'interval'
    elif pd.api.types.is_integer_dtype(dtype):
        level = 'ordinal'
    else:
        # Explicit fallback so a column never inherits the level of the previous one
        level = 'unknown'

    # Creating a Dict that contains all the metadata for the variable.
    # The former 'keep' entry referenced a variable that was never defined
    # (NameError on a fresh kernel) and was not part of the output columns,
    # so it has been removed.
    f_dict = {
        'varname': f,
        'role': role,
        'level': level,
        'dtype': dtype
    }
    data.append(f_dict)

meta = pd.DataFrame(data, columns=['varname', 'role', 'level',  'dtype'])
meta.set_index('varname', inplace=True)

In [85]:
# Write the transposed metadata table (one column per variable) to disk
meta.T.to_csv('asdf.csv')