In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import csv
import time
from sklearn.linear_model import LogisticRegression #logistic regression
from sklearn import svm #support vector Machine
from sklearn.ensemble import RandomForestClassifier #Random Forest
from sklearn.neighbors import KNeighborsClassifier #KNN
from sklearn.naive_bayes import GaussianNB #Naive bayes
from sklearn.tree import DecisionTreeClassifier #Decision Tree
from sklearn.model_selection import train_test_split #training and testing data split
from sklearn import metrics #accuracy measure
from sklearn.metrics import confusion_matrix #for confusion matrix
from sklearn import datasets, neighbors, linear_model, preprocessing
from sklearn.model_selection import learning_curve, ShuffleSplit
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib
from sklearn.metrics import precision_recall_fscore_support
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

from collections import Counter
from imblearn import under_sampling, over_sampling
from imblearn.under_sampling import RandomUnderSampler
import xgboost
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold



In [2]:
data = pd.read_csv("train.csv")
data.head()


Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


In [3]:
data["target"].mean()

0.036447517859182946

# Classification

- Data normalization
- data split into train and cv set

Code for gini (evaluation per the problem description). 
- For the best accuracy, we want a normalized gini score to be as close as possible to 1. 
- The gini score for a random classifier is 0. 
- Also, if we get a negative gini score, we can just reverse the outcome to get a positive gini score of the same magnitude. 
- Gini score is a good metric for a skewed dataset such as this one. 
- For the evaluation, it needs the 'probability' from the classifier, not just the prediction (0 or 1)! 

In [3]:
#helper functions
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

#results is a list of dictionaries
def evaluate_clf(estimator, X_train, y_train, X_cv, y_cv, filename):
    estimator_name = str(estimator).split("(")[0]
    fit = estimator.fit(X_train, y_train)
    accuracy = fit.score(X_cv, y_cv)
    gini_cv = gini_normalized(y_cv, estimator.predict_proba(X_cv)[:,1])
    gini_train = gini_normalized(y_train, estimator.predict_proba(X_train)[:,1])
    precision, recall, fscore, support = precision_recall_fscore_support(pd.Series(y_cv).values, estimator.predict(X_cv), pos_label=1, average='binary')
    #print(precision)
    filename = filename + '.pkl'
    joblib.dump(estimator, filename)
    results_list = [estimator_name, filename, accuracy, gini_cv, gini_train, precision, recall, fscore]
    return results_list

Let's normalize the data using scikitlearn preprocessing. Each column will have a mean of 0 and variance of 1. 
The output of normalizing is a numpy array, so we convert it back into a pandas dataframe. Note that we will not be normalizing 'y' values. 


In [None]:
scaler = StandardScaler()
X_np = scaler.fit_transform(data.drop(["target", "id"], axis=1))
print(X_np.shape)
normalized_data = pd.DataFrame(data=X_np, columns=data.drop(["target", "id"], axis=1).columns)
normalized_data = pd.concat([normalized_data, data["target"]], axis=1)
normalized_data.describe()
train,cv=train_test_split(normalized_data,test_size = 0.1, random_state=0,stratify=normalized_data['target'])
train.describe()
X_train = train.drop("target", axis=1)
y_train = train["target"]
X_cv = cv.drop("target", axis = 1)
y_cv = cv["target"]

1. Vanilla logistic regression has a gini score of 0.2066 - much better than a random classifier.
2. The first strategy to deal with imbalanced dataset is using class_weight = "balanced" in the classifier. This will automatically give more weight to the few positives in the data. This results in a gini score of 0.22 - much better than a random classifier!

In [None]:
#trial run
estimator = LogisticRegression(C=1, class_weight = "balanced")
#estimator = LogisticRegression(C=1)
fit = estimator.fit(X_train, y_train)
score = fit.score(X_cv, y_cv)
print(score)
print(classification_report(y_cv, estimator.predict(X_cv)))
gini_cv = gini_normalized(y_cv, estimator.predict_proba(X_cv)[:,1])
gini_train = gini_normalized(y_train, estimator.predict_proba(X_train)[:,1])
print([gini_train, gini_cv])

In [None]:
cols = ['estimator_name', 'filename', 'accuracy', 'gini_cv', 'gini_train', 'precision', 'recall', 'fscore']
results_file = open('resultsfile.csv','a+')
line = ",".join(cols)
results_file.write(line + '\n')
results_file.close()
results = pd.DataFrame([], columns = cols)

In [None]:
estimators = [LogisticRegression(C = 1, class_weight = "balanced"), svm.LinearSVC(class_weight = "balanced", verbose = 2), DecisionTreeClassifier(max_depth = 3, class_weight = "balanced"),DecisionTreeClassifier(max_depth = 5, class_weight = "balanced"), RandomForestClassifier(max_depth=2, random_state=0, class_weight = "balanced", verbose = 2), KNeighborsClassifier(n_neighbors=3)]
filenames = ['logistic_C1', 'SVC_C1_linear', 'DecTree_3','DecTree_5','RandForest', 'KNeigh_3']

#for i in range(2,len(estimators)):
for i in [0,2,3,4]:
    estimator = estimators[i]
    filename = filenames[i]
    results_list =  evaluate_clf(estimator, X_train, y_train, X_cv, y_cv, filename)
    print(results_list)
    df = pd.DataFrame([results_list], columns = cols)
    results = pd.concat([results, df],ignore_index=True)
    line = str(results_list)[1:-1]
    results_file = open('resultsfile.csv','a')
    results_file.write(line + '\n')
    results_file.close()
    
print(results)

In [None]:
i=1
estimator = estimators[i]
filename = filenames[i]
results_list =  evaluate_clf(estimator, X_train, y_train, X_cv, y_cv, filename)
print(results_list)
df = pd.DataFrame([results_list], columns = cols)
results = pd.concat([results, df],ignore_index=True)

In [None]:
clf = joblib.load(results['filename'].iloc[0])

In [None]:
mbk = MiniBatchKMeans(init='k-means++', n_clusters=8, batch_size=1000,
                      n_init=10, max_no_improvement=10, verbose=2)
t0 = time.time()
mbk.fit(X_train)
t_mini_batch = time.time() - t0

# Feature reduction

In [None]:
data = pd.read_csv("train.csv")
cols_trimmed = [x for x in data.columns if 'calc' not in x]
data_1 = data[cols_trimmed].drop("id", axis = 1)
scaler = StandardScaler()
X_np = scaler.fit_transform(data_1.drop("target", axis=1))
print(X_np.shape)
norm_X_data_1 = pd.DataFrame(data=X_np, columns=data_1.drop("target", axis=1).columns)
#normalized_data = pd.concat([normalized_data, data["target"]], axis=1)
norm_X_data_1.describe()

# Visualization

In [None]:
def fit_plotpca(xdata, ydata, n_comp = 20):
    pca = PCA(n_components=n_comp, svd_solver='full', random_state=1001)
    X = pca.fit_transform(xdata)
    print("variance explained in " + str(n_comp) + " components is " + str(pca.explained_variance_ratio_.sum()))
    print(pca.explained_variance_ratio_)
    print("variance for first 2 compoenents = " + str(pca.explained_variance_ratio_[:2].sum()))

    #making the plot
    y = ydata
    colors = ['b', 'r']
    target_names = np.unique(y)
    for color, i, target_name in zip(colors, [0, 1], target_names):
    #for color, i, target_name in zip([colors[0]], [0], [target_names[0]]):
        plt.scatter(X[y == i, 0], X[y == i, 1], color=color, s=1, alpha=.8, label=target_name, marker='.')
    plt.legend(loc='best', shadow=False, scatterpoints=3)
    plt.title( "Scatter plot of the training data projected on the 1st "
        "and 2nd principal components")
    plt.xlabel("Principal axis 1 - Explains %.1f %% of the variance" % (
        pca.explained_variance_ratio_[0] * 100.0))
    plt.ylabel("Principal axis 2 - Explains %.1f %% of the variance" % (
        pca.explained_variance_ratio_[1] * 100.0))

    #plt.savefig('pca-porto-03.png', dpi=150)
    plt.show()
    return X

In [None]:
fit_plotpca(normalized_data.drop(['target'], axis = 1),data['target'].values.astype(np.int8),n_comp = 30)

In [None]:
fit_plotpca(norm_X_data_1, data_1['target'].values.astype(np.int8), n_comp = 20)

In [None]:
y = data_1['target'].values.astype(np.int8)
colors = ['b', 'r']
target_names = np.unique(y)
#for color, i, target_name in zip(colors, [0, 1], target_names):
for color, i, target_name in zip([colors[0]], [0], [target_names[0]]):
    plt.scatter(X[y == i, 0], X[y == i, 1], color=color, s=1,
                alpha=.8, label=target_name, marker='.')
plt.legend(loc='best', shadow=False, scatterpoints=3)
plt.title(
        "Scatter plot of the training data projected on the 1st "
        "and 2nd principal components")
plt.xlabel("Principal axis 1 - Explains %.1f %% of the variance" % (
        pca.explained_variance_ratio_[0] * 100.0))
plt.ylabel("Principal axis 2 - Explains %.1f %% of the variance" % (
        pca.explained_variance_ratio_[1] * 100.0))

#plt.savefig('pca-porto-03.png', dpi=150)
plt.show()

We tried to visualize the data by doing a PCA. Selecting 20 features on the original dataset after normalization explains ~50% of the variability. For the reduced and normalized dataset - data_1, we can explain 79% of the variability. In the visualization in 2D, we will only be able to see 16.7 % of the variability. 

Plotting the data in 2D shows that the points labeled 1 clearly lie in the red cluster. Bad news is, this cluster lies entirely on top of the blue cluster - there is no separation, at least in these coordinates! Perhaps this is why we were not able to reach accuracies higher than ~50%. 


We tried to visualize with t-SNE which can discover more complex features, but the code was aborted because too slow. 

In [None]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, perplexity=50.0, learning_rate=200.0, n_iter=1000, n_iter_without_progress=300, random_state=0)
tsne.fit_transform(X)

# Imbalanced Learn

With the current dataset, linearSVM and kneighbors are both too slow - could not get them to run at all on my machine. Let us simplify the dataset.
1. Features - remove all features that have calc in the name - as we had seen earlier using our heatmaps, these features have 0 correlation with the target.
2. balancing the data. We will choose all of the row that have target = 1 and choose that number of rows with target = 0. So we will go from an imbalance of 1:32 to 1:1. Our data will also be a lot smaller, so should be much easier to train.

In [None]:
rus = RandomUnderSampler(ratio='majority', return_indices=False, random_state=None, replacement=False)
X_resampled, y_resampled = rus.fit_sample(data_1.drop(['target'], axis = 1), data['target'])
print(sorted(Counter(y_resampled).items()))

In [None]:
scaler = StandardScaler()
X_np = scaler.fit_transform(X_resampled)
norm_X_resampled = pd.DataFrame(data=X_np, columns=data_1.drop(["target"], axis=1).columns)
norm_Y_resampled = pd.DataFrame(data=y_resampled, columns = ['target'])
norm_data_resampled = pd.concat([norm_X_resampled, norm_Y_resampled], axis=1)
norm_data_resampled.describe()

In [None]:
train,cv=train_test_split(norm_data_resampled,test_size = 0.3, random_state=0,stratify=norm_data_resampled['target'])
X_train = train.drop("target", axis=1)
y_train = train["target"]
X_cv = cv.drop("target", axis = 1)
y_cv = cv["target"]

In [None]:
cols = ['estimator_name', 'filename', 'accuracy', 'gini_cv', 'gini_train', 'precision', 'recall', 'fscore']
results_file = open('resultsfile2.csv','a+')
line = ",".join(cols)
results_file.write(line + '\n')
results_file.close()
results = pd.DataFrame([], columns = cols)

In [None]:
from sklearn.calibration import CalibratedClassifierCV
estimators = [CalibratedClassifierCV(svm.LinearSVC(class_weight = "balanced", verbose = 2)), DecisionTreeClassifier(max_depth = 7, class_weight = "balanced"),DecisionTreeClassifier(max_depth = 9, class_weight = "balanced"),KNeighborsClassifier(n_neighbors=5)]
filenames = ['SVC_C1_linear_1', 'DecTree_7_1','DecTree_9_1','KNeigh_5_1']
estimators = [GaussianNB()]
filenames = ['GaussianNB_1']
estimators = [LogisticRegression(C = 1, class_weight = "balanced"), CalibratedClassifierCV(svm.LinearSVC(class_weight = "balanced", verbose = 2)), DecisionTreeClassifier(max_depth = 3, class_weight = "balanced"),DecisionTreeClassifier(max_depth = 5, class_weight = "balanced"), RandomForestClassifier(max_depth=2, random_state=0, class_weight = "balanced", verbose = 2), KNeighborsClassifier(n_neighbors=3)]
filenames = ['logistic_C1_1', 'SVC_C1_linear_1', 'DecTree_3_1','DecTree_5_1','RandForest_1', 'KNeigh_3_1']
#for i in range(0,len(estimators)):
#for i in [0,2,3,4,5]:
for i in range(0,1):
    estimator = estimators[i]
    filename = filenames[i]
    results_list =  evaluate_clf(estimator, X_train, y_train, X_cv, y_cv, filename)
    print(results_list)
    df = pd.DataFrame([results_list], columns = cols)
    results = pd.concat([results, df],ignore_index=True)
    line = str(results_list)[1:-1]
    results_file = open('resultsfile2.csv','a')
    results_file.write(line + '\n')
    results_file.close()
    
print(results)

Let's try visualizations again on this reduced dataset. 

In [None]:
X = fit_plotpca(norm_data_resampled.drop("target", axis = 1),norm_data_resampled['target'].values.astype(np.int8), 33)

This still looks like a decent representation of the original dataset, but it is much smaller in size. We have gone from 600,000 to 40,000. Let's try to reduce this further to size of 5000 by undersampling both classes so that we can quickly try a variety of nonlinear models or non linear features. 

In [None]:
choose = np.random.randint(0, norm_data_resampled.shape[0]-1, size=40000)
norm_data_resampled_2 = norm_data_resampled.iloc[choose[:20000]]
norm_data_resampled_2_cv = norm_data_resampled.iloc[choose[20000:]]
n_comp = 33
X = fit_plotpca(norm_data_resampled_2.drop("target", axis = 1),norm_data_resampled_2['target'].values.astype(np.int8), n_comp)
y = norm_data_resampled_2['target'].values.astype(np.int8)

We are at 5000 datapoints and the data still looks somewhat similar in PCA! Now we can try some fancy nonlinear stuff. 

# Non-linear features/algorithms

Attempt non-linear SVM with Kernel trick on the reduced dataset of size 5000. 

In [None]:
#results = pd.DataFrame([], columns = cols)
cols = ['estimator_name', 'filename', 'accuracy', 'gini_cv', 'gini_train', 'precision', 'recall', 'fscore']
estimators = [svm.SVC(C=1.0, kernel='rbf', gamma=0.01, probability = True), svm.SVC(C=1.0, kernel='rbf', gamma=0.1, probability = True), svm.SVC(C=1.0, kernel='rbf', gamma=100, probability = True)]
filenames = ['SVM_C1_rbf0.01','SVM_C1_rbf0.1', 'SVM_C1_rbf100']
#for i in range(0,len(estimators)):
#for i in [0,2,3,4,5]:
for i in range(0,1):
    estimator = estimators[i]
    filename = filenames[i]
    t0 = time.time()
    results_list =  evaluate_clf(estimator, norm_data_resampled_2.drop('target', axis = 1), norm_data_resampled_2['target'], norm_data_resampled_2_cv.drop('target', axis = 1), norm_data_resampled_2_cv['target'], filename)
    t = time.time() - t0
    print("time = " + str(t) + " s.")
    print(results_list)
    df = pd.DataFrame([results_list], columns = cols)
    results = pd.concat([results, df],ignore_index=True)
    line = str(results_list)[1:-1]
    results_file = open('resultsfile2.csv','a')
    results_file.write(line + '\n')
    results_file.close()
    
    
print(results)

From the first 33 principal components, of the dataset of size 5000, I will generate 33C2 additional features = 528 additional features. Then I will try PCA again.

In [None]:
import itertools
iterator = list(itertools.combinations(range(0,n_comp), 2))

In [None]:
X_expand = X
total = X.shape[1] + len(iterator)
for cols in iterator:
    #cols = iterator[1]
    feature = np.reshape(X[:,cols[0]]*X[:,cols[1]], (-1,1))
    X_expand = np.append(X_expand, feature, axis = 1)
X_expand.shape
#renormalize data
scaler = StandardScaler()
X_expand = scaler.fit_transform(X_expand)

In [None]:
X_expand = pd.DataFrame(data = X_expand, columns = range(0, total))
X_expand.describe()

In [None]:
X = fit_plotpca(X_expand,norm_data_resampled_2['target'].values.astype(np.int8), 100)

In [None]:
X[y == 0, 0].shape

In [None]:
    
    y = norm_data_resampled_2['target'].values.astype(np.int8)
    colors = ['b', 'r']
    target_names = np.unique(y)
    for color, i, target_name in zip(colors, [0, 1], target_names):
    #for color, i, target_name in zip([colors[0]], [0], [target_names[0]]):
        plt.scatter(X[y == i, 0], X[y == i, 1], color=color, s=1, alpha=0.8, label=target_name, marker='.')
    plt.legend(loc='best', shadow=False, scatterpoints=3)
    plt.title( "Scatter plot of the training data projected on the 1st "
        "and 2nd principal components")
    plt.xlabel("Principal axis 1 - Explains %.1f %% of the variance" % (
        pca.explained_variance_ratio_[0] * 100.0))
    plt.ylabel("Principal axis 2 - Explains %.1f %% of the variance" % (
        pca.explained_variance_ratio_[1] * 100.0))
    plt.axis([-5,5,-5,5])
    #plt.savefig('pca-porto-03.png', dpi=150)
    plt.show()

This doesn't look like it helped at all in separating the two classes. We will stick to 33 features from the original PCA to try out other classification algos. 

In [None]:
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(kernel="poly", n_jobs = -1)
X_kpca = kpca.fit_transform(norm_data_resampled_2.drop("target", axis = 1))

In [None]:
X_polypca = X_kpca
reds = y == 0
blues = y == 1
plt.figure()
plt.subplot(1, 1, 1, aspect='equal')
plt.scatter(X_kpca[reds, 0], X_kpca[reds, 1], c="red", s=40)
plt.scatter(X_kpca[blues, 0], X_kpca[blues, 1], c="blue", s=20)
plt.title("Projection by KPCA")
plt.xlabel("1st principal component in space induced by $\phi$")
plt.ylabel("2nd component")
#plt.subplots_adjust(0.02, 0.10, 0.98, 0.94, 0.04, 0.35)

plt.show()

The first attempt at kPCA did not yield much.

In [None]:
kpca = KernelPCA(kernel="sigmoid", n_jobs = -1)
X_kpca = kpca.fit_transform(norm_data_resampled_2.drop("target", axis = 1))
X_sigmoidpca = X_kpca
reds = y == 0
blues = y == 1
plt.figure()
plt.subplot(1, 1, 1, aspect='equal')
plt.scatter(X_kpca[reds, 0], X_kpca[reds, 1], c="red", s=40)
plt.scatter(X_kpca[blues, 0], X_kpca[blues, 1], c="blue", s=20)
plt.title("Projection by KPCA")
plt.xlabel("1st principal component in space induced by $\phi$")
plt.ylabel("2nd component")
#plt.subplots_adjust(0.02, 0.10, 0.98, 0.94, 0.04, 0.35)

plt.show()

Sigmoid kernel gives 2 clouds, one pretty much on top of the other. We are looking for separation, so this doesn't help. 

In [None]:
kpca = KernelPCA(kernel="rbf", n_jobs = -1)
X_kpca = kpca.fit_transform(norm_data_resampled_2.drop("target", axis = 1))
X_gausspca = X_kpca
reds = y == 0
blues = y == 1


The gaussian kernel seems like it helped somewhat, compared to what we saw for the sigmoid kernel. Perhaps some more tuning might help get better separation? First, let's replot this same one with smaller point sizes so we can more clearly see if there is really a separation. 

In general, I see more reds towards the left of the figure than to the right. 

In [None]:
plt.figure()
plt.subplot(1, 1, 1, aspect='equal')
plt.scatter(X_kpca[reds, 0], X_kpca[reds, 1], c="red", s=10)
plt.scatter(X_kpca[blues, 0], X_kpca[blues, 1], c="blue", s=5)
plt.title("Projection by KPCA")
plt.xlabel("1st principal component in space induced by $\phi$")
plt.ylabel("2nd component")
#plt.subplots_adjust(0.02, 0.10, 0.98, 0.94, 0.04, 0.35)

plt.show()

In [None]:
plt.figure()
plt.subplot(1, 1, 1, aspect='equal')
plt.scatter(X_kpca[blues, 0], X_kpca[blues, 1], c="blue", s=10)
plt.scatter(X_kpca[reds, 0], X_kpca[reds, 1], c="red", s=5)
plt.title("Projection by KPCA")
plt.xlabel("1st principal component in space induced by $\phi$")
plt.ylabel("2nd component")
#plt.subplots_adjust(0.02, 0.10, 0.98, 0.94, 0.04, 0.35)

plt.show()

In [None]:
gammas = [1, 10, 20] 
for g in gammas:
    kpca = KernelPCA(kernel="rbf", n_jobs = -1, gamma = g )
    X_kpca = kpca.fit_transform(norm_data_resampled_2.drop("target", axis = 1))
    reds = y == 0
    blues = y == 1
    plt.figure()
    plt.subplot(1, 1, 1, aspect='equal')
    plt.scatter(X_kpca[blues, 0], X_kpca[blues, 1], c="blue", s=10)
    plt.scatter(X_kpca[reds, 0], X_kpca[reds, 1], c="red", s=5)
    plt.title("Projection by KPCA, gamma = " + str(g))
    plt.xlabel("1st principal component in space induced by $\phi$")
    plt.ylabel("2nd component")
#plt.subplots_adjust(0.02, 0.10, 0.98, 0.94, 0.04, 0.35)
    plt.show()


In [None]:
X_pca_20 = X_kpca
plt.figure()
plt.subplot(1, 1, 1, aspect='equal')
plt.scatter(X_kpca[blues, 0], X_kpca[blues, 1], c="blue", s=10)
plt.scatter(X_kpca[reds, 0], X_kpca[reds, 1], c="red", s=5)
plt.title("Projection by KPCA, gamma = " + str(g))
plt.xlabel("1st principal component in space induced by $\phi$")
plt.ylabel("2nd component")
plt.axis([-.0025, 0.0025, -0.0025, 0.0025])
#plt.subplots_adjust(0.02, 0.10, 0.98, 0.94, 0.04, 0.35)
plt.show()

In [None]:
len(kpca.lambdas_)

In [None]:
gammas = [.001, .1, 50]
kpca = [0,0,0]
for g,i in zip(gammas, range(0,3)):
    kpca[i] = KernelPCA(kernel="rbf", n_jobs = -1, gamma = g )
    X_kpca = kpca[i].fit_transform(norm_data_resampled_2.drop("target", axis = 1))
    reds = y == 0
    blues = y == 1
    plt.figure()
    plt.subplot(1, 1, 1, aspect='equal')
    plt.scatter(X_kpca[blues, 0], X_kpca[blues, 1], c="blue", s=10)
    plt.scatter(X_kpca[reds, 0], X_kpca[reds, 1], c="red", s=5)
    plt.title("Projection by KPCA, gamma = " + str(g))
    plt.xlabel("1st principal component in space induced by $\phi$")
    plt.ylabel("2nd component")
#plt.subplots_adjust(0.02, 0.10, 0.98, 0.94, 0.04, 0.35)
    plt.show()

In [None]:
for g,i in zip(gammas, range(0,3)):
    #kpca[i] = KernelPCA(kernel="rbf", n_jobs = -1, gamma = g )
    X_kpca = kpca[i].transform(norm_data_resampled_2.drop("target", axis = 1))
    reds = y == 0
    blues = y == 1
    plt.figure()
    plt.subplot(1, 1, 1, aspect='equal')
    plt.scatter(X_kpca[blues, 0], X_kpca[blues, 1], c="blue", s=10)
    #plt.scatter(X_kpca[reds, 0], X_kpca[reds, 1], c="red", s=5)
    plt.title("Projection by KPCA, gamma = " + str(g))
    plt.xlabel("1st principal component in space induced by $\phi$")
    plt.ylabel("2nd component")
#plt.subplots_adjust(0.02, 0.10, 0.98, 0.94, 0.04, 0.35)
    plt.show()

In [None]:
gammas = [20]
kpca = [0,0,0]
for g,i in zip(gammas, range(0, len(gammas))):
    kpca[i] = KernelPCA(kernel="rbf", n_jobs = -1, gamma = g, n_components = 100 )
    X_kpca = kpca[i].fit_transform(norm_data_resampled_2.drop("target", axis = 1))
    X_back = kpca.inverse_transform(X_kpca)
    reds = y == 0
    blues = y == 1
    plt.figure()
    plt.subplot(2, 2, 1, aspect='equal')
    plt.scatter(X_kpca[blues, 0], X_kpca[blues, 1], c="blue", s=10)
    plt.scatter(X_kpca[reds, 0], X_kpca[reds, 1], c="red", s=5)
    plt.title("Projection by KPCA, gamma = " + str(g))
    plt.xlabel("1st principal component in space induced by $\phi$")
    plt.ylabel("2nd component")
#plt.subplots_adjust(0.02, 0.10, 0.98, 0.94, 0.04, 0.35)
    plt.show()

# Pipeline

In [4]:
def drop_cols(X):
    cols_trimmed = [i for i in range(0, len(data.drop('target', axis = 1).columns)) if 'calc' not in data.columns[i] and 'id' not in data.columns[i]]
    X_trimmed = X[:, cols_trimmed]
    return X_trimmed


In [None]:
#pipe = make_pipeline(FunctionTransformer(drop_cols), StandardScaler(), svm.SVC(kernel='rbf', probability = True)) 
pipe = make_pipeline(FunctionTransformer(drop_cols), StandardScaler(), LogisticRegression(class_weight = "balanced")) 
param_grid = dict(logisticregression__C = [0.01, 1, 100])
grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring='roc_auc', refit=True, cv=5, verbose=2)

In [None]:
grid_search.fit(data.drop('target', axis = 1),data['target'])
grid_search.best_estimator_.steps[2]
resultsdf = pd.DataFrame(grid_search.cv_results_)
resultsdf

In [None]:
predicted = grid_search.predict_proba(data.drop('target', axis = 1))
gini = gini_normalized(data['target'], predicted)

In [6]:
clf = xgboost.XGBClassifier()
pipe2 = make_pipeline(FunctionTransformer(drop_cols), clf) 
pipe2.steps

[('functiontransformer', FunctionTransformer(accept_sparse=False,
            func=<function drop_cols at 0x1a173366a8>, inv_kw_args=None,
            inverse_func=None, kw_args=None, pass_y='deprecated',
            validate=True)),
 ('xgbclassifier',
  XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
         gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
         min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
         objective='binary:logistic', reg_alpha=0, reg_lambda=1,
         scale_pos_weight=1, seed=0, silent=True, subsample=1))]

In [8]:
from sklearn.model_selection import StratifiedKFold
#param_grid = dict(xgbclassifier__n_estimators = range(50, 400, 50), xgbclassifier__max_depth = [3,5,7], scale_pos_weight=[25, 35, 40])
#kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
param_grid = dict(xgbclassifier__n_estimators = range(50, 100, 50), xgbclassifier__max_depth = [3], xgbclassifier__scale_pos_weight=[25])
kfold = StratifiedKFold(n_splits=2, shuffle=True, random_state=7)
grid_search = GridSearchCV(pipe2, param_grid, scoring="roc_auc", n_jobs=-1, cv=kfold, refit=True, verbose=2)
grid_search.fit(data.drop('target', axis = 1),data['target'])

Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV] xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50, xgbclassifier__scale_pos_weight=25 
[CV] xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50, xgbclassifier__scale_pos_weight=25 
[CV]  xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50, xgbclassifier__scale_pos_weight=25, total=   6.8s
[CV]  xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50, xgbclassifier__scale_pos_weight=25, total=   6.9s


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    8.9s finished


IndexError: list index out of range

In [9]:
grid_search.best_estimator_.steps[1]
resultsdf2 = pd.DataFrame(grid_search.cv_results_)
resultsdf2



Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_xgbclassifier__max_depth,param_xgbclassifier__n_estimators,param_xgbclassifier__scale_pos_weight,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,5.98209,0.874085,0.633282,0.645942,3,50,25,"{'xgbclassifier__max_depth': 3, 'xgbclassifier...",1,0.633133,0.646222,0.633432,0.645661,0.175571,0.130244,0.000149,0.000281


In [10]:
predicted = grid_search.predict_proba(data.drop('target', axis = 1))
gini = gini_normalized(data['target'], predicted)

In [11]:
gini

-0.28171316094087606

In [None]:
param_grid = dict(xgbclassifier__n_estimators = range(50, 400, 50), xgbclassifier__max_depth = [3,5,7], xgbclassifier__scale_pos_weight=[25, 35, 40])
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
#param_grid = dict(xgbclassifier__n_estimators = range(50, 100, 50), xgbclassifier__max_depth = [3], scale_pos_weight=[25])
#kfold = StratifiedKFold(n_splits=2, shuffle=True, random_state=7)
grid_search = GridSearchCV(pipe2, param_grid, scoring="roc_auc", n_jobs=-1, cv=kfold, refit=True, verbose=2)
grid_search.fit(data.drop('target', axis = 1),data['target'])
grid_search.best_estimator_.steps[1]
resultsdf3 = pd.DataFrame(grid_search.cv_results_)
resultsdf3

Fitting 5 folds for each of 63 candidates, totalling 315 fits
[CV] xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50, xgbclassifier__scale_pos_weight=25 
[CV] xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50, xgbclassifier__scale_pos_weight=25 
[CV] xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50, xgbclassifier__scale_pos_weight=25 
[CV] xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50, xgbclassifier__scale_pos_weight=25 
[CV] xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50, xgbclassifier__scale_pos_weight=25 
[CV] xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50, xgbclassifier__scale_pos_weight=35 
[CV] xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50, xgbclassifier__scale_pos_weight=35 
[CV] xgbclassifier__max_depth=3, xgbclassifier__n_estimators=50, xgbclassifier__scale_pos_weight=35 


In [None]:
predicted = grid_search.predict_proba(data.drop('target', axis = 1))
gini = gini_normalized(data['target'], predicted)

# Making predictions on test data

In [None]:
test = pd.read_csv('test.csv')
cols_trimmed = [x for x in test.columns if 'calc' not in x and 'id' not in x]
test_1 = test[cols_trimmed]
scaler = StandardScaler()
X_np = scaler.fit_transform(test_1)
norm_test = pd.DataFrame(data=X_np, columns=cols_trimmed)
clf = joblib.load('SVM_C1_rbf0.01.pkl')
t0= time.time()
y = clf.predict_proba(norm_test)
t = time.time()-t0
print(t)

In [None]:
test = pd.read_csv('test.csv')
y = grid_search.predict_proba(test)
df = pd.DataFrame.from_items([("target", y[:,1])])
test_results = pd.concat([test["id"], df], axis = 1)
test_results.to_csv("submission_logistic.csv", index = False)