In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn import metrics

%matplotlib inline

In [3]:
# Load the training frame and the held-out frame (valid.csv) in that order.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/valid.csv')
)

In [4]:
# Assign features (X) and labels (y) for the training and the held-out frame.
# TARGET is the label column; SK_ID_CURR is excluded from the features.
y_train = train['TARGET']
X_train = pd.get_dummies(train.drop(['TARGET', 'SK_ID_CURR'], axis=1), dummy_na=True)

# The evaluation cells further down read X_test / y_test, which were never
# defined anywhere in the notebook (NameError on a fresh kernel). Build them
# from `test` (loaded from valid.csv).
# NOTE(review): assumes valid.csv carries the same TARGET / SK_ID_CURR columns
# as train.csv - confirm.
y_test = test['TARGET']
X_test = (
    pd.get_dummies(test.drop(['TARGET', 'SK_ID_CURR'], axis=1), dummy_na=True)
    # One-hot encoding each frame separately can produce different column
    # sets; align to the training columns so the model sees the layout it was
    # fit on (dummies absent here become 0, unseen extras are dropped).
    .reindex(columns=X_train.columns, fill_value=0)
)

In [5]:
# Gradient-boosting classifier with the tuned hyper-parameters.
# max_features=150 makes each split consider a random subset of features, so
# the fit is stochastic; random_state pins it so that Restart & Run All
# reproduces the same model and scores.
gboost_optimized = GradientBoostingClassifier(n_estimators=500,
                                             learning_rate=0.1,
                                             min_samples_split=15000,
                                             min_samples_leaf=1000,
                                             max_depth=5,
                                             subsample=1.0,
                                             max_features=150,
                                             random_state=42)

In [None]:
# Fit on the training features; remaining NaNs are replaced with 0 first.
gboost_optimized.fit(X=X_train.fillna(0), y=y_train)

In [None]:
# Score the held-out features (NaNs -> 0, as for training):
# per-class probabilities and hard class labels.
pred_prob = gboost_optimized.predict_proba(X_test.fillna(value=0))
predictions = gboost_optimized.predict(X_test.fillna(value=0))

In [None]:
# Text report of the hard-label predictions against the true labels.
print(classification_report(y_true=y_test, y_pred=predictions))

In [None]:
# Confusion matrix of the hard-label predictions against the true labels.
print(confusion_matrix(y_true=y_test, y_pred=predictions))

In [None]:
# ROC AUC from the second probability column
# (presumably the TARGET == 1 class - verify against gboost_optimized.classes_).
roc_auc_score(y_true=y_test, y_score=pred_prob[:, 1])

In [None]:
# ROC curve points computed from the second probability column.
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=pred_prob[:, 1])

In [None]:
# Draw the ROC curve from the fpr/tpr arrays, with the AUC shown in the legend.
roc_fig, roc_ax = plt.subplots(figsize=(16, 8))
roc_ax.set_title('Receiver Operating Characteristic')
roc_ax.plot(fpr, tpr, label='AUC = %0.4f' % roc_auc_score(y_test, pred_prob[:, 1]))
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend(loc='upper left')

In [None]:
# Rank the fitted model's feature importances and plot the 50 largest.
# The labels must come from X_train.columns (the frame the model was fit on).
# The original used `X`, which is only defined in a later cell and holds the
# submission features, so on a fresh kernel it raised NameError.
summary = pd.Series(gboost_optimized.feature_importances_, index=X_train.columns)
summary = summary.sort_values(ascending=False)

top_50 = summary[:50]
plt.figure(figsize=(10,10))
sns.barplot(y=top_50.index.values, x=top_50.values)

In [79]:
# Load the zipped submission-set CSV; compression is inferred from the file name.
test_csv = pd.read_csv(
    filepath_or_buffer='../input/application_test.csv.zip',
    compression='infer',
)

In [80]:
# Index the submission frame by SK_ID_CURR.
# X is an alias of test_csv (not a copy), so test_csv is re-indexed too; the
# submission cell relies on that when it reads test_csv.index.
X = test_csv
# Guard: once SK_ID_CURR has been moved into the index it is no longer a
# column, and an unguarded set_index would raise KeyError on re-running the cell.
if 'SK_ID_CURR' in X.columns:
    X.set_index('SK_ID_CURR', inplace=True)

In [83]:
# One-hot encode the submission features.
# SK_ID_CURR was already moved into the index by the preceding cell, so
# dropping it as a column raised KeyError; errors='ignore' makes the drop a
# no-op when the column is absent.
X = pd.get_dummies(X.drop(['SK_ID_CURR'], axis=1, errors='ignore'), dummy_na=True)
# Align to the columns the model was trained on: dummies missing from this
# frame become 0 and columns unseen in training are dropped, so predict_proba
# receives the same feature layout as fit.
X = X.reindex(columns=X_train.columns, fill_value=0)

In [84]:
# Per-class probabilities for the submission features (NaNs replaced with 0).
probs = gboost_optimized.predict_proba(X.fillna(value=0))

In [85]:
# Submission frame: the second probability column as TARGET, indexed like test_csv.
submission = pd.DataFrame({'TARGET': probs[:, 1]}, index=test_csv.index)

In [86]:
# Write the submission (index included) to the working directory.
submission.to_csv(path_or_buf='gboost-optimized-rev-1.csv')