In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
#from mlxtend.classifier import StackingClassifier

  from numpy.core.umath_tests import inner1d


In [2]:
df_train = pd.read_csv('train.csv', index_col=0)
df_test = pd.read_csv('test.csv', index_col=0)

In [3]:
# replace missing values with median (less sensitive to outliers)
df_train['opened_position_qty '].fillna(df_train['opened_position_qty '].median(),inplace=True)
df_test['opened_position_qty '].fillna(df_train['opened_position_qty '].median(),inplace=True)
df_train['closed_position_qty'].fillna(df_train['closed_position_qty'].median(),inplace=True)
df_test['closed_position_qty'].fillna(df_train['closed_position_qty'].median(), inplace=True)

In [4]:
# Normalize data
from sklearn import preprocessing

x_train = df_train[df_train.columns[:26]]
y_train = df_train['y']
x_test = df_test

# Normalize training data by subtracting mean and scaling to unit variance
std_scale = preprocessing.StandardScaler().fit(x_train)
x_train_norm = std_scale.transform(x_train)
x_train = pd.DataFrame(x_train_norm, index=x_train.index, columns=x_train.columns)

# Normalize testing data by using mean and SD of training set
x_test_norm = std_scale.transform(x_test)
x_test = pd.DataFrame(x_test_norm, index=x_test.index, columns=x_test.columns) 

In [5]:
# Split training and validation data 
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2)

In [7]:
model1 = DecisionTreeClassifier()
model2 = KNeighborsClassifier()
model3= LogisticRegression()

model1.fit(x_train,y_train)
model2.fit(x_train,y_train)
model3.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [8]:
print(model1.score(x_val,y_val))
print(model2.score(x_val,y_val))

0.581189439211317
0.6232148283196597


In [9]:
print(model3.score(x_val,y_val))

0.6487136635267903


In [22]:
from sklearn.ensemble import BaggingClassifier

boosting = AdaBoostClassifier(base_estimator=model3, n_estimators=64)
boosting.fit(x_train, y_train)
print(boosting.score(x_train, y_train))
print(boosting.score(x_val, y_val))

0.6435944832708734
0.6450842364698336


In [23]:
from sklearn.metrics import roc_curve, auc

y_pred = boosting.predict(x_val)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_val, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print('val auc: {}'.format(roc_auc))

val auc: 0.5104467025605821


In [12]:
boosting2 = AdaBoostClassifier(base_estimator=model1, n_estimators=10)   
boosting2.fit(x_train, y_train)
print(boosting2.score(x_train, y_train))
print(boosting2.score(x_val, y_val))

0.9964655288834869
0.5966946892197575


In [14]:
y_pred = boosting2.predict(x_val)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_val, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print('val auc: {}'.format(roc_auc))

val auc: 0.5453643529772931


In [6]:
model4 = RandomForestClassifier(n_estimators=64)

model4.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=64, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [7]:
from sklearn.externals import joblib
filename = 'random_forest.joblib.pkl'
_ = joblib.dump(model4, filename, compress=9)

In [None]:
boosting3 = AdaBoostClassifier(base_estimator=model4, n_estimators=30)   
boosting3.fit(x_train, y_train)
print(boosting3.score(x_train, y_train))
print(boosting3.score(x_val, y_val))

In [None]:
y_pred = boosting3.predict(x_val)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_val, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print('val auc: {}'.format(roc_auc))

In [None]:


filename = 'random_forest_boosted.joblib.pkl'
_ = joblib.dump(boosting3, filename, compress=9)
#clf2 = joblib.load(filename)

In [27]:
df_test['Predicted'] = boosting3.predict_proba(x_test)[:,1]
df_test[['Predicted']].to_csv('submission_model7.csv')

In [None]:
boosting4 = AdaBoostClassifier(base_estimator=boosting3, n_estimators=10)   
boosting4.fit(x_train, y_train)
print(boosting4.score(x_train, y_train))
print(boosting4.score(x_val, y_val))