In [148]:
import numpy as np
import os
import pandas as pd
import sys
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression, BayesianRidge, Lasso, HuberRegressor, ElasticNetCV
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(25)

In [149]:
# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [150]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,portfolio_id,desk_id,office_id,pf_category,start_date,sold,country_code,euribor_rate,currency,libor_rate,bought,creation_date,indicator_code,sell_date,type,hedge_value,status,return
0,PF00001002,DSK00001001,OFF00001002,B,20040720,110000000.0,T,0.02074,USD,2.332216,109809700.0,20040720,,20040812,B,,,0.02496
1,PF00001003,DSK00001002,OFF00001001,A,20040709,176671000.0,N,0.02074,GBP,5.269617,176008400.0,20040723,,20040812,C,,,0.05496
2,PF00001005,DSK00001004,OFF00001001,A,20040723,56474000.0,T,0.02074,USD,2.332216,56379530.0,20040723,,20040817,A,,,0.02496
3,PF00001006,DSK00001005,OFF00001001,A,20040609,164813000.0,T,0.02074,USD,2.332216,164508800.0,20040723,,20040713,A,,,0.02496
4,PF00001007,DSK00001005,OFF00001002,B,20040609,140800000.0,T,0.02074,USD,2.332216,140540200.0,20040723,,20040713,B,,,0.02496


In [151]:
# Count how many training rows carry each country code.
country_codes = train['country_code']
country_codes.value_counts()

M    5307
T    2955
N     562
Z     481
U      61
Name: country_code, dtype: int64

In [152]:
# Build the imputation values once, from the training data only, and apply the
# same values to both frames. Filling the test frame with its own column means
# would preprocess train and test differently and make the test-time features
# depend on statistics the model was never fitted on.
fill_values = {
    "indicator_code": True,
    "hedge_value": False,
    "status": True,
    "sold": train["sold"].mean(),
    "libor_rate": train["libor_rate"].mean(),
    "bought": train["bought"].mean(),
}
train = train.fillna(fill_values)
test = test.fillna(fill_values)

In [153]:
# Disabled experiment: log1p-transform the `sold` and `bought` columns of both frames.
# train['sold'] = np.log1p(train["sold"])
# train['bought'] = np.log1p(train["bought"])

# test['sold'] = np.log1p(test["sold"])
# test['bought'] = np.log1p(test["bought"])


In [154]:
# Label encoding: each categorical column is replaced by integer codes
# (this is NOT one-hot encoding -- no indicator columns are created).
from sklearn.preprocessing import LabelEncoder

# Columns that are converted to integer codes in both frames.
categorical_cols = [
    "office_id",
    "pf_category",
    "country_code",
    "currency",
    "indicator_code",
    "type",
    "hedge_value",
    "status",
]

lb_make = LabelEncoder()
for col in categorical_cols:
    # Fit one mapping per column on the values of BOTH frames. Calling
    # fit_transform separately on train and test can assign different integers
    # to the same category whenever the two frames do not contain exactly the
    # same set of values, which silently corrupts the test features.
    lb_make.fit(pd.concat([train[col], test[col]], ignore_index=True))
    train[col] = lb_make.transform(train[col])
    test[col] = lb_make.transform(test[col])


In [155]:
# Every training column except the ones listed here is used as a model input.
non_feature_cols = ('portfolio_id', 'desk_id', 'return', 'sell_date', 'creation_date', 'start_date')
feature_names = [col for col in train.columns if col not in non_feature_cols]

# Regression target.
target = train['return']

In [156]:
# Alternative estimator, currently disabled:
#model = ElasticNetCV(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10], l1_ratio=[.01, .1, .5, .9, .99], max_iter=5000)

# Hyperparameters of the gradient-boosting regressor used for the submission.
gbr_params = dict(
    loss='huber',
    n_estimators=7000,
    learning_rate=0.05,
    max_depth=3,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
)
model = GradientBoostingRegressor(**gbr_params)


In [157]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `cross_val_score` lives in `sklearn.model_selection`, the module this
# notebook already imports `train_test_split` from.
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, make_scorer, r2_score

# 5-fold cross-validated R^2 of the model on the training data (one score per fold).
print(cross_val_score(model, train[feature_names], target, cv=5, scoring=make_scorer(r2_score)))

[-1.18427093  0.97408497  0.96990871  0.36985722  0.7963112 ]


In [158]:
# Fit the model on all training rows using the selected feature columns.
model.fit(X=train[feature_names], y=target)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.05, loss='huber', max_depth=3,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=15,
             min_samples_split=10, min_weight_fraction_leaf=0.0,
             n_estimators=7000, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [159]:
## make prediction
# Select the same feature columns from the test frame that the model was fitted on.
test_features = test[feature_names]
pred = model.predict(test_features)

In [160]:
## make submission
# Pair each test portfolio id with its predicted return and write the result to CSV
# without the index column.
sub = pd.DataFrame(
    {'portfolio_id': test['portfolio_id'], 'return': pred},
    columns=['portfolio_id', 'return'],
)
sub.to_csv('annual_returns.csv', index=False)