In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import pickle

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk '/kaggle/input' recursively and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#############
# load data #
#############

# Names of the 300 feature columns: f_0 ... f_299.
features = [f'f_{idx}' for idx in range(300)]

# Column name -> dtype for every column that gets read: 16-bit integers for the
# two id columns, 16-bit floats for the target and for each feature.
data_types = {'time_id': 'int16', 'investment_id': 'int16', 'target': 'float16'}
data_types.update({name: 'float16' for name in features})

# Name of the label column.
target = 'target'
def load_data(file_name, data_types, nrows=None):
    """Read the columns named in ``data_types`` from a CSV file.

    Parameters
    ----------
    file_name : str, path object or file-like object
        CSV source, handed straight to ``pandas.read_csv``.
    data_types : dict
        Maps column name -> dtype. Only these columns are read, and each one
        is parsed with the dtype given here.
    nrows : int, optional
        Maximum number of data rows to read. ``None`` (the default) reads the
        whole file.

    Returns
    -------
    pandas.DataFrame
        The selected columns, with the first selected column used as the index
        (``index_col=0``).
    """
    return pd.read_csv(file_name, usecols=list(data_types), dtype=data_types,
                       index_col=0, nrows=nrows)
# Read at most the first 1.2 million rows of the training CSV.
train = load_data(
    '/kaggle/input/ubiquant-market-prediction/train.csv',
    data_types,
    nrows=1_200_000,
)

In [None]:
# time ids: the frame's index is treated as time_id here, so count its distinct
# values, then display the first five rows of every time_id group.
n_time_ids = train.index.nunique()
print("Unique time_id's: ", n_time_ids)
train.groupby(['time_id']).head()

In [None]:
###################
# sample analysis #
###################

import seaborn as sns
import matplotlib.pyplot as plt


# investment id grouped analysis: for every investment_id take the number of
# target values, their mean and their standard deviation, and draw one
# 100-bin histogram per statistic. The three Series are kept as globals because
# the jointplot cell that follows reads them.
target_by_investment = train.groupby(['investment_id'])['target']

targets = target_by_investment.count()
fig, ax = plt.subplots(1, 1, figsize=(12, 6))
targets.plot.hist(bins=100, ax=ax)
ax.set_title("frequency by investment_id distribution")
plt.show()

mean_target = target_by_investment.mean()
fig, ax = plt.subplots(1, 1, figsize=(12, 6))
mean_target.plot.hist(bins=100, ax=ax)
ax.set_title("mean target distribution")
plt.show()

std_target = target_by_investment.std()
fig, ax = plt.subplots(1, 1, figsize=(12, 6))
std_target.plot.hist(bins=100, ax=ax)
ax.set_title("standard deviation of target distribution")
plt.show()

In [None]:
# Joint plots (scatter + red regression line + marginal distributions) of the
# observation count per group against the group's mean target, then against the
# group's target standard deviation. Reads `targets`, `mean_target` and
# `std_target` as left by the grouped-statistics cell that ran just before.
ax = sns.jointplot(x=targets, y=mean_target, kind="reg",
                   height=8, joint_kws={'line_kws': {'color': 'red'}})
ax.ax_joint.set_xlabel('observations of investments')
ax.ax_joint.set_ylabel('mean target value')
plt.show()

qx = sns.jointplot(x=targets.values, y=std_target, kind="reg",
                   height=8, joint_kws={'line_kws': {'color': 'red'}})
# Label the second grid itself; writing to `ax` here would relabel the first
# (already shown) figure and leave this one without axis labels.
qx.ax_joint.set_xlabel('observations of investments')
qx.ax_joint.set_ylabel('std target value')
plt.show()

In [None]:
# Same analysis grouped by time_id; no obvious pattern emerges.
# NOTE: this rebinds `targets`, `mean_target` and `std_target`, which from here
# on hold per-time_id statistics (the next cell reads them).
target_by_time = train.groupby(['time_id'])['target']

targets = target_by_time.count()
fig, ax = plt.subplots(1, 1, figsize=(12, 6))
targets.plot.hist(bins=100, ax=ax)
ax.set_title("frequency by time_id distribution")
plt.show()

mean_target = target_by_time.mean()
fig, ax = plt.subplots(1, 1, figsize=(12, 6))
mean_target.plot.hist(bins=100, ax=ax)
ax.set_title("mean target distribution")
plt.show()

std_target = target_by_time.std()
fig, ax = plt.subplots(1, 1, figsize=(12, 6))
std_target.plot.hist(bins=100, ax=ax)
ax.set_title("standard deviation of target distribution")
plt.show()

In [None]:
# Joint plots (scatter + red regression line + marginal distributions) of the
# observation count per group against the group's mean target, then against the
# group's target standard deviation. Reads `targets`, `mean_target` and
# `std_target` as left by the grouped-statistics cell that ran just before.
ax = sns.jointplot(x=targets, y=mean_target, kind="reg",
                   height=8, joint_kws={'line_kws': {'color': 'red'}})
ax.ax_joint.set_xlabel('observations of time_id')
ax.ax_joint.set_ylabel('mean target value')
plt.show()

qx = sns.jointplot(x=targets.values, y=std_target, kind="reg",
                   height=8, joint_kws={'line_kws': {'color': 'red'}})
# Label the second grid itself (not `ax`), and name the quantity it actually
# shows: the y axis is the standard deviation, not the mean.
qx.ax_joint.set_xlabel('observations of time_id')
qx.ax_joint.set_ylabel('std target value')
plt.show()

In [None]:
# overall plot with time: mean and standard deviation of the target per time_id,
# each with a red horizontal line at its average over all time_ids.
# The statistics are computed here rather than read from `mean_target` /
# `std_target`, because other cells rebind those names to different groupings.
target_by_time = train.groupby('time_id')['target']
mean_by_time = target_by_time.mean()
std_by_time = target_by_time.std()

# One figure with two stacked panels sharing the time axis.
fig, (mean_ax, std_ax) = plt.subplots(2, 1, figsize=(12, 8), sharex=True)

mean_by_time.plot(ax=mean_ax)
mean_ax.axhline(y=mean_by_time.mean(), color='r', label="mean")
mean_ax.set_title("average target by time")
mean_ax.legend()  # without a legend the label on the reference line is never shown

std_by_time.plot(ax=std_ax)
std_ax.axhline(y=std_by_time.mean(), color='r', label="mean")
std_ax.set_title("std of target by time")
std_ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# histogram of unique feature values: count the distinct values in each feature
# column, then plot how those counts are distributed across the features
unique_value_counts = train[features].nunique()
unique_value_counts.hist()

In [None]:
# feature correlation with investment_id: Pearson correlation coefficient
# between the investment_id column and each feature column.
# A comprehension is used so that no module-level name `corr` is bound here;
# that name belongs to the corr() helper the model cells define and call.
invest_corrs = [
    np.corrcoef(train['investment_id'], train[col])[0][1]
    for col in features
]

# Bar chart of the 30 largest coefficients, largest at the top.
invest_importances = pd.Series(invest_corrs, index=features)
invest_importances.nlargest(30).plot(kind='barh', figsize=(12, 6)).invert_yaxis()
plt.show()

In [None]:
# feature correlation with target: Pearson correlation coefficient between the
# target column and each feature column.
# The column is read directly from `train`: rebinding and then deleting the
# module-level `target` would remove the label-column name defined when the
# data was loaded, and a loop variable called `corr` would clash with the
# corr() helper defined further down.
corrs = [
    np.corrcoef(train['target'], train[col])[0][1]
    for col in features
]

# Bar chart of all 300 coefficients, largest at the top.
feat_importances = pd.Series(corrs, index=features)
feat_importances.nlargest(300).plot(kind='barh', figsize=(12, 24)).invert_yaxis()
plt.show()

In [None]:
# Horizontal bar chart of the 30 features with the largest target correlation,
# with the largest value at the top.
top_features = feat_importances.nlargest(30)
bar_axes = top_features.plot(kind='barh', figsize=(12, 6))
bar_axes.invert_yaxis()
plt.show()

In [None]:
# feature correlation with investment id and percentage of overlaps:
# keep the features whose correlation with the target (resp. with investment_id)
# is positive, then compute the fraction of the target-selected features that
# also appear in the investment_id-selected set.
selected_features = feat_importances.loc[feat_importances > 0]
selected_features_invest = invest_importances.loc[invest_importances > 0]

overlap = sum(
    1
    for name in selected_features.keys()
    if name in selected_features_invest.keys()
)
percent_overlap = overlap / len(selected_features)
percent_overlap

In [None]:
from sklearn.model_selection import TimeSeriesSplit
import lightgbm
from lightgbm import LGBMRegressor

def corr(a, b, w):
    """Weighted Pearson correlation of ``a`` and ``b`` under weights ``w``.

    The weighted covariance of two arrays is the weight-normalised sum of the
    products of their deviations from their weighted means; the result is
    cov(a, b) divided by sqrt(cov(a, a) * cov(b, b)).
    """
    weight_total = np.sum(w)

    def weighted_cov(x, y):
        x_dev = x - np.average(x, weights=w)
        y_dev = y - np.average(y, weights=w)
        return np.sum(w * x_dev * y_dev) / weight_total

    numerator = weighted_cov(a, b)
    denominator = np.sqrt(weighted_cov(a, a) * weighted_cov(b, b))
    return numerator / denominator

def corr_metric(labels, preds):
    """Evaluation-metric callback: equally weighted correlation of labels and predictions.

    Returns the tuple ``('corr', value, True)``; the final ``True`` marks the
    metric as one where higher is better.
    """
    uniform_weights = np.ones(len(labels))
    score = corr(labels, preds, uniform_weights)
    return 'corr', score, True

def mse(a, b, w):
    """Weighted mean squared error between ``a`` and ``b``.

    Each squared difference is multiplied by its weight in ``w`` and the sum is
    divided by the total weight, mirroring how ``corr`` applies its weights.
    With all-ones weights this equals the plain mean of the squared differences.
    """
    return np.sum(w * (a - b) ** 2) / np.sum(w)


def mse_metric(labels, preds):
    """Evaluation-metric callback: equally weighted mean squared error.

    Returns the tuple ``('mse', value, False)``; the final ``False`` marks the
    metric as one where lower is better.
    """
    return 'mse', mse(labels, preds, np.ones(len(labels))), False

def time_split_cross_validation(train_data, features, target, select = True, infer = False,
                                n_splits = 10):
    """Train (or reload) one LightGBM regressor per time-series split.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Rows to split with ``TimeSeriesSplit``; split indices are applied
        positionally via ``.iloc``. Not used when ``infer`` is true.
    features : list-like
        Columns fed to the models.
    target : str
        Name of the label column.
    select : bool, default True
        Chooses the pickle file prefix: ``'select'`` when true, ``'all'``
        otherwise. The same prefix is used for saving and for reloading.
    infer : bool, default False
        When true, skip training and load the pickled fold models instead.
    n_splits : int, default 10
        Number of folds to train, or of fold files to load.

    Returns
    -------
    dict
        ``{fold: model}`` when ``infer`` is true.
    tuple of (dict, dict, dict)
        ``(models, corr_scores, l2)`` when ``infer`` is false: per fold, the
        fitted model, the Pearson correlation between validation targets and
        predictions, and the L2 norm of the validation residuals.
    """
    prefix = 'select' if select else 'all'

    if infer:
        models = dict()
        for fold in range(n_splits):
            model_path = f'/kaggle/working/{prefix}_lgbm_fold{fold}.pkl'
            # SECURITY: pickle.load can execute arbitrary code; only load files
            # that this notebook wrote itself.
            with open(model_path, 'rb') as model_file:
                models[fold] = pickle.load(model_file)
            print(f'loaded {model_path}')
        return models

    models = dict()
    corr_scores = dict()
    l2 = dict()

    tscv = TimeSeriesSplit(max_train_size=None, n_splits=n_splits)

    for fold, (train_index, test_index) in enumerate(tscv.split(train_data)):
        fold_train = train_data.iloc[train_index]
        fold_valid = train_data.iloc[test_index]

        # num_leaves, learning_rate and subsample are drawn at random for every
        # fold; seed numpy's global RNG beforehand to make a run repeatable.
        lgbm = LGBMRegressor(
            num_leaves=2 ** np.random.randint(3, 8),
            learning_rate = 10 ** (-np.random.uniform(0.1,2)),
            n_estimators = 2000,
            min_child_samples = 1000,
            subsample=np.random.uniform(0.5,1.0),
            subsample_freq=1
        )

        # NOTE(review): newer LightGBM releases reportedly no longer accept
        # `verbose` in fit() - confirm against the installed version.
        lgbm.fit(fold_train[features], fold_train[target]
                 , eval_set = (fold_valid[features], fold_valid[target])
                 , eval_metric = corr_metric, verbose= False)
        preds = lgbm.predict(fold_valid[features])

        models[fold] = lgbm
        # Saved relative to the current working directory; the context manager
        # closes the file even if pickling fails.
        with open(f'{prefix}_lgbm_fold{fold}.pkl', 'wb') as model_file:
            pickle.dump(lgbm, model_file)

        corr_scores[fold] = np.corrcoef(fold_valid[target], preds)[0][1]
        l2[fold] = np.linalg.norm(fold_valid[target] - preds)
        print('Finished a fold')
    return models, corr_scores, l2

In [None]:
# Fit the fold models on the first 90% of the rows, using only the columns in
# selected_features. To reload previously pickled fold models instead of
# retraining, make the same call with select=True, infer=True; in that mode
# only the models dict is returned.
cv_rows = train[:int(train.shape[0]*0.9)]
models, scores, loss = time_split_cross_validation(
    cv_rows, selected_features.index, target='target')

In [None]:
def eval_models(models, test_df, selected_features):
    """Average the predictions of all models and score them on ``test_df``.

    Every model in ``models`` (a dict; only its values are used) predicts from
    ``test_df[selected_features]`` and contributes with equal weight.

    Returns ``(preds, mse)``: the averaged predictions and their mean squared
    error against ``test_df['target']``.
    """
    n_models = len(models)
    n_rows = len(test_df)
    preds = np.zeros((n_rows))
    for fitted in models.values():
        preds += fitted.predict(test_df[selected_features]) / n_models
    squared_errors = (preds - test_df['target'].values) ** 2
    return preds, np.sum(squared_errors) / n_rows

# Hold out the last 10% of the rows and score the averaged fold models on them.
test_start_index = int(train.shape[0]*0.9)
# .copy() makes `test` an independent frame, so adding the 'pred' column below
# does not trigger SettingWithCopyWarning or depend on view-vs-copy semantics.
test = train.iloc[test_start_index:].copy()
pred, acc = eval_models(models, test, selected_features.index)
test['pred'] = pred
acc  # mean squared error of the averaged predictions (second value returned by eval_models)

In [None]:
# Hold-out targets as a plain list, one entry per prediction. Reading the column
# once yields the same values as indexing test.iloc[i]['target'] row by row, but
# avoids building a full row Series for every sample.
real_y = test['target'].iloc[:len(pred)].tolist()

In [None]:
# Overlay the first 500 predictions (red) and the matching real targets (green),
# save the figure as 9.png, then display it.
fig, ax = plt.subplots(1, 1, figsize=(18, 12))
ax.plot(pred[:500], color='r', label='predicted', linewidth=0.5)
ax.plot(real_y[:500], color='g', label='real', linewidth=0.5)
ax.legend()
ax.set_ylabel('target')
ax.set_xlabel('testing sample index')
fig.savefig('9.png')
plt.show()

In [None]:
# Second experiment: work on a copy of the training frame and fit the fold
# models on its first 90% of rows with all features; select=False makes the
# models get pickled under the 'all_' file prefix.
train1 = train.copy()
cv_rows1 = train1[:int(train1.shape[0]*0.9)]
models1, scores1, loss1 = time_split_cross_validation(
    cv_rows1, features, target='target', select=False)

In [None]:
# Hold out the last 10% of the rows and score the averaged all-feature fold
# models on them.
test_start_index1 = int(train1.shape[0]*0.9)
# .copy() makes `test1` an independent frame, so adding the 'pred' column below
# does not trigger SettingWithCopyWarning or depend on view-vs-copy semantics.
test1 = train1.iloc[test_start_index1:].copy()
pred1, acc1 = eval_models(models1, test1, features)
test1['pred'] = pred1
acc1  # mean squared error of the averaged predictions (second value returned by eval_models)

In [None]:
# All-feature experiment: overlay the first 500 predictions (red) and the
# matching real targets (green) of the hold-out rows.
# The real targets are taken from `test1`, the frame `pred1` was computed on,
# rather than from a list built by another experiment's cell.
real_y1 = test1['target'].iloc[:500].astype(float).to_numpy()

fig, ax = plt.subplots(1, 1, figsize=(18, 12))
ax.plot(pred1[:500], color='r', label='predicted', linewidth=0.5)
ax.plot(real_y1, color='g', label='real', linewidth=0.5)
ax.legend()
ax.set_ylabel('target')
ax.set_xlabel('testing sample index')
plt.show()

In [None]:
# Per-investment_id statistics of the predictions on the hold-out rows: number
# of predictions, their mean and their standard deviation, each drawn as a
# 60-bin histogram. NOTE: this rebinds `targets`, `mean_target` and
# `std_target`, which the next cell reads.
pred_by_investment = test1.groupby(['investment_id'])['pred']

targets = pred_by_investment.count()
fig, ax = plt.subplots(1, 1, figsize=(12, 6))
targets.plot.hist(bins=60, ax=ax)
ax.set_title("frequency by investment_id distribution")
plt.show()

mean_target = pred_by_investment.mean()
fig, ax = plt.subplots(1, 1, figsize=(12, 6))
mean_target.plot.hist(bins=60, ax=ax)
ax.set_title("mean target distribution")
plt.show()

# Standard deviation of the predictions within each investment_id
std_target = pred_by_investment.std()
fig, ax = plt.subplots(1, 1, figsize=(12, 6))
std_target.plot.hist(bins=60, ax=ax)
ax.set_title("standard deviation of target distribution")
plt.show()

In [None]:
# Joint plots (scatter + red regression line + marginal distributions) of the
# observation count per group against the group's mean, then against the
# group's standard deviation. Reads `targets`, `mean_target` and `std_target`
# as left by the grouped-statistics cell that ran just before.
ax = sns.jointplot(x=targets, y=mean_target, kind="reg",
                   height=8, joint_kws={'line_kws': {'color': 'red'}})
ax.ax_joint.set_xlabel('observations')
ax.ax_joint.set_ylabel('mean target')
plt.show()

qx = sns.jointplot(x=targets.values, y=std_target, kind="reg",
                   height=8, joint_kws={'line_kws': {'color': 'red'}})
# Label the second grid itself; writing to `ax` here would relabel the first
# (already shown) figure and leave this one without axis labels.
qx.ax_joint.set_xlabel('observations')
qx.ax_joint.set_ylabel('std target')
plt.show()

In [None]:
import ubiquant

env = ubiquant.make_env()  # initialize the environment
iter_test = env.iter_test()  # an iterator which loops over the test set and sample submission

# NOTE(review): this cell previously called apply_picked(models[picked], ...),
# but neither `apply_picked` nor `picked` is defined anywhere in the notebook,
# so it failed on a fresh kernel. It now submits the equal-weight average of the
# fold models in `models` - confirm this is the intended submission model.
# Those models were fitted on the columns in selected_features, so they are
# given exactly those columns here (not the full `features` list).
submit_features = selected_features.index

for (test_df, sample_prediction_df) in iter_test:
    fold_preds = [model.predict(test_df[submit_features]) for model in models.values()]
    sample_prediction_df['target'] = np.mean(fold_preds, axis=0)  # average over the fold models
    env.predict(sample_prediction_df)   # register your predictions