In [None]:
import numpy as np
import pandas as pd
import gc

In [None]:
from kaggle.competitions import twosigmanews
# You can only call make_env() once, so don't lose it!
# `env` is reused later in this notebook for get_training_data(),
# get_prediction_days(), predict() and write_submission_file().
env = twosigmanews.make_env()

In [None]:
# Fetch the training data; the call returns a pair that is unpacked into the
# market frame and the news frame used by the processing functions below.
(market_train_df, news_train_df) = env.get_training_data()

## Functions for data processing.

In [None]:
### Process market data.
def market_process(market_train_df):
    """Add price-derived features to the market frame and downcast it to float32.

    The frame passed in is modified in place: ``time`` is replaced by its
    calendar date, the columns ``bartrend`` (close / open), ``average``
    (mid-point of open and close) and ``pricevolume`` (volume * close) are
    added, and ``assetName`` is removed.

    Returns a new frame (the result of ``astype``) in which every column except
    ``assetCode`` and ``time`` is float32.
    """
    frame = market_train_df
    open_px = frame['open']
    close_px = frame['close']

    frame['time'] = frame['time'].dt.date
    frame['bartrend'] = close_px / open_px
    frame['average'] = (close_px + open_px) / 2
    frame['pricevolume'] = frame['volume'] * close_px

    # Rows containing NaNs are deliberately kept here (open question in the
    # original: "drop nans or not?").
    del frame['assetName']

    # float32 halves the memory footprint of the numeric columns.
    untouched = ('assetCode', 'time')
    dtype_map = dict.fromkeys(
        [name for name in frame.columns if name not in untouched], 'float32')

    return frame.astype(dtype_map)

### process news data.
def news_process(news_train_df):
    """Derive news features, drop unused columns and clean ``assetCodes``.

    Works in place on ``news_train_df`` and returns that same frame:
    ``time`` becomes a calendar date, ``position`` is
    firstMentionSentence / sentenceCount, ``coverage`` is
    sentimentWordCount / wordCount, and ``assetCodes`` loses its surrounding
    braces and single quotes (leaving a ", "-separated string).
    """
    unused_columns = [
        'sourceTimestamp', 'firstCreated', 'sourceId', 'headline', 'takeSequence',
        'provider', 'firstMentionSentence', 'headlineTag', 'marketCommentary',
        'subjects', 'audiences', 'assetName', 'noveltyCount12H', 'noveltyCount24H',
        'noveltyCount3D', 'noveltyCount5D', 'noveltyCount7D', 'urgency',
        'sentimentClass',
    ]

    def _strip_braces_and_quotes(raw_codes):
        # "{'A.N', 'A.P'}" -> "A.N, A.P"
        return raw_codes[1:-1].replace("'", "")

    news_train_df['time'] = news_train_df['time'].dt.date
    news_train_df['position'] = (
        news_train_df['firstMentionSentence'].div(news_train_df['sentenceCount']))
    news_train_df['coverage'] = (
        news_train_df['sentimentWordCount'].div(news_train_df['wordCount']))

    # firstMentionSentence is only needed for `position`, so it goes too.
    news_train_df.drop(columns=unused_columns, inplace=True)

    news_train_df['assetCodes'] = news_train_df['assetCodes'].map(_strip_braces_and_quotes)
    return news_train_df

## Unstack assetCodes.
def unstack_asset_codes(news_train_df):
    """Expand the ", "-separated ``assetCodes`` column to one row per asset code.

    Parameters
    ----------
    news_train_df : DataFrame
        Must contain a string column ``assetCodes`` (already stripped of braces
        and quotes) and have an index whose labels are convertible with ``int``.

    Returns
    -------
    DataFrame
        Columns ``news_index`` (int64 label of the originating news row) and
        ``assetCode`` (one code per row), in the order the codes appear.
    """
    codes = []
    indexes = []
    # Series.items() is used instead of Series.iteritems(), which was removed
    # in pandas 2.0.
    for i, values in news_train_df['assetCodes'].items():
        explode = values.split(", ")
        codes.extend(explode)
        indexes.extend([int(i)] * len(explode))
    index_df = pd.DataFrame({'news_index': indexes, 'assetCode': codes})
    # Pin the key dtype so that an empty news frame still yields an integer key
    # that can be merged against the news index.
    index_df = index_df.astype({'news_index': 'int64'})
    del codes, indexes
    gc.collect()
    return index_df

## Merge news on index
def merge_news_on_index(news_train_df, index_df):
    """Attach the news columns to every (news row, asset code) pair.

    A ``news_index`` column holding the frame's own index is written onto
    ``news_train_df`` (in place) and used as the join key against
    ``index_df``. The result has one row per entry of ``index_df`` with the
    helper key and the raw ``assetCodes`` string removed.
    """
    join_key = 'news_index'
    news_train_df[join_key] = news_train_df.index.copy()

    # Left join keeps exactly the rows (and order) of the unstacked codes.
    merged = index_df.merge(news_train_df, on=join_key, how='left')
    return merged.drop(columns=[join_key, 'assetCodes'])

## Combine multiple news reports for the same asset on the same day.
def group_news(news_frame):
    """Average all news columns per (assetCode, time) pair.

    Each aggregated column is renamed ``<column>_<statistic>`` (for example
    ``relevance_mean``), the group keys are returned as ordinary columns, and
    every non-key column is cast to float32.
    """
    statistics = ['mean']
    group_keys = ['assetCode', 'time']

    grouped = news_frame.groupby(group_keys).agg(statistics)
    # Flatten the (column, statistic) MultiIndex into single-level names.
    grouped.columns = pd.Index(
        [f"{feature}_{stat}" for feature, stat in grouped.columns.tolist()])
    grouped = grouped.reset_index()

    dtype_map = {name: 'float32' for name in grouped.columns if name not in group_keys}
    return grouped.astype(dtype_map)

### Merge market and news data
def merge(market_train_df,news_agg_df):
    """Left-join the aggregated news onto the market rows by (time, assetCode).

    Every market row is kept; rows without matching news get NaN in the news
    columns. NaN rows are not dropped here (open question in the original:
    "drop nans or not?").
    """
    join_keys = ['time', 'assetCode']
    return market_train_df.merge(news_agg_df, on=join_keys, how='left')


## Data processing.

In [None]:
# Run the full preprocessing pipeline on the training data.
# NOTE: this cell is not re-runnable on its own. market_process() replaces
# `time` with dates and drops `assetName`, and news_train_df is deleted below,
# so the raw frames must be fetched again before running it a second time.

## Market
market_train_df = market_process(market_train_df)
gc.collect()
#market_train_df.shape # (4072956, 19) dropnans(3979902, 15)

## News
# Clean -> one row per (news item, asset code) -> mean per (assetCode, time).
# Intermediates are deleted as soon as the next stage has consumed them.
news_train_df = news_process(news_train_df)
index_df = unstack_asset_codes(news_train_df)
news_unstack_df = merge_news_on_index(news_train_df, index_df) #news_unstack_df.shape #(18821885, 23)
del news_train_df, index_df
news_agg_df = group_news(news_unstack_df) #news_agg_df.shape #((3839367, 23))
del news_unstack_df

## Merge
# Left join on (time, assetCode): market rows without news keep NaN news columns.
df = merge(market_train_df,news_agg_df) #df.shape # (4072956, 35) dropnans(1121521, 36)
gc.collect()
df.head(3)

## Prepare for training.

In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0)
# df.shape #(1121521, __)

# Pull out the columns that are needed later but are not model inputs.
dates = df['time']
num_target = df['returnsOpenNextMktres10'].astype('float32')
# Binary label: 1 when the 10-day market-residualised return is non-negative.
bin_target = (df['returnsOpenNextMktres10'] >= 0).astype('int8')
universe = df['universe'].astype('int8')

# From here on `df` holds the feature columns only.
non_feature_columns = ['returnsOpenNextMktres10', 'universe', 'assetCode', 'time']
df = df.drop(columns=non_feature_columns)
gc.collect()
df.head(3)
# df.shape #(1121521, 30)

In [None]:
from sklearn import *
from lightgbm import LGBMClassifier
import time
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

Split into training and test sets (3:1).

In [None]:
# Randomly split the row labels of df 75% / 25% with a fixed seed. The two label
# arrays are used with .loc on df and bin_target in the cells that follow.
# NOTE(review): the split is over individual rows and does not look at `dates`,
# so rows from the same day can land on both sides -- confirm this is intended,
# or consider a time-based split.
train_index, test_index = model_selection.train_test_split(df.index.values, test_size=0.25, 
                                                           random_state = 11)

Tune the hyperparameters using a validation set.

In [None]:
# train_index2, val_index = model_selection.train_test_split(train_index, test_size=0.25, 
#                                                            random_state = 11)

In [None]:
# def evaluate_model(df, target, train_index, val_index, params):
#     model = LGBMClassifier(objective ='binary',
#                            boosting ='gbdt', #dart
#                            n_jobs = 4,
#                            max_depth = 8,
#                            num_iterations = 200,
#                            learning_rate = 0.05,
#                            **params)
#     model.fit(df.loc[train_index], target.loc[train_index])
#     return metrics.log_loss(target.loc[val_index], 
#                             model.predict_proba(df.loc[val_index]))

In [None]:
# param_grid = {
#     'num_leaves': [60, 80],
#     'n_estimators': [200, 400], #default class*iteration=2*100
#     'bagging_freq': [5],  # must be a list: np.random.choice(5) would sample from range(5)
#     'bagging_fraction' : [0.8, 0.9],  # subsample
#     'feature_fraction' : [0.8, 0.9]  # colsample_bytree
#     #'reg_alpha': [0.2, 0.6, 0.8],
#     #'reg_lambda': [0.4, 0.6, 0.8]
# }

# print('Tuning begins...')
# best_eval_score = 0
# for i in range(50):
#     params = {k: np.random.choice(v) for k, v in param_grid.items()}
#     score = evaluate_model(df, bin_target, train_index2, val_index, params)
#     if score < best_eval_score or best_eval_score == 0:
#         best_eval_score = score
#         best_params = params
# print("Best evaluation logloss", best_eval_score)

In [None]:
## best parameters found.
lgb = LGBMClassifier(
    objective='binary',
    boosting='gbdt',
    learning_rate = 0.05,
    max_depth = 8,
    num_leaves = 80,
    n_estimators = 400,
    bagging_fraction = 0.8,
    feature_fraction = 0.9)
    #reg_alpha = 0.2,
    #reg_lambda = 0.4)

In [None]:
# Train on the training rows and report the wall-clock time taken.
t = time.time()
print('Fitting Up')
lgb.fit(df.loc[train_index], bin_target.loc[train_index])
# A single completion message (the original printed 'Done' twice).
print(f'Done, time = {time.time() - t}')

In [None]:
# Evaluate on the held-out rows. The test slice is built once and reused for
# both metrics instead of being re-indexed for every call.
test_features = df.loc[test_index]
test_labels = bin_target.loc[test_index]

# scikit-learn metrics take (y_true, y_pred); accuracy is symmetric, but keeping
# the documented order avoids mistakes if another metric is swapped in.
print("lgb accuracy : %f" %
      accuracy_score(test_labels, lgb.predict(test_features)))
# AUC is computed from the predicted probability of class 1.
print("lgb AUC : %f" %
      roc_auc_score(test_labels.values,
                    lgb.predict_proba(test_features)[:, 1]))

In [None]:
import matplotlib.pyplot as plt
from mlxtend.evaluate import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
import seaborn as sns
%matplotlib inline

import matplotlib as mpl
# Larger default fonts for titles, axis labels and tick labels on every figure.
mpl.rcParams.update({
    'axes.titlesize': 20,
    'axes.labelsize': 16,
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
})

Prediction confidence distribution.

In [None]:
#sample5000 = np.random.choice(test_index, size=5000)
#plt.hist(num_target.loc[sample5000].values.clip(-1,1), bins='auto', alpha=0.3)

# Map the predicted probability of class 1 onto a symmetric scale (p * 2 - 1),
# the same transformation that is applied to the competition predictions.
test_confidence = lgb.predict_proba(df.loc[test_index])[:, 1] * 2 - 1

fig, ax = plt.subplots()
ax.hist(test_confidence, bins='auto', alpha=0.3, color='darkorange')
#ax.legend(['Ground truth', 'Predicted'])
ax.set_xlabel("Confidence")
ax.set_ylabel("Count")
ax.set_title("predicted confidence")
plt.show()

Confusion matrix.

In [None]:

# `confusion_matrix` here is the mlxtend version (it takes y_target / y_predicted),
# which was imported after, and therefore shadows, the scikit-learn one.
actual_labels = np.array(bin_target.loc[test_index])
predicted_labels = lgb.predict(df.loc[test_index]).tolist()

cfm = confusion_matrix(y_target=actual_labels, y_predicted=predicted_labels)
fig, ax = plot_confusion_matrix(conf_mat=cfm)
plt.show()


The feature importance in lgb (default) is measured by the number of times the feature is used in the model.

In [None]:
# One row per feature with its importance score, most important first.
feat_importance = pd.DataFrame({
    "feature": df.columns.values,
    "value": lgb.feature_importances_,
}).sort_values(by='value', ascending=False)

# Horizontal bars: features on the y axis, importance on the x axis.
plt.figure(figsize=(8, 10))
ax = sns.barplot(data=feat_importance, x="value", y="feature")

## Competition prediction

In [None]:
# You can only iterate through a result from `get_prediction_days()` once
# so be careful not to lose it once you start iterating.
# `days` is consumed by the prediction loop in the next cell, which unpacks each
# item into (market observations, news observations, predictions template).
days = env.get_prediction_days()

In [None]:
# Stays 0 if `days` yields nothing; otherwise ends up as the number of days seen.
n_days = 0

for n_days, (market_obs_df, news_obs_df, predictions_template_df) in enumerate(days, start=1):
    print(n_days, end=' ')

    # Same preprocessing as for the training data: market features ...
    market_obs_df = market_process(market_obs_df)

    # ... and news cleaned, unstacked per asset code and averaged per asset/day.
    news_obs_df = news_process(news_obs_df)
    index_df = unstack_asset_codes(news_obs_df)
    news_unstack = merge_news_on_index(news_obs_df, index_df)
    news_obs_agg = group_news(news_unstack)

    # Join news onto market rows, then free the intermediates.
    obs_df = merge(market_obs_df, news_obs_agg)
    del market_obs_df, news_obs_agg, news_obs_df, news_unstack, index_df
    gc.collect()

    # Only assets listed in the template need a prediction.
    obs_df = obs_df[obs_df.assetCode.isin(predictions_template_df.assetCode)]

    # Everything except the identifying columns is a model input.
    non_features = ['assetCode', 'time']
    feats = [c for c in obs_df.columns if c not in non_features]

    #t = time.time()
    # Probability of class 1 mapped onto a symmetric scale (p * 2 - 1).
    preds = lgb.predict_proba(obs_df[feats])[:, 1] * 2 - 1
    sub = pd.DataFrame({'assetCode': obs_df['assetCode'], 'confidence': preds})

    # Replace the template's confidenceValue with ours; assets without a
    # prediction end up with 0.
    predictions_template_df = (
        predictions_template_df
        .merge(sub, how='left')
        .drop('confidenceValue', axis=1)
        .fillna(0)
        .rename(columns={'confidence': 'confidenceValue'})
    )

    env.predict(predictions_template_df)
    del obs_df, predictions_template_df, preds, sub
    gc.collect()


In [None]:
# Ask the environment to write the submission from the predictions passed to
# env.predict() above -- presumably 'submission.csv' in the working directory,
# since that file is read back later in this notebook; confirm.
env.write_submission_file()

In [None]:
# We've got a submission file!
# Show every entry of the working directory whose name contains '.csv'.
import os
csv_files = [entry for entry in os.listdir('.') if '.csv' in entry]
print(csv_files)

## Competition results visualization

In [None]:
# Read the submission file back in to inspect the predictions; the histogram in
# the next cell uses its confidenceValue column.
df_competition  = pd.read_csv('submission.csv')
df_competition.head(3)

In [None]:
# Distribution of the confidence values written to the submission file.
fig, ax = plt.subplots()
ax.hist(df_competition['confidenceValue'], bins='auto', alpha=0.3, color='green')
#ax.legend(['Ground truth', 'Predicted'])
ax.set_xlabel("Confidence")
ax.set_ylabel("Count")
ax.set_title("predicted confidence for scoring data")
plt.show()

## Restart the Kernel to run your code again
In order to combat cheating, you are only allowed to call `make_env` or iterate through `get_prediction_days` once per Kernel run.  However, while you're iterating on your model it's reasonable to try something out, change the model a bit, and try it again.  Unfortunately, if you try to simply re-run the code, or even refresh the browser page, you'll still be running on the same Kernel execution session you had been running before, and the `twosigmanews` module will still throw errors.  To get around this, you need to explicitly restart your Kernel execution session, which you can do by pressing the Restart button in the Kernel Editor's bottom Console tab:
![Restart button](https://i.imgur.com/hudu8jF.png)