In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the 'train' table out of the HDF5 store.  The store is opened with
# mode='r' because it is only read here: the default mode ('a') opens the file
# writable and silently creates an empty one when the path is wrong.
# The handle is bound to `store` rather than `train`, since `train` is used
# for a DataFrame further down the notebook.
with pd.HDFStore('train.h5', mode='r') as store:
    df = store.get('train')

In [None]:
# Show the dimensions of the loaded frame as a (rows, columns) tuple.
frame_shape = df.shape
print('Shape : {}'.format(frame_shape))

In [None]:
# Preview the first five rows (the default for `head`) to check the columns.
df.head(n=5)

In [None]:
# Count the distinct asset ids and the distinct timestamps in the full frame.
n_assets = len(df['id'].unique())
n_timesteps = len(df['timestamp'].unique())
print('tracking {} assets over {} timesteps'.format(n_assets, n_timesteps))

In [None]:
# Split the frame into one view per feature family, keeping `id` in each so
# rows can still be grouped by asset.  `DataFrame.filter(regex=...)` matches
# with `re.search`, so the patterns are anchored: without the anchors any
# column whose name merely contains "id" would be pulled into every view.
fundamental_df = df.filter(regex=r"^(id|fundamental_.*)$")
# (sic) the misspelled name is kept so any cell referring to it keeps working.
techincal_df = df.filter(regex=r"^(id|technical_.*)$")
derived_df = df.filter(regex=r"^(id|derived_.*)$")

# Call-style print with a single argument behaves the same on Python 2 and 3;
# the statement form used previously is a syntax error on Python 3.
for frame in (fundamental_df, techincal_df, derived_df, df):
    print(frame.shape)

In [None]:
# Empty frame (no rows) with every column of `df` except the two key columns.
feature_columns = df.columns.drop(['id', 'timestamp'])
na_df = pd.DataFrame(data=None, columns=feature_columns)

In [None]:
# Display the (empty) frame to confirm its column layout.
na_df.head(n=5)

In [None]:
# For every asset, report where the missing values of each derived feature sit.
# The printed ratio is (position of the last NaN) / (NaN count - 1); with two
# or more NaNs it equals 1.0 exactly when they occupy positions 0..k-1 of the
# asset's rows, i.e. when all of them form one leading run.
for name, group in derived_df.groupby('id'):
    print("id = {}".format(name))
    for c in group.drop('id', axis=1).columns:
        # `Series.nonzero()` has been removed from pandas; take the positions
        # of the True entries from the underlying boolean array instead.
        nan_ind = np.flatnonzero(pd.isnull(group[c]).values)
        if not len(nan_ind):
            continue
        if len(nan_ind) == 1:
            # A single NaN makes the denominator zero; report the ratio as
            # undefined instead of dividing by zero.
            ratio = float('nan')
        else:
            ratio = nan_ind[-1] * 1.0 / (len(nan_ind) - 1)
        # Formatting first keeps the output "<column> <ratio>" on both
        # Python 2 and Python 3.
        print('{} {}'.format(c, ratio))

In [None]:
# Per-timestamp summary of `y`: mean, standard deviation and row count.
# 'mean' / 'std' are passed by name: pandas currently swaps np.mean / np.std
# for its own groupby implementations and has deprecated passing the
# callables, so naming them pins today's behavior.  `len` stays a callable so
# the count column keeps the label 'len'.
# After reset_index the columns form a two-level index, e.g. ('y', 'mean').
market_df = (
    df[['y', 'timestamp']]
    .groupby('timestamp')
    .agg(['mean', 'std', len])
    .reset_index()
)
market_df.head()

In [None]:
# One entry per timestamp: the timestamp itself and the mean of `y` over the
# rows sharing it, which is treated as a simple return below.
t = market_df['timestamp']
simple_return = market_df[('y', 'mean')]
# Running sum of log returns.  log1p(r) evaluates log(1 + r) without the
# rounding loss that forming 1 + r introduces when r is very small.
cum_sum = np.log1p(simple_return).cumsum()

In [None]:
# Line plot of the cumulative series against timestamp on a fresh figure.
fig, ax = plt.subplots()
ax.plot(t, cum_sum)
ax.set_xlabel('timestamp')
ax.set_ylabel('portfolio value')

In [None]:
# Time-based split at timestamp 1000: rows before it form the training set,
# rows at or after it form the test set.  Missing values in both parts are
# replaced with 0.
train = df[df['timestamp'] < 1000].fillna(0)
test = df[df['timestamp'] >= 1000].fillna(0)

In [None]:
from sklearn.decomposition import FactorAnalysis

# Fit a 10-component factor model on the training features, i.e. every column
# except the target `y` and the two key columns.
fa = FactorAnalysis(n_components=10)
fa.fit(train.drop(labels=['y', 'id', 'timestamp'], axis=1))

In [None]:
# Feature matrices for both splits: the same three non-feature columns are
# dropped from each part.
h_train, h_test = (
    part.drop(['y', 'id', 'timestamp'], axis=1) for part in (train, test)
)

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression of the target on the training features (alpha = 0.05).
r = Ridge(alpha=0.05)
r.fit(X=h_train, y=train['y'])

In [None]:
# Model score on the training split, then on the held-out split.
# Call-style print: the statement form is a syntax error on Python 3.
print(r.score(h_train, train['y']))
print(r.score(h_test, test['y']))

In [None]:
# Eyeball the first 20 predictions next to the first 20 actual targets.
# `.iloc` states explicitly that the slices are positional, and call-style
# print keeps the cell valid on Python 3.
print(r.predict(h_train.iloc[:20]))
print(train['y'].iloc[:20])

In [None]:
# Compare the frame's shape with the shape of its within-timestamp shift.
# Call-style print: the statement form is a syntax error on Python 3.
print(df.shape)
print(df.groupby('timestamp').shift().shape)

In [None]:
# Narrow frame holding only the two key columns and the target.
assets_df = df.loc[:, ['timestamp', 'id', 'y']]
assets_df.head(n=5)

In [None]:
# Scatter one asset's target against the market-wide mean of `y` at the same
# timestamps.
my_id = 12
asset_rows = assets_df[assets_df['id'] == my_id]
stamps = asset_rows['timestamp']
print(len(stamps))

# Look the market mean up by timestamp for each asset row, so that x[i] and
# y[i] always belong to the same timestamp.  Filtering `market_df` with
# `isin` paired the two series only through their implicit row order (and
# produced mismatched lengths if a timestamp repeated for the asset).
market_mean = pd.Series(market_df[('y', 'mean')].values,
                        index=market_df['timestamp'].values)
x = market_mean.reindex(stamps.values)
y = asset_rows['y']
plt.plot(x.values, y.values, '.')

In [None]:
# Hierarchically clustered heat map of the pairwise (Pearson) column
# correlations of the full frame.
sns.clustermap(data=df.corr(method='pearson'))

In [None]:
# Number of rows recorded for each asset id ...
rows_per_asset = df.groupby('id').size()
# ... and the most frequent of those row counts (top five).
rows_per_asset.value_counts().head()

In [None]:
# Pairwise Pearson correlations between the columns of `assets_df`.
assets_df.corr(method='pearson')

In [None]:
# Pairwise view of the per-timestamp statistics: histograms on the diagonal,
# scatter plots elsewhere.  The 'mean' / 'std' / 'len' columns sit under the
# 'y' level of `market_df`; `assets_df` only has timestamp / id / y, so
# building the grid from it fails because none of the requested vars exist.
g = sns.PairGrid(market_df['y'], vars=["mean", "std", "len"])
g = g.map_diag(plt.hist)
g = g.map_offdiag(plt.scatter)