In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import logging
logging.basicConfig(format='%(asctime)s %(levelname)-10s %(message)s',
                    datefmt='%H:%M:%S',
                    level=logging.DEBUG)
%matplotlib inline

In [None]:
# Load the 'train' table from the HDF5 store into memory.
# The store is opened read-only: HDFStore's default mode is append ('a'),
# which would silently create an empty 'train.h5' if the file were missing
# and would allow accidental writes to the source data.
with pd.HDFStore('train.h5', mode='r') as train:
    df = train.get('train')

In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

def recurrent_linear_fit(df,col,quantile=0.99,min_train_fraction=0.9):
    """Fit ``df.y`` on a single column, repeatedly trimming the worst residuals.

    Starting from every row where ``col`` is not null, a
    ``Ridge(alpha=0.1, fit_intercept=False)`` model is fitted and scored with
    R^2 on the same rows it was trained on. While the score keeps improving,
    the rows whose absolute residual is at or above the ``quantile`` quantile
    of the residuals are dropped and the fit is repeated on what remains.

    The loop stops when either
      * the new score is not strictly greater than the best one so far, or
      * the trimmed row set would be smaller than ``min_train_fraction`` of
        the initial non-null row count (the trimmed set is then never fitted).

    Parameters
    ----------
    df : DataFrame holding the feature column ``col`` and a target column ``y``.
    col : label of the single feature column to fit on.
    quantile : residual quantile used as the (exclusive) trimming threshold.
    min_train_fraction : smallest allowed size of the training set, as a
        fraction of the initial number of non-null rows.

    Returns
    -------
    (best_model, best_score) : the fitted model with the highest score and
        that score. If the very first fit does not score above the initial
        sentinel of -1000, ``(None, -1000)`` is returned.

    Raises
    ------
    ValueError : if ``col`` has no non-null rows to fit on.
    """
    logging.info("recurrent linear fit to '%s'",col)
    train_index = df.loc[:,[col]].dropna().index
    start_train_size = len(train_index)
    if start_train_size == 0:
        # Fail early with the column name instead of an opaque estimator error.
        raise ValueError("column %r has no non-null rows to fit" % (col,))

    best_score = -1000  # sentinel: any score above this counts as an improvement
    best_model = None
    iteration = 0
    while True:
        iteration += 1
        X = df.loc[train_index,[col]]
        y = df.y[train_index]
        # A fresh estimator each round, so best_model is never refitted in place.
        internal_model = Ridge(alpha=0.1,fit_intercept=False)
        internal_model.fit(X,y)
        # Predict once; the result feeds both the score and the residuals.
        predictions = internal_model.predict(X)
        score = r2_score(y,predictions)
        logging.debug("iteration %d: score = %.7f, train fraction = %.1f%%",iteration,score,len(train_index) * 100./start_train_size)

        # Written as "not >" (rather than "<=") so a NaN score also stops the loop.
        if not score > best_score:
            break
        best_score = score
        best_model = internal_model

        # Keep only rows whose absolute residual is strictly below the threshold.
        residuals = np.abs(y - predictions)
        train_index = residuals[residuals < residuals.quantile(quantile)].index
        if len(train_index) < min_train_fraction * start_train_size:
            break

    return best_model,best_score

In [None]:
# Candidate features: every column except the 'id'/'timestamp' columns and the target 'y'.
cols2fit = df.drop(['id','timestamp','y'],axis=1).columns
# Fixed: this line previously read "logging,info(...)", which builds a tuple
# and calls an undefined name `info`, raising NameError.
logging.info("performing recurrent linear fit to %d columns",len(cols2fit))
# Map each column label to its best fitted model and the score it reached.
models = {}
for c in cols2fit:
    m,s = recurrent_linear_fit(df,c)
    models[c] = {'model': m, 'score': s}

In [None]:
# Distribution of the best score reached for each fitted column.
scores = sorted(a['score'] for a in models.values())
# Explicit figure/axes with labels so the plot is readable on its own;
# the trailing ';' suppresses the textual repr of the last call.
fig, ax = plt.subplots()
ax.hist(scores,bins=50)
ax.set_xlabel("best R^2 score")
ax.set_ylabel("number of columns")
ax.set_title("Recurrent linear fit: score distribution across columns");