In [1]:
#Import datasets
import pandas as pd
src_train_df = pd.read_csv('../problem/training.csv')
train_df = pd.read_csv('../data/train.csv', index_col=0)
val_df = pd.read_csv('../data/validation.csv', index_col=0)
test_df = pd.read_csv('../data/test.csv', index_col=0)
print(src_train_df.shape, train_df.shape, val_df.shape, test_df.shape)

(36369, 119) (23948, 110) (12421, 110) (12626, 110)


In [2]:
#Divide into X & y
X_train, y_train = train_df.drop(columns = ['Date', 'Identifier', 'Dep_Var', 'quarter']), train_df['Dep_Var']
X_val, y_val = val_df.drop(columns = ['Date', 'Identifier', 'Dep_Var', 'quarter']), val_df['Dep_Var']
X_test, y_test = test_df.drop(columns = ['Date', 'Identifier', 'Dep_Var', 'quarter']), test_df['Dep_Var']
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape)

(23948, 106) (23948,) (12421, 106) (12421,) (12626, 106) (12626,)


In [3]:
#ElasticNet model - definition & training
from sklearn.linear_model import ElasticNet

alpha=0.0001
l1_ratio=1.0
random_state=0
selection='random'
regr = ElasticNet(fit_intercept=False, normalize=False, precompute=False, max_iter=1000, copy_X=True, tol=0.0001, \
                  warm_start=False, positive=True, alpha=alpha, l1_ratio=l1_ratio, random_state=random_state, selection=selection)
regr.fit(X_train, y_train)
regr.coef_

array([0.00000000e+00, 0.00000000e+00, 1.66008916e-01, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 4.40425859e-02, 3.06009915e-02,
       4.25760246e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       7.37315458e-02, 0.00000000e+00, 6.78432628e-02, 0.00000000e+00,
       0.00000000e+00, 3.27485519e-02, 4.23369990e-02, 0.00000000e+00,
       0.00000000e+00, 6.08851732e-02, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 4.57454230e-02, 1.63106332e-02,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.01828985e-02,
       4.96292493e-02, 3.00346910e-02, 0.00000000e+00, 6.76009054e-02,
       1.93446056e-02, 0.00000000e+00, 2.88870083e-02, 0.00000000e+00,
       0.00000000e+00, 2.16085711e-02, 4.63789093e-03, 0.00000000e+00,
       0.00000000e+00, 2.55761560e-03, 0.00000000e+00, 1.16160887e-01,
      

In [4]:
#Prediction on validation set
val_df['preds'] = regr.predict(X_val)
val_df['preds'].describe()

count    12421.000000
mean         9.503561
std          0.740513
min         -3.173680
25%          9.225444
50%          9.645659
75%          9.949581
max         16.400470
Name: preds, dtype: float64

In [5]:
#Rank/quantile calculation
pred_grp_df = pd.Series([])
for date in val_df['Date'].unique():
    pred_df = val_df.loc[val_df['Date']==date]
    pred_grp_df = pd.concat([pred_grp_df, pd.qcut(pred_df['preds'], q=20, labels=range(20))])
val_df = pd.concat([val_df, pred_grp_df], axis=1)

In [6]:
val_df.head()

Unnamed: 0,Date,Dep_Var,Identifier,quarter,Gr1Ind_Var1,Gr1Ind_Var2,Gr1Ind_Var3,Gr1Ind_Var4,Gr1Ind_Var5,Gr1Ind_Var6,...,Gr2Ind_Var6,Gr2Ind_Var9,Gr2Ind_Var10,Gr2Ind_Var11,01,04,07,10,preds,0
23948,2014-10-23,0.0,6878,10,-0.297055,-0.152957,0.043798,0.09958,-0.041398,0.038189,...,-0.176121,-0.196782,-0.63709,-1.052166,0,0,0,1,9.801915,12
23949,2014-10-23,11.0,109890,10,-0.217643,-0.255899,0.863882,0.749049,0.064156,0.056421,...,-0.176121,-0.196782,-0.63709,0.244533,0,0,0,1,9.18491,4
23950,2014-10-23,5.0,105306,10,-0.290039,-0.152957,0.043798,0.09958,-0.041398,0.038189,...,-0.176121,-0.196782,-0.63709,0.244533,0,0,0,1,9.626083,9
23951,2014-10-23,17.0,14978,10,-0.127195,0.438725,-0.3118,-0.333193,-0.121308,0.024343,...,-0.176121,-0.196782,1.273872,0.013656,0,0,0,1,9.0427,3
23952,2014-10-23,5.0,4521,10,-0.137398,-0.443111,-0.16682,0.080581,-0.044017,0.036187,...,-0.176121,-0.196782,-0.63709,-0.588957,0,0,0,1,10.271431,18


In [7]:
#Correlation between true labels & prediction
from scipy.stats import spearmanr

spearmanr(val_df['Dep_Var'], val_df[0])

SpearmanrResult(correlation=0.03617871988761802, pvalue=5.5065501241875265e-05)

In [13]:
#Test
X = pd.concat([X_train, X_val])
y = pd.concat([y_train, y_val])
regr.fit(X_train, y_train)
test_df['preds'] = regr.predict(X_test)
test_df['preds'].describe()

count    12626.000000
mean         9.503561
std          0.785426
min         -1.287864
25%          9.173194
50%          9.649497
75%         10.004436
max         17.068848
Name: preds, dtype: float64

In [14]:
#Rank/quantile calculation
pred_grp_df = pd.Series([])
for date in test_df['Date'].unique():
    pred_df = test_df.loc[test_df['Date']==date]
    pred_grp_df = pd.concat([pred_grp_df, pd.qcut(pred_df['preds'], q=20, labels=range(20))])
test_df = pd.concat([test_df, pred_grp_df], axis=1)

In [17]:
test_df[['Date', 'Identifier', 0]].to_csv('test_results.csv')