In [1]:
#Import datasets
import pandas as pd
# Raw problem file, read with its default integer index.
src_train_df = pd.read_csv('../problem/training.csv')

# Prepared splits; the first CSV column is used as the row index for each.
split_paths = ('../data/train.csv', '../data/validation.csv', '../data/test.csv')
train_df, val_df, test_df = (pd.read_csv(path, index_col=0) for path in split_paths)

# Quick shape check of everything that was loaded.
print(src_train_df.shape, train_df.shape, val_df.shape, test_df.shape)

(36369, 119) (23948, 110) (12421, 110) (12626, 110)


In [2]:
#Split into X & y
# Columns removed from every split to form the feature matrix.
NON_FEATURE_COLS = ['Date', 'Identifier', 'Dep_Var', 'quarter']


def _split_xy(frame):
    """Return ``(features, target)`` for one split: all columns except
    ``NON_FEATURE_COLS`` as features, and the ``Dep_Var`` column as target."""
    features = frame.drop(columns = NON_FEATURE_COLS)
    target = frame['Dep_Var']
    return features, target


X_train, y_train = _split_xy(train_df)
X_val, y_val = _split_xy(val_df)
X_test, y_test = _split_xy(test_df)
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape)

(23948, 106) (23948,) (12421, 106) (12421,) (12626, 106) (12626,)


In [3]:
#ElasticNet model - definition & training
from sklearn.linear_model import ElasticNet

# Hyperparameters are kept as module-level names so they can be tuned in one place.
alpha = 0.0001        # overall penalty strength
l1_ratio = 1.0        # 1.0 puts the whole penalty on the L1 term
random_state = 0      # seeds the coordinate choice used by selection='random'
selection = 'random'

# `normalize` is deliberately not passed: the keyword was removed from ElasticNet
# in scikit-learn 1.2 (passing it raises TypeError), and False was already the
# default on the older releases that accepted it, so the fitted model is unchanged.
regr = ElasticNet(
    fit_intercept=False,
    precompute=False,
    max_iter=1000,
    copy_X=True,
    tol=0.0001,
    warm_start=False,
    positive=True,    # restrict coefficients to be non-negative
    alpha=alpha,
    l1_ratio=l1_ratio,
    random_state=random_state,
    selection=selection,
)
regr.fit(X_train, y_train)
regr.coef_

array([0.00000000e+00, 0.00000000e+00, 1.66008916e-01, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 4.40425859e-02, 3.06009915e-02,
       4.25760246e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       7.37315458e-02, 0.00000000e+00, 6.78432628e-02, 0.00000000e+00,
       0.00000000e+00, 3.27485519e-02, 4.23369990e-02, 0.00000000e+00,
       0.00000000e+00, 6.08851732e-02, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 4.57454230e-02, 1.63106332e-02,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.01828985e-02,
       4.96292493e-02, 3.00346910e-02, 0.00000000e+00, 6.76009054e-02,
       1.93446056e-02, 0.00000000e+00, 2.88870083e-02, 0.00000000e+00,
       0.00000000e+00, 2.16085711e-02, 4.63789093e-03, 0.00000000e+00,
       0.00000000e+00, 2.55761560e-03, 0.00000000e+00, 1.16160887e-01,
      

In [4]:
#Prediction on validation set
# Score the validation features with the fitted regressor and keep the result
# next to the original rows in a 'preds' column.
val_preds = regr.predict(X_val)
val_df['preds'] = val_preds
val_df['preds'].describe()

count    12421.000000
mean         9.503561
std          0.740513
min         -3.173680
25%          9.225444
50%          9.645659
75%          9.949581
max         16.400470
Name: preds, dtype: float64

In [6]:
#Rank/quantile calculation
def pred_rank_grp(df, n_bins=20, rank_col=0):
    """Attach a per-date quantile bucket of the ``preds`` column.

    Rows are grouped by ``Date``. Within each date the ``preds`` values are
    cut into ``n_bins`` equal-frequency buckets with ``pd.qcut`` and labelled
    with integers from ``0`` (lowest predictions) to ``n_bins - 1`` (highest).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Date`` and ``preds`` columns. It is not modified.
    n_bins : int, default 20
        Number of quantile buckets per date.
    rank_col : hashable, default 0
        Label of the output column. The default is the integer ``0``, the
        name the rest of the notebook indexes.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``rank_col`` as its last column. Any existing column
        carrying that label is replaced, so calling the function again on its
        own output does not create duplicate columns. Rows whose ``Date`` is
        missing are skipped by the grouping and receive no bucket (NaN).

    Raises
    ------
    ValueError
        Propagated from ``pd.qcut`` when the predictions of a date do not
        yield ``n_bins`` distinct bin edges (e.g. heavily tied values).
    """
    # labels=False makes qcut return the integer bucket index directly,
    # which gives the same values as labelling the bins with range(n_bins).
    pieces = [
        pd.qcut(group['preds'], q=n_bins, labels=False)
        for _, group in df.groupby('Date', sort=False)
    ]
    # Concatenate once instead of growing a Series inside the loop; an empty
    # input still yields a (typed) empty Series rather than an error.
    ranks = pd.concat(pieces) if pieces else pd.Series(dtype='float64')

    # Drop a bucket column left by a previous call before attaching the new
    # one; the assignment aligns the buckets to the rows by index label.
    out = df.drop(columns=[rank_col], errors='ignore')
    out[rank_col] = ranks
    return out

In [16]:
# Attach the per-date bucket of each validation prediction (stored in column 0).
# A bucket column left over from an earlier execution is dropped first, so
# re-running this cell never leaves two columns labelled 0 in val_df.
val_df = pred_rank_grp(val_df.drop(columns=[0], errors='ignore'))
val_df.head()

Unnamed: 0,Date,Dep_Var,Identifier,quarter,Gr1Ind_Var1,Gr1Ind_Var2,Gr1Ind_Var3,Gr1Ind_Var4,Gr1Ind_Var5,Gr1Ind_Var6,...,Gr2Ind_Var6,Gr2Ind_Var9,Gr2Ind_Var10,Gr2Ind_Var11,01,04,07,10,preds,0
23948,2014-10-23,0.0,6878,10,-0.297055,-0.152957,0.043798,0.09958,-0.041398,0.038189,...,-0.176121,-0.196782,-0.63709,-1.052166,0,0,0,1,9.801915,12
23949,2014-10-23,11.0,109890,10,-0.217643,-0.255899,0.863882,0.749049,0.064156,0.056421,...,-0.176121,-0.196782,-0.63709,0.244533,0,0,0,1,9.18491,4
23950,2014-10-23,5.0,105306,10,-0.290039,-0.152957,0.043798,0.09958,-0.041398,0.038189,...,-0.176121,-0.196782,-0.63709,0.244533,0,0,0,1,9.626083,9
23951,2014-10-23,17.0,14978,10,-0.127195,0.438725,-0.3118,-0.333193,-0.121308,0.024343,...,-0.176121,-0.196782,1.273872,0.013656,0,0,0,1,9.0427,3
23952,2014-10-23,5.0,4521,10,-0.137398,-0.443111,-0.16682,0.080581,-0.044017,0.036187,...,-0.176121,-0.196782,-0.63709,-0.588957,0,0,0,1,10.271431,18


In [8]:
#Correlation between true labels & prediction
from scipy.stats import spearmanr

# Rank correlation between the target and the bucket assigned from the predictions.
val_target, val_bucket = val_df['Dep_Var'], val_df[0]
spearmanr(val_target, val_bucket)

SpearmanrResult(correlation=0.03617871988761802, pvalue=5.5065501241875265e-05)

In [9]:
#Accuracy
from sklearn.metrics import accuracy_score

# Share of validation rows whose assigned bucket equals the target exactly.
accuracy_score(y_true=val_df['Dep_Var'], y_pred=val_df[0])

0.05901296191933016

In [10]:
#Test
# Refit the same estimator on train + validation, then score the test split.
# Note that this overwrites the coefficients fitted on the training split alone.
fit_features = [X_train, X_val]
fit_targets = [y_train, y_val]
X = pd.concat(fit_features)
y = pd.concat(fit_targets)
test_preds = regr.fit(X, y).predict(X_test)
test_df['preds'] = test_preds
test_df['preds'].describe()

count    12626.000000
mean         9.503296
std          0.671739
min         -2.074255
25%          9.240884
50%          9.668655
75%          9.916262
max         14.652937
Name: preds, dtype: float64

In [17]:
#Rank/quantile calculation
# Attach the per-date bucket of each test prediction (stored in column 0).
# A bucket column left over from an earlier execution is dropped first, so
# re-running this cell never leaves two columns labelled 0 in test_df.
test_df = pred_rank_grp(test_df.drop(columns=[0], errors='ignore'))
test_df.head()

Unnamed: 0,Date,Dep_Var,Identifier,quarter,Gr1Ind_Var1,Gr1Ind_Var2,Gr1Ind_Var3,Gr1Ind_Var4,Gr1Ind_Var5,Gr1Ind_Var6,...,Gr2Ind_Var6,Gr2Ind_Var9,Gr2Ind_Var10,Gr2Ind_Var11,01,04,07,10,preds,0
0,2015-10-23,,39380,10,-0.25666,-0.05034,-0.085564,0.064481,-0.035336,-0.003673,...,-0.183059,-0.205317,-0.678486,0.770893,0,0,0,1,10.235437,19
1,2015-10-23,,27686,10,-0.274452,-0.116607,-0.320745,-0.607889,-0.089114,-0.048199,...,-0.183059,-0.205317,-0.678486,-0.642672,0,0,0,1,9.471736,7
2,2015-10-23,,130,10,-0.238964,0.087118,0.012509,0.053826,-0.04329,0.004229,...,-0.183059,-0.205317,-0.678486,-0.642672,0,0,0,1,9.140933,3
3,2015-10-23,,40130,10,-0.275121,-0.071648,-0.013931,-0.019031,-0.041761,-0.004135,...,-0.183059,-0.205317,-0.678486,1.183,0,0,0,1,9.559584,8
4,2015-10-23,,30884,10,-0.263035,0.116437,-0.23062,-0.010651,-0.040844,0.002679,...,-0.183059,-0.205317,-0.678486,-0.642672,0,0,0,1,10.159049,19


In [46]:
# Write the date, identifier and assigned bucket (column 0) of every test row.
submission_cols = ['Date', 'Identifier', 0]
test_df[submission_cols].to_csv('test_results.csv')

In [18]:
# Sanity check: print each test date followed by the number of distinct
# bucket labels found among its rows.
for report_date in test_df['Date'].unique():
    on_date = test_df['Date'] == report_date
    buckets_seen = test_df[on_date][0].unique()
    print(report_date)
    print(len(buckets_seen))

2015-10-23
20
2016-01-21
20
2016-04-21
20
2016-07-22
20


In [19]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression

# Classifier baseline trained on the same features, using Dep_Var as the label.
logit_kwargs = {'fit_intercept': False, 'random_state': 0}
lr = LogisticRegression(**logit_kwargs)

# fit() returns the estimator, so the cell displays its configuration.
lr.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [20]:
# Independent copy of the validation frame without the bucket column (0),
# with 'preds' replaced by the classifier's predicted labels.
val1_df = val_df.drop(columns = [0]).copy()
lr_labels = lr.predict(X_val)
val1_df['preds'] = lr_labels
val1_df['preds'].describe()

count    12421.000000
mean        10.123017
std          5.943237
min          0.000000
25%          6.000000
50%         10.000000
75%         15.000000
max         19.000000
Name: preds, dtype: float64

In [21]:
# Preview the first five rows of the classifier-scored validation frame.
val1_df.head(n=5)

Unnamed: 0,Date,Dep_Var,Identifier,quarter,Gr1Ind_Var1,Gr1Ind_Var2,Gr1Ind_Var3,Gr1Ind_Var4,Gr1Ind_Var5,Gr1Ind_Var6,...,Gr2Ind_Var5,Gr2Ind_Var6,Gr2Ind_Var9,Gr2Ind_Var10,Gr2Ind_Var11,01,04,07,10,preds
23948,2014-10-23,0.0,6878,10,-0.297055,-0.152957,0.043798,0.09958,-0.041398,0.038189,...,0.411032,-0.176121,-0.196782,-0.63709,-1.052166,0,0,0,1,3.0
23949,2014-10-23,11.0,109890,10,-0.217643,-0.255899,0.863882,0.749049,0.064156,0.056421,...,0.037837,-0.176121,-0.196782,-0.63709,0.244533,0,0,0,1,12.0
23950,2014-10-23,5.0,105306,10,-0.290039,-0.152957,0.043798,0.09958,-0.041398,0.038189,...,-0.741523,-0.176121,-0.196782,-0.63709,0.244533,0,0,0,1,19.0
23951,2014-10-23,17.0,14978,10,-0.127195,0.438725,-0.3118,-0.333193,-0.121308,0.024343,...,-0.632472,-0.176121,-0.196782,1.273872,0.013656,0,0,0,1,2.0
23952,2014-10-23,5.0,4521,10,-0.137398,-0.443111,-0.16682,0.080581,-0.044017,0.036187,...,-0.632472,-0.176121,-0.196782,-0.63709,-0.588957,0,0,0,1,12.0


In [22]:
# Rank correlation between the target and the classifier's predicted labels.
lr_target, lr_predicted = val1_df['Dep_Var'], val1_df['preds']
spearmanr(lr_target, lr_predicted)

SpearmanrResult(correlation=0.014367825136525465, pvalue=0.10933063801752209)

In [23]:
# Share of validation rows where the classifier's label equals the target exactly.
accuracy_score(y_true=val1_df['Dep_Var'], y_pred=val1_df['preds'])

0.07672490137670075