### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import accuracy_score
import xgboost as xgb

%matplotlib inline

### Importing Data

In [2]:
train = pd.read_csv('data/train.csv', index_col = 0).reset_index(drop = True)
val = pd.read_csv('data/validation.csv', index_col = 0).reset_index(drop = True)
test = pd.read_csv('data/test.csv', index_col = 0).reset_index(drop = True)

In [3]:
x_train, y_train = train.drop(['Identifier', 'Date', 'Dep_Var', 'quarter'], axis = 1), train['Dep_Var']
x_val, y_val = val.drop(['Identifier', 'Date', 'Dep_Var', 'quarter'], axis = 1), val['Dep_Var']
x_test, y_test = test.drop(['Identifier', 'Date', 'Dep_Var', 'quarter'], axis = 1), test['Dep_Var']

### Fitting Lasso Linear Regression:

In [4]:
lr_lasso = Lasso(alpha = 0.01)
lr_lasso.fit(x_train, y_train)

Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

### Obtaining Feature importances of the top featues:

In [25]:
lasso_coefs = pd.DataFrame([x_val.columns, lr_lasso.coef_]).T.rename(str, columns = {0: 'Feature', 1: 'Importance'}).sort_values(by = 'Importance', ascending = False)
lasso_coefs['Importance'] = 100* lasso_coefs['Importance']/np.abs(sum(lr_lasso.coef_))
lasso_coefs

Unnamed: 0,Feature,Importance
105,10,22.6191
104,07,21.9214
103,04,17.4153
102,01,17.3335
101,Gr2Ind_Var11,17.0959
100,Gr2Ind_Var10,12.4031
99,Gr2Ind_Var9,11.1703
98,Gr2Ind_Var6,10.9819
97,Gr2Ind_Var5,8.13196
96,Gr2Ind_Var3,7.885


In [20]:
x_val_preds = x_val.copy()
x_val_preds = pd.concat([val[['Date', 'Dep_Var']], x_val_preds], axis = 1)
x_val_preds['preds'] = lr_lasso.predict(x_val)

### Computing the Spearman Correlation on validation data:

In [73]:
def val_loss(df,pred_col, y_true):
    df.sort_values(by=['Date',pred_col], inplace = True)
    df['rank'] = 1
    df['rank'] = df.groupby(by = 'Date')['rank'].cumsum()
    val_sizes = df.groupby(by = 'Date').size()
    df['scaled_rank'] = df.apply(lambda x : 20 * x['rank'] / (1+val_sizes[x['Date']]), axis = 1)
    df['scaled_rank'] = df['scaled_rank'].astype(int)
    return df, df['scaled_rank'].corr(y_true)

In [23]:
val_loss(x_val_preds, 'preds', 'Dep_Var')

SpearmanrResult(correlation=0.03912435560526045, pvalue=1.2910150018960377e-05)