The loss function is similar to that of ridge regression, except that it uses an L1 penalty (the sum of the absolute values of the weights) instead of an L2 penalty:

$$
\begin{aligned}
\mathcal{L} &= \frac{1}{N}\sum_{i=1}^N (y_i - \hat{y}_i)^2 + \lambda \sum_{k=1}^K |w_k|
\end{aligned}
$$

In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

%matplotlib inline

Download the S&P 500 stock data from Kaggle (https://www.kaggle.com/camnugent/sandp500) and save `all_stocks_5yr.csv` to `/tmp/`, which is where the next cell reads it from.

In [60]:
# Load the raw price table and index it by trading date. Built as a single
# chain so the frame is never mutated in place after it has been created.
df = (
    pd.read_csv("/tmp/all_stocks_5yr.csv")
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .set_index("date")
)
df.head()

Unnamed: 0_level_0,open,high,low,close,volume,Name
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-02-08,15.07,15.12,14.63,14.75,8407500,AAL
2013-02-11,14.89,15.01,14.26,14.46,8882000,AAL
2013-02-12,14.45,14.51,14.1,14.27,8126000,AAL
2013-02-13,14.3,14.94,14.25,14.66,10259500,AAL
2013-02-14,14.94,14.96,13.16,13.99,31879900,AAL


In [61]:
# Daily gross return per ticker: each close divided by the previous close.
# The first row of every ticker has no previous close, so it comes out as NaN.
stocks = {}
for name, ticker_df in tqdm(df.groupby("Name")):
    closes = ticker_df["close"]
    stocks[name] = closes / closes.shift()

HBox(children=(FloatProgress(value=0.0, max=505.0), HTML(value='')))




In [63]:
# `name` is the loop variable left over from the groupby loop that filled
# `stocks`, so it holds the last ticker that loop processed.
name

'ZTS'

In [62]:
# Inspect the return series stored for the ticker currently held in `name`
# (the leftover loop variable from the cell that filled `stocks`).
stocks[name]

date
2013-02-08         NaN
2013-02-11    1.006354
2013-02-12    1.014432
2013-02-13    0.994369
2013-02-14    0.991654
                ...   
2018-02-01    1.014206
2018-02-02    0.986636
2018-02-05    0.961579
2018-02-06    0.992415
2018-02-07    1.008052
Name: close, Length: 1259, dtype: float64

In [64]:
# Put every ticker's return series side by side on the shared date index.
# Passing the dict itself makes pd.concat label each column with its key
# (the ticker symbol), so no separate column-renaming step is needed.
stock_returns_df = pd.concat(stocks, axis=1)
stock_returns_df

Unnamed: 0_level_0,A,AAL,AAP,AAPL,ABBV,ABC,ABT,ACN,ADBE,ADI,...,XL,XLNX,XOM,XRAY,XRX,XYL,YUM,ZBH,ZION,ZTS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-02-08,,,,,,,,,,,...,,,,,,,,,,
2013-02-11,0.989352,0.980339,0.993536,1.010422,0.988966,0.997228,0.995641,0.996726,0.987730,1.008315,...,1.002479,0.998667,0.996276,0.999300,1.003769,1.013658,0.988515,0.997363,1.002900,1.006354
2013-02-12,1.000448,0.986860,1.002679,0.974933,0.988006,1.004277,1.001168,1.004106,1.006470,1.004123,...,1.003532,1.003203,1.002039,1.000700,0.996245,1.017844,1.003098,0.997224,1.011565,1.014432
2013-02-13,1.002913,1.027330,1.004707,0.998097,0.995765,0.993186,1.004665,1.002590,0.997943,0.999784,...,1.000352,1.005854,1.002374,1.004899,1.005025,1.011091,0.994749,1.007423,1.010208,0.994369
2013-02-14,0.996201,0.954297,0.998354,0.999101,1.036859,1.002787,1.006965,0.994154,0.994847,1.006053,...,0.992963,1.016931,0.998308,0.996054,1.003750,1.007431,0.991927,1.004474,0.995554,0.991654
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-02-01,0.991829,0.991900,1.002564,1.002090,1.036714,0.996187,1.000322,0.998507,0.998098,0.997497,...,0.998643,0.992742,1.020275,0.998684,0.959566,1.035704,0.992789,1.008417,1.017583,1.014206
2018-02-02,0.978306,0.966964,0.971353,0.956610,0.989943,0.967066,0.992120,0.977814,0.981242,0.972286,...,1.039685,0.974479,0.949029,0.988968,0.965802,1.010957,0.983925,0.981278,0.984904,0.986636
2018-02-05,0.957474,0.955086,0.964276,0.975016,0.950855,0.957092,0.952018,0.967686,0.972552,0.958366,...,0.985098,0.948046,0.943097,0.974692,0.992096,0.960349,0.965751,0.979251,0.953832,0.961579
2018-02-06,1.003371,1.028537,1.021300,1.041792,1.015432,0.996083,1.002214,1.018837,1.022074,1.036768,...,0.990977,1.030163,0.982815,0.998633,0.983110,0.981696,1.009774,0.992856,1.016844,0.992415


In [69]:
# Shape that remains after discarding every row containing at least one NaN.
stock_returns_df.dropna().shape

(1258, 505)

In [68]:
# Remove the leading date: no ticker has a previous close there, so the whole
# row is NaN. Dropping all-NaN rows (instead of slicing off position 0) keeps
# this cell idempotent -- re-running it no longer discards one more trading day
# each time. NOTE(review): assumes the first date is the only all-NaN row.
stock_returns_df = stock_returns_df.dropna(how="all")

# Fill the remaining gaps (tickers with missing dates) with that ticker's
# median return. Assigning the result instead of using inplace=True on a slice
# avoids pandas' SettingWithCopyWarning.
stock_returns_df = stock_returns_df.fillna(stock_returns_df.median())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [42]:
from sklearn.linear_model import LinearRegression, Lasso

In [73]:
# Target: AAPL's daily return. Features: the returns of every other ticker.
y_col = "AAPL"
x = stock_returns_df.drop(columns=[y_col])
y = stock_returns_df[y_col]
print(x.shape)

(1258, 504)


In [74]:
# Fit the L1-penalised regression, then show only the coefficients that the
# penalty did not shrink to exactly zero.
model = Lasso(alpha=1e-5).fit(x, y)
nonzero_mask = model.coef_ != 0
model.coef_[nonzero_mask]

array([-0.02006946,  0.00510233,  0.00207531,  0.00635897,  0.02139995,
        0.06872295,  0.01355047,  0.02347363,  0.05106605,  0.02697582,
        0.01053292,  0.00417121,  0.05467436,  0.02171521,  0.05696464,
        0.01847754,  0.00195575,  0.00089243,  0.0292934 ,  0.06897482,
        0.01988944,  0.00285239,  0.01277602, -0.00151686,  0.01392881,
        0.00939048,  0.04642274,  0.00305502,  0.00044084,  0.09564664,
        0.02400158,  0.02214763, -0.01700085,  0.00195108])

In [75]:
# Intercept term of the Lasso model fitted above.
model.intercept_

0.29974589591860745

In [76]:
# Tickers whose coefficient is non-zero, i.e. the predictors the Lasso kept.
x.columns[model.coef_!=0]

Index(['AKAM', 'AMD', 'AMZN', 'ARNC', 'ATVI', 'AVGO', 'BBY', 'CAT', 'COL',
       'CSCO', 'DE', 'EA', 'FB', 'GLW', 'GOOG', 'HBI', 'HOLX', 'ISRG', 'LRCX',
       'MSFT', 'NCLH', 'NFLX', 'NLSN', 'NRG', 'NTAP', 'QCOM', 'QRVO', 'RMD',
       'STX', 'SWKS', 'TSS', 'TXT', 'UAA', 'WMB'],
      dtype='object')

In [82]:
# Repeat the experiment with BAC as the target and all other tickers as
# features, using the same penalty strength.
y_col = "BAC"
x = stock_returns_df.drop(columns=[y_col])
y = stock_returns_df[y_col]

model = Lasso(alpha=1e-5).fit(x, y)
# Show only the coefficients that were not shrunk to exactly zero.
model.coef_[np.flatnonzero(model.coef_)]

array([ 0.31420686,  0.04947003,  0.00412033,  0.0008302 ,  0.00035555,
       -0.00664194,  0.0250305 , -0.01498768,  0.00460399,  0.01838211,
        0.00113236,  0.17389286,  0.08260883,  0.04162725, -0.00255051,
        0.1371705 ,  0.00205606,  0.08322128, -0.00134361,  0.01985888,
        0.02197792,  0.00182359,  0.006705  ,  0.00564813])

In [78]:
# Tickers whose coefficient is non-zero, i.e. the predictors the Lasso kept.
x.columns[model.coef_!=0]

Index(['C', 'CFG', 'CMA', 'CMG', 'DVN', 'ED', 'ETFC', 'ETR', 'FB', 'FITB',
       'INCY', 'JPM', 'KEY', 'LNC', 'LNT', 'MS', 'NFX', 'RF', 'SCG', 'SCHW',
       'STI', 'UAA', 'URI', 'WFC'],
      dtype='object')

Reference: Hastie, Tibshirani and Friedman, *The Elements of Statistical Learning* (2nd ed.): https://web.stanford.edu/~hastie/Papers/ESLII.pdf