# Data exploration

- Pivot stock returns (one column per stock) and convert to weekly -> save to `data/stock_returns.csv`
- Index return series -> save to `data/DJIA.csv`
- Factor returns -> save to `data/factor_returns.csv`

## Import raw data

In [1]:
import pandas as pd
# Daily stock returns in long format, plus the list of stocks to keep
raw_stock_returns = pd.read_excel(io='data/raw_daily_returns.xlsx')
stock_list = pd.read_excel(io='data/stock_list.xlsx')
# Dow Jones Industrial Average price history
raw_djia = pd.read_csv(filepath_or_buffer='data/Dow Jones Industrial Average Historical Data.csv')
# Daily factor returns, including the risk-free rate
raw_factor_returns = pd.read_excel(io='data/raw_factor_returns.xlsx')
# Preview the factor table
raw_factor_returns

Unnamed: 0,Date,Excess Return on the Market,Small-Minus-Big Return,High-Minus-Low Return,Robust Minus Weak Return,Conservative Minus Aggressive Return,Risk-Free Return Rate (One Month Treasury Bill Rate),Momentum
0,2003-12-31,0.0001,-0.0141,0.0003,-0.0008,0.0015,0.00004,-0.0016
1,2004-01-02,-0.0017,0.0079,0.0043,-0.0058,0.0063,0.00003,0.0002
2,2004-01-05,0.0120,0.0028,0.0001,-0.0077,0.0082,0.00003,0.0054
3,2004-01-06,0.0020,0.0005,0.0026,-0.0048,0.0056,0.00003,0.0069
4,2004-01-07,0.0034,0.0051,-0.0008,-0.0052,0.0020,0.00003,0.0046
...,...,...,...,...,...,...,...,...
5029,2023-12-22,0.0020,0.0061,0.0010,-0.0065,0.0020,0.00021,-0.0048
5030,2023-12-26,0.0048,0.0083,0.0043,-0.0033,-0.0016,0.00021,-0.0021
5031,2023-12-27,0.0016,0.0017,0.0010,-0.0032,-0.0014,0.00021,0.0011
5032,2023-12-28,-0.0001,-0.0038,0.0002,-0.0032,0.0015,0.00021,-0.0048


## Stock returns

The raw data downloaded from WRDS contains extra tickers and company names. We first identify which tickers or names are redundant, then drop the redundant columns.

In [2]:
# Replace the raw column headers with short lowercase names
# (positional: the list must follow the column order of the raw file)
stock_columns = ['permno', 'date', 'ticker', 'name', 'returns']
raw_stock_returns.columns = stock_columns
raw_stock_returns.head(4)


Unnamed: 0,permno,date,ticker,name,returns
0,10107,2003-12-31,MSFT,MICROSOFT CORP,-0.005451
1,10107,2004-01-02,MSFT,MICROSOFT CORP,0.002923
2,10107,2004-01-05,MSFT,MICROSOFT CORP,0.025137
3,10107,2004-01-06,MSFT,MICROSOFT CORP,0.003554


In [3]:
# Wide table of daily returns: one row per date, one column per permno
returns_by_permno = raw_stock_returns.pivot(index='date', columns='permno', values='returns')
# permno -> ticker symbol lookup for the stocks in stock_list
names = stock_list[['permno', 'Symbol']].set_index('permno')
# Keep only the listed stocks (in stock_list order) and label the columns by symbol
stock_returns = returns_by_permno[names.index]
stock_returns.columns = names['Symbol']
# Show the daily returns from March 2008 onwards
stock_returns['2008-03-01':]

Symbol,AAPL,AMGN,AMZN,AXP,BA,CAT,CRM,CSCO,CVX,DIS,...,MSFT,NKE,NVDA,PG,SHW,TRV,UNH,V,VZ,WMT
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-03-03,-0.026316,-0.000220,-0.031643,-0.005201,-0.025607,0.012028,-0.015908,0.000410,0.006231,0.000000,...,-0.007717,-0.001827,-0.011688,0.002569,0.011588,0.001939,-0.002797,,-0.000826,0.006050
2008-03-04,0.023741,0.002857,0.046612,0.013546,-0.013016,-0.019945,-0.015144,-0.004508,-0.005390,-0.016661,...,0.022230,0.005658,-0.009934,-0.004521,-0.023673,0.002796,0.008846,,-0.016534,-0.000401
2008-03-05,-0.001044,-0.008545,-0.005355,-0.009848,0.013720,0.001533,0.019523,-0.000819,0.023752,-0.007217,...,0.019206,0.006288,0.011462,0.009877,0.007040,0.018443,0.009838,,0.007285,-0.006379
2008-03-06,-0.028596,-0.022320,-0.034622,-0.022259,-0.014897,-0.015310,-0.021522,-0.013601,-0.011150,-0.012326,...,-0.019556,-0.016445,-0.062820,-0.002884,-0.017864,-0.014740,-0.029860,,-0.018637,0.008639
2008-03-07,0.010915,-0.001356,0.021517,0.005812,-0.036599,-0.012862,-0.018705,0.007101,-0.028929,-0.015680,...,0.010881,-0.022491,-0.014617,-0.010675,-0.008897,0.002796,-0.013098,,-0.005669,-0.001601
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-22,-0.005547,0.017291,-0.002730,0.004221,-0.006030,0.001000,-0.003405,0.007847,0.002256,-0.010867,...,0.002784,-0.118257,-0.003266,0.007071,0.014140,0.001239,0.000827,-0.004277,0.001603,0.011951
2023-12-26,-0.002841,-0.000915,-0.000065,0.002102,0.009023,0.018150,-0.000451,0.003793,0.009004,-0.000769,...,0.000214,-0.000185,0.009195,0.004543,0.004498,0.006943,-0.000538,0.002825,-0.000267,-0.001532
2023-12-27,0.000518,0.009264,-0.000456,0.005915,-0.002626,0.008423,0.001878,0.003182,-0.003281,-0.006267,...,-0.001575,-0.008239,0.002800,0.000822,0.001919,0.008178,0.005307,-0.000888,-0.004536,0.009398
2023-12-28,0.002226,0.006736,0.000261,0.003956,-0.006677,-0.004159,-0.004274,0.000793,-0.014087,0.000221,...,0.003235,0.015775,0.002125,-0.002259,-0.001373,0.003764,0.004036,0.005677,0.004824,-0.001964


In [4]:
# Convert daily returns to weekly returns
# 1. Compound the daily returns into a growth index per stock
accumulated_returns = stock_returns.add(1).cumprod()
# 2. Keep the last available index value of each week (weeks end on Friday)
weekly_close = accumulated_returns.resample('W-FRI').last()
# 3. Week-over-week change of the index; rows with any missing value are
#    dropped, so the table starts only once every stock has a return
weekly_returns = weekly_close.pct_change().dropna()
weekly_returns.to_csv('data/stock_returns.csv')
weekly_returns

Symbol,AAPL,AMGN,AMZN,AXP,BA,CAT,CRM,CSCO,CVX,DIS,...,MSFT,NKE,NVDA,PG,SHW,TRV,UNH,V,VZ,WMT
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-03-28,0.073085,0.033408,-0.046864,-0.061548,-0.017781,0.044297,0.018270,-0.027856,0.015503,-0.028213,...,-0.043523,-0.020960,0.062635,0.001009,-0.061687,-7.742263e-03,-0.026048,-0.024709,-0.007475,-0.020853
2008-04-04,0.070415,0.001930,0.101921,0.068598,0.029672,0.009599,0.145471,0.012874,0.042012,0.007742,...,0.044787,0.040237,-0.045732,0.016705,0.084356,3.964574e-02,0.061973,0.027406,0.045642,0.043745
2008-04-11,-0.038803,0.035637,-0.063484,-0.055232,0.015995,-0.038931,-0.061886,-0.041410,0.008518,-0.033931,...,-0.030178,-0.026565,-0.013312,-0.008499,-0.014427,-9.501124e-10,-0.017260,0.025279,-0.035714,0.007353
2008-04-18,0.094468,0.001395,0.112655,0.049320,0.023419,0.145329,0.045998,0.048332,0.049324,0.038105,...,0.060820,0.017994,0.026444,-0.034888,0.040578,2.048686e-02,0.038472,0.043715,0.012363,0.027555
2008-04-25,0.053962,-0.019039,0.009488,0.049198,0.078566,-0.035530,0.065571,0.044472,-0.005259,0.032876,...,-0.005667,0.012520,0.059411,-0.009230,-0.002849,-1.987724e-03,-0.087517,0.088406,0.028032,0.023797
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-01,0.006685,0.026332,0.001976,0.056806,0.063045,0.038001,0.158749,0.002275,-0.000621,-0.036227,...,-0.007736,0.057692,-0.021161,0.008456,0.023565,2.500846e-02,0.003546,0.008455,0.031275,-0.011021
2023-12-08,0.023374,-0.012222,0.002653,-0.030675,0.046308,0.010671,-0.035346,-0.001857,-0.003453,0.005833,...,-0.000748,0.021413,0.015932,-0.049194,0.026529,1.518988e-03,0.004770,-0.002769,-0.008554,-0.018893
2023-12-15,0.009504,0.023521,0.017298,0.071721,0.079975,0.101117,0.043021,0.030798,0.034925,0.006895,...,-0.009353,0.048658,0.029133,-0.008198,0.068836,8.551270e-03,-0.033923,0.008954,-0.023007,0.012462
2023-12-22,-0.020094,0.031621,0.023005,0.028032,-0.014493,0.016169,0.018119,0.004411,0.011383,-0.026107,...,0.010385,-0.111148,-0.001227,0.009169,0.004842,1.635581e-02,-0.020353,0.001550,0.003211,0.025599


In [5]:
# Show the permno -> Symbol lookup table
names

Unnamed: 0_level_0,Symbol
permno,Unnamed: 1_level_1
14593,AAPL
14008,AMGN
84788,AMZN
59176,AXP
19561,BA
18542,CAT
90215,CRM
76076,CSCO
14541,CVX
26403,DIS


## Factor returns

1. Rename the factor columns
    - Excess Return on the Market: `mktrf`
    - Small-Minus-Big: `smb`
    - High-Minus-Low: `hml`
    - Robust Minus Weak: `rmw`
    - Conservative Minus Aggressive: `cma`
    - Momentum: `umd`
    - Risk-Free Rate: `rf`
2. Set the date as the index
3. Calculate the accumulated return
4. Resample from daily to weekly frequency, using Friday as the week-end date
5. Calculate the weekly return
6. Save to `data/factor_returns.csv`

In [6]:
# Short factor names, positional: same order as the raw columns
# (the risk-free rate comes before momentum in the raw file)
factor_columns = ['date', 'mktrf', 'smb', 'hml', 'rmw', 'cma', 'rf', 'umd']
raw_factor_returns.columns = factor_columns
# Parse the dates and use them as the index
raw_factor_returns = (
    raw_factor_returns
    .assign(date=pd.to_datetime(raw_factor_returns['date']))
    .set_index('date')
)

# Compound the daily returns into a growth index, then keep the last
# available value of each week (weeks end on Friday)
factor_accumulated_returns = (
    raw_factor_returns.add(1).cumprod().resample('W-FRI').last()
)

# Weekly return = week-over-week change of the growth index;
# rows containing missing values (at least the first week) are dropped
factor_returns = factor_accumulated_returns.pct_change().dropna()
# Persist the weekly factor returns
factor_returns.to_csv('data/factor_returns.csv')

factor_returns.head()

Unnamed: 0_level_0,mktrf,smb,hml,rmw,cma,rf,umd
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2004-01-09,0.014692,0.011545,0.011837,-0.031015,0.027469,0.00015,0.024227
2004-01-16,0.017361,0.009913,0.009712,-0.021018,0.018302,0.00015,0.016675
2004-01-23,0.003278,0.00797,0.007807,0.013115,-0.010812,0.00012,-0.012367
2004-01-30,-0.011946,-0.013974,-0.009902,0.009665,-0.008218,0.00015,-0.012953
2004-02-06,0.009092,-0.004665,-0.001827,0.008969,-0.005498,0.00015,-0.002957


## DJIA weekly return

In [7]:
# Preview the first rows of the raw DJIA data
raw_djia.head()

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,01/26/2025,44896.53,44026.27,45054.36,44026.27,3.74B,1.06%
1,01/19/2025,44424.25,43528.65,44565.26,43528.65,2.16B,2.15%
2,01/12/2025,43487.83,41924.68,43653.25,41844.89,2.50B,3.69%
3,01/05/2025,41938.45,42835.52,43115.31,41877.3,2.19B,-1.86%
4,12/29/2024,42732.13,42863.86,42905.09,42174.8,1.66B,-0.60%


In [8]:
# Keep only the date and the closing level ("Price" in the raw file)
djia_close = raw_djia[['Date', 'Price']].copy()
djia_close.columns = ['date', 'close']
# The raw dates are MM/DD/YYYY strings; parse them with an explicit format so
# day and month can never be swapped by format inference
djia_close['date'] = pd.to_datetime(djia_close['date'], format='%m/%d/%Y')
djia_close = djia_close.set_index('date')
# Convert the close to float. Going through str first makes this work both
# when the column holds strings with thousands separators ("44,896.53") and
# when read_csv has already parsed it as numeric.
djia_close['close'] = (
    djia_close['close']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(float)
)

# Resample to weekly, labelling each week by its Friday
djia_close = djia_close.resample('W-FRI').last()
# Weekly return = week-over-week change of the close; rows with missing
# values (at least the first week) are dropped
djia_returns = djia_close.pct_change().dropna()
djia_returns.columns = ['djia']
# Save to data
djia_returns.to_csv('data/DJIA.csv')
djia_returns

Unnamed: 0_level_0,djia
date,Unnamed: 1_level_1
2004-01-16,0.013541
2004-01-23,-0.003039
2004-01-30,-0.007591
2004-02-06,0.010008
2004-02-13,0.003287
...,...
2025-01-03,-0.006049
2025-01-10,-0.018573
2025-01-17,0.036944
2025-01-24,0.021533
