In [23]:
import pickle
import pandas as pd

In [24]:
# Price table: the 'Date' column becomes the index and is parsed as dates.
stock_prices = pd.read_csv('data/stock_prices.csv', index_col='Date', parse_dates=True)

# Fundamentals are stored as a pickled object (used below as a dict keyed by stock).
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('data/stock_fundamentals.pkl', mode='rb') as fundamentals_file:
    stock_fundamentals = pickle.load(fundamentals_file)

In [25]:
# Keep only the rows dated 2000 or later (label-based slice on the date index).
stock_prices = stock_prices.loc['2000':]

In [26]:
# Downsample prices to one row per quarter, keeping what `.first()` returns for each bin.
# NOTE(review): pandas labels 'Q' bins with the quarter-END date by default, so each label
# carries a value taken from the START of that quarter - confirm this pairing is intended.
# NOTE(review): the 'Q' alias is deprecated in pandas >= 2.2 in favour of 'QE'.
stock_prices = stock_prices.resample('Q').first()

In [27]:
# Fundamental columns kept for every stock; the order here is the column order
# of the resulting frames.
selected_factors = [
    # valuation-style columns
    'peRatio',
    'marketCap',
    'priceToSalesRatio',
    'ptbRatio',
    'enterpriseValueOverEBITDA',
    # earnings / return columns
    'netIncomePerShare',
    'roe',
    'returnOnTangibleAssets',
    # cash-flow columns
    'freeCashFlowPerShare',
    'operatingCashFlowPerShare',
    # balance-sheet and remaining columns
    'currentRatio',
    'debtToEquity',
    'capexToOperatingCashFlow',
    'grahamNumber',
    'incomeQuality',
]


# Reduce every stock's fundamentals to the selected factor columns.
#
# Selecting by `selected_factors` already leaves out 'period' and 'symbol', so no
# separate drop() is needed. Because this cell overwrites `stock_fundamentals`,
# the previous drop(['period', 'symbol']) raised KeyError when the cell was run a
# second time (the columns were already gone); plain selection is safe to re-run.
stock_fundamentals = {
    stock: data[selected_factors]
    for stock, data in stock_fundamentals.items()
}

In [28]:
# For each stock, record the index label reported by DataFrame.first_valid_index()
# (None when pandas finds no valid row).
first_valid_indices = {
    ticker: frame.first_valid_index()
    for ticker, frame in stock_fundamentals.items()
}

In [29]:
from collections import Counter

def get_most_common_index(dict):
    """Return the most frequent value of a mapping together with its count.

    Parameters
    ----------
    dict : mapping
        Mapping whose *values* are counted; the values must be hashable.
        (The parameter name shadows the built-in and is kept only so that
        existing callers keep working.)

    Returns
    -------
    tuple
        ``(value, count)`` for the value that occurs most often.

    Raises
    ------
    IndexError
        If the mapping has no values.
    """
    value_counts = Counter(dict.values())
    # most_common(1) yields a one-element list holding the (value, count) pair.
    top_value, top_count = value_counts.most_common(1)[0]
    return top_value, top_count

# Find the first-valid-index value shared by the most stocks and how many stocks share it.
most_common_index, frequency = get_most_common_index(first_valid_indices)
# Bare expression: the notebook renders this pair as the cell output.
most_common_index, frequency

('1991-03-31', 234)

In [30]:
# Keep only the stocks whose fundamentals start at the most common first valid
# index, then align each of them to the dates of `stock_prices`.
#
# Each frame is copied before its index is replaced. The frames are the very same
# objects held in `stock_fundamentals`, so assigning `.index` on them in place (as
# this cell did before) silently rewrote that dict too and changed what earlier
# cells produce when re-run. No in-place interpolation is used for the same reason.
filtered_stock_fundamentals = {}
for stock, data in stock_fundamentals.items():
    if first_valid_indices[stock] == most_common_index:
        aligned = data.copy()
        aligned.index = pd.to_datetime(aligned.index)
        # NOTE(review): reindex() runs before interpolate(), so only observations whose
        # dates appear in `stock_prices.index` feed the time interpolation - confirm
        # that discarding the other rows first is intended.
        filtered_stock_fundamentals[stock] = (
            aligned
            .reindex(stock_prices.index)
            .interpolate(method='time')
        )

In [31]:
# Discard every stock that still has at least one missing value after alignment.
cleaned_stock_fundamentals = {
    ticker: frame
    for ticker, frame in filtered_stock_fundamentals.items()
    if frame.notna().all().all()
}

In [32]:
# Restrict prices to the stocks that kept complete fundamentals, then drop every
# price column that contains a missing value.
stock_prices = stock_prices[list(cleaned_stock_fundamentals)].dropna(axis='columns', how='any')

# Mirror that second filter on the fundamentals so both datasets cover the same stocks.
cleaned_stock_fundamentals = {
    ticker: frame
    for ticker, frame in cleaned_stock_fundamentals.items()
    if ticker in stock_prices.columns
}

In [33]:
# Persist the cleaned price table as CSV.
stock_prices.to_csv('data/cleaned_stock_prices.csv')

# Persist the cleaned fundamentals dict as a pickle next to it.
with open('data/cleaned_stock_fundamentals.pkl', mode='wb') as fundamentals_file:
    pickle.dump(cleaned_stock_fundamentals, fundamentals_file)