In [None]:
import pandas as pd
import os
import numpy as np
from matplotlib import pyplot as pp
from pathlib import Path

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

# Business Description Data
## SP500 Data

In [None]:
# Load one business description per S&P 500 constituent.
# File names are parsed as "<ticker>_<sector>...txt": the text before the first
# underscore is the ticker, the text between the first and second is the sector.
my_path = Path('/home/lawrence/Personal/Masters/COMP0087_ Natural_Language_Processing/Project/Data/SP500')
os.chdir(my_path)
# os.listdir() returns entries in arbitrary order and includes anything that
# happens to live in the folder; keep only the .txt files and sort them so the
# three parallel lists below come out in the same order on every run.
fns = sorted(fn for fn in os.listdir(my_path) if fn.endswith('.txt'))

ticker_sp50 = []
sector_sp50 = []
bds_sp50 = []
for fn in fns:
    fn_first = fn.split('.txt')[0]
    name_parts = fn_first.split('_')
    ticker = name_parts[0]
    sector = name_parts[1]
    # The context manager closes the handle even if read() raises.
    with open(my_path / fn, 'r', encoding="utf8") as f:
        f_text = f.read()

    ticker_sp50.append(ticker)
    sector_sp50.append(sector)
    bds_sp50.append(f_text)

## Larger Dataset - excluding SP500

In [None]:
# Load the larger corpus. bds_1.txt is read as alternating lines: an id line
# followed by the description line that belongs to it.
os.chdir(Path('/home/lawrence/Personal/Masters/COMP0087_ Natural_Language_Processing/Project/Data'))
# The context manager closes the handle even if readlines() raises.
with open("bds_1.txt", "r", encoding="utf8") as f:
    f_lines = f.readlines()

company_ids_all = f_lines[0::2]
company_descriptions_all = f_lines[1::2]
# The ticker is whatever precedes the first ':' on the id line.
company_tickers = [x.split(':')[0] for x in company_ids_all]
# True where the company is already in the S&P 500 sample, so that the corpus
# used for training does not overlap with it. np.isin replaces the deprecated
# np.in1d and gives the same result for 1-D input.
removeSP = np.isin(np.array(company_tickers), list(ticker_sp50))

# Descriptions at or below this many characters are discarded.
min_description_chars = 3000

bds_all = []
ticker_all = []
# NOTE(review): ticker_all receives the full id line (trailing newline included),
# not the parsed ticker from company_tickers -- confirm this is intended.
for company_id, description, in_sp500 in zip(company_ids_all, company_descriptions_all, removeSP):
    if (len(description) > min_description_chars) and not in_sp500:
        bds_all.append(description)
        ticker_all.append(company_id)

# Get Returns Data

In [None]:
# Business-day calendar spanning the sample window.
start_d = np.datetime64('2018-01-01')
end_d = np.datetime64('2020-01-01')
business_ds = pd.date_range(start_d, end_d, freq='B')

my_path = Path('/home/lawrence/Personal/Masters/COMP0087_ Natural_Language_Processing/Project/Data/MarketData')
os.chdir(my_path)
price_data = pd.read_csv('Price.csv')

# Keep only rows whose ticker is in the S&P 500 sample (np.isin replaces the
# deprecated np.in1d; identical result for 1-D input).
select_these = np.isin(price_data.tic.values, list(ticker_sp50))

# Built as one chain so no column is ever assigned on a .loc selection of
# price_data, which is the pattern that triggers SettingWithCopyWarning.
price_sp50 = (
    price_data.loc[select_these, ['tic', 'datadate', 'prccd']]
    .assign(datadate=lambda frame: pd.to_datetime(frame['datadate'], format='%Y%m%d'))
    # Wide layout: one row per date, one column per ticker.
    .pipe(pd.pivot_table, index='datadate', columns='tic', values='prccd')
    # Gaps of up to 5 consecutive rows are filled *before* reindexing, i.e. on
    # the dates that actually occur in the file.
    .ffill(limit=5)
    # Business days absent from the file become all-NaN rows here ...
    .reindex(business_ds)
    # ... and are dropped, together with any date on which at least one
    # ticker still has no price.
    .dropna(axis=0)
)

# Daily log returns; the first row has no predecessor and is dropped.
returns_sp50 = np.log(price_sp50).diff().dropna(axis=0)

# Explaining Returns with LDA
## Train LDA on Larger Dataset

In [None]:
# Term counts for the training corpus (bds_all). English stop words are
# removed, terms must appear in at least 2 documents and in at most 95% of
# them, and the vocabulary is capped at n_features terms.
n_features = 4000
tf_vectorizer = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
    max_features=n_features,
)
tf = tf_vectorizer.fit_transform(bds_all)

# LDA with n_components topics, fitted with the online learning method and a
# fixed seed so that this particular model is reproducible.
n_components = 20
lda_20 = LatentDirichletAllocation(
    n_components=n_components,
    learning_method='online',
    learning_offset=50.,
    max_iter=5,
    random_state=0,
)
lda_20.fit(tf)

## Get Features for S&P 500

In [None]:
# Encode the S&P 500 descriptions with the vocabulary learned on the training
# corpus, then map the counts to topic weights with the fitted 20-topic model.
tf_sp50 = tf_vectorizer.transform(bds_sp50)
features_sp50 = lda_20.transform(tf_sp50)

# One row per ticker; columns get the default integer labels 0, 1, 2, ...
features_sp50_df = pd.DataFrame(features_sp50, index=ticker_sp50)

## Loop Over Dates and Perform OLS Regression

In [None]:
# For every date, regress that day's cross-section of returns on the
# standardised topic features and keep the adjusted R^2.
all_dates = returns_sp50.index
max_f = 19
adj_r2_features = []

# .loc slices by label and includes both ends, so this keeps columns
# 0..max_f. The slice does not depend on the date, hence it lives outside
# the loop.
topic_features = features_sp50_df.loc[:, 0:max_f]

for dd in all_dates:
    # Single-date row turned into a column indexed by ticker, then aligned
    # with the features; tickers with any missing value are dropped.
    day_returns = returns_sp50.loc[[dd]].transpose()
    reg_data = day_returns.join(topic_features).dropna(axis=0).values
    y, X = reg_data[:, 0], reg_data[:, 1:]

    # Standardise the regressors and append the intercept as the last column.
    X = sm.add_constant(StandardScaler().fit_transform(X), prepend=False)
    res = sm.OLS(y, X).fit()
    adj_r2_features.append(res.rsquared_adj)

In [None]:
# Adjusted R^2 of the per-date regressions, plotted against the date.
# Explicit figure/axes with labels so the figure can be read on its own.
fig, ax = pp.subplots()
ax.plot(all_dates, adj_r2_features)
ax.set_xlabel('Date')
ax.set_ylabel('Adjusted $R^2$')
ax.set_title('Cross-sectional OLS of daily returns on LDA topic features')
# Slant the date tick labels so they do not overlap.
fig.autofmt_xdate()

## Comparing Different K

Next, for each number of topics $K$, fit LDA several times and compute the adjusted $R^2$ of the daily cross-sectional regressions, averaged over all dates.

In [None]:
def _mean_adj_r2(returns, features):
    """Average adjusted R^2 of per-date cross-sectional OLS regressions.

    Parameters
    ----------
    returns : pandas.DataFrame
        One row per date, one column per ticker.
    features : pandas.DataFrame
        One row per ticker, one column per regressor. Every column is used.

    Returns
    -------
    float
        Mean over the rows of ``returns`` of the adjusted R^2 obtained by
        regressing that row's returns on the standardised features plus an
        intercept.
    """
    adj_r2_dates = []
    for dd in returns.index:
        # Single-date row turned into a column indexed by ticker, aligned with
        # the features; tickers with any missing value are dropped.
        reg_data = returns.loc[[dd]].transpose().join(features).dropna(axis=0).values
        y = reg_data[:, 0]
        X = StandardScaler().fit_transform(reg_data[:, 1:])
        # Intercept appended as the last column.
        X = sm.add_constant(X, prepend=False)
        adj_r2_dates.append(sm.OLS(y, X).fit().rsquared_adj)
    return np.mean(adj_r2_dates)


tf_sp50 = tf_vectorizer.transform(bds_sp50)
all_ks = [5, 10, 15, 20, 25, 30, 40, 50]
num_trials = 20
all_adj_r2 = []
for k in all_ks:
    print(f'Running for k = {k}')
    adj_r2_k = []
    for t in range(num_trials):
        # Seeding with the trial index keeps the trials different from each
        # other while making the whole experiment repeatable.
        this_lda = LatentDirichletAllocation(n_components=k, max_iter=5,
                                             learning_method='online',
                                             learning_offset=50.,
                                             random_state=t)
        this_lda.fit(tf)

        # NOTE: these two names are reassigned on every trial, so after this
        # cell they hold the last fitted model's features.
        features_sp50 = this_lda.transform(tf_sp50)
        features_sp50_df = pd.DataFrame(index=ticker_sp50, data=features_sp50)
        # All k topic columns enter the regression. Slicing columns 0..19, as
        # in the single-model regression, would silently drop topics for k > 20.
        adj_r2_k.append(_mean_adj_r2(returns_sp50, features_sp50_df))
    all_adj_r2.append(adj_r2_k)

In [None]:
# Mean across trials of the date-averaged adjusted R^2, one entry per K in all_ks.
list(map(np.mean, all_adj_r2))

In [None]:
# Spread across trials of the date-averaged adjusted R^2, one entry per K in
# all_ks (np.std defaults to the population standard deviation, ddof=0).
list(map(np.std, all_adj_r2))