In [1]:
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from sklearn.ensemble import RandomForestClassifier

In [12]:
job_type = "sentence_lda"
base = "/mnt/nfs/scratch1/hshukla/sentence_results/"
model_base = "sen_lda_{}_{}.model"
if job_type == "sentence_lda":
    window_size = 7
else:
    window_size = None

# Settings
lda_risk_path = base + model_base.format("item1a_risk", window_size)
lda_mda_path = base +  model_base.format("item7_mda", window_size)
data_path = base + "df_sen_7_3.pkl"
output_folder_path = base + "predictions/"
is_pkl = True
is_corp_filter = True
is_vanilla = job_type == "vanilla_lda"
start_year = 2012
end_year = 2015
predict_year = 2016
train_range = list(range(start_year,end_year+1))

In [3]:
# Load data
if is_pkl:
    data = pd.read_pickle(data_path)
else:
    data = pd.read_csv(data_path)
lda_risk = LdaModel.load(lda_risk_path)
lda_mda = LdaModel.load(lda_mda_path)

In [7]:
# Find subset of valid data
print("Using {} model for [{},{}] inclusive predicting for {}".format(job_type, start_year, end_year, predict_year))
data["is_dividend_payer"] = data["is_dividend_payer"].astype(bool)
data_valid = data[data["is_dividend_payer"] & data["is_dps_cut"].notnull()]
data_valid["is_dps_cut"] = data_valid["is_dps_cut"].astype(int)

# train/test
data_train = data_valid[(data_valid.year_x >= start_year) & (data_valid.year_x <= end_year)]
data_test = data_valid[data_valid.year_x == predict_year]
print("# train rows: {}".format(len(data_train)))
print("# test rows: {}".format(len(data_test)))

Using sentence_lda model for [2012,2015] inclusive predicting for 2016
# train rows: 5155
# test rows: 1307


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_valid["is_dps_cut"] = data_valid["is_dps_cut"].astype(int)


In [9]:
if is_vanilla:
    risk_docs = []
    mda_docs = []
else:
    risk_docs = [sentence_grp for doc in data_train["item1a_risk"].to_list() for sentence_grp in doc]
    mda_docs = [sentence_grp for doc in data_train["item7_mda"].to_list() for sentence_grp in doc]

In [13]:
risk_dict = Dictionary(risk_docs)
mda_dict = Dictionary(risk_docs)
if is_corp_filter: # Used filtering in sent-lda to speed things up
    risk_dict.filter_extremes(no_below=10)
    mda_dict.filter_extremes(no_below=10)
del risk_docs
del mda_docs

In [15]:
rf = RandomForestClassifier(random_state=5)

In [18]:
year = 2012
data_slice = data_train[data_train.year_x == year]
print("For year {} we have {} documents".format(year, len(data_slice)))

risk_docs = [sentence_grp for doc in data_slice["item1a_risk"].to_list() for sentence_grp in doc]
risk_corpus = [risk_dict.doc2bow(doc) for doc in risk_docs]
mda_docs = [sentence_grp for doc in data_slice["item7_mda"].to_list() for sentence_grp in doc]
mda_corpus = [mda_dict.doc2bow(doc) for doc in risk_docs]
del risk_docs
del mda_docs

risk_results = lda_risk[risk_corpus]
mda_results = lda_mda[mda_corpus]

For year 2012 we have 1199 documents


In [None]:
risk_idx = 0
mda_idx = 0
weights = []

row = data_slice[0]

In [23]:
row = data_slice.iloc[0]
num
display("Num items: {}".format(len(row)))
display("risk items in row: {}".format(len(row["item1a_risk"])))
display("mda items in row: {}".format(len(row["item7_mda"])))

'Num items: 20'

'risk items in row: 42'

'mda items in row: 76'

In [21]:
"""
window = 1: Only one 
window = 5: Two topics
window = 7: Three topics
"""


Unnamed: 0                                                        3144
Unnamed: 0.1                                                      3144
cik                                                               1750
ticker_x                                                           AIR
filing_date                                                 2012-07-19
item1a_risk          [[item, risk, factors, the, following, descrip...
item7_mda            [[item, managements, discussion, and, analysis...
year_x                                                            2012
filing_year_x                                                     2012
perm_id                                                           4332
ticker_y                                                           AIR
year_y                                                            2012
company_name                                                  AAR CORP
is_dividend_payer                                                 True
dps_ch