In [1]:
import pandas as pd
import pickle
import numpy as np

from gensim.corpora import Dictionary
from gensim.matutils import corpus2csc
from gensim.models import TfidfModel

In [2]:
# Directory the corpus / dictionary artifacts are read from and the feature
# files are written to.
path = "/mnt/nfs/scratch1/hshukla/vanilla_results/"

# Training window (both ends inclusive) and the single year held out for prediction.
start_year, end_year = 2012, 2015
predict_year = 2016

# Section labels used inside the artifact file names (risk factors / MD&A).
risk_label, mda_label = "item1a", "item7"

# File-name templates, each filled with (first_year, last_year, label).
corpus_base = "{}-{}_all_{}_corpus.pkl"        # e.g. "2012-2015_all_item1a_corpus.pkl"
dict_base = "{}-{}_all_{}_dictionary.gnsm"     # e.g. "2012-2015_all_item1a_dictionary.gnsm"
id2word_base = "{}-{}_all_{}_id2word.pkl"      # e.g. "2016-2016_all_item1a_id2word.pkl"

#file_base = "{}-{}_all_{}".format(start_year, end_year)

In [3]:
# Load the pickled table (used below as a pandas DataFrame) into `table`.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts from a trusted source.
with open("/mnt/nfs/scratch1/hshukla/sentence_results/df_sen_1_1_tmp.pkl", "rb") as file:
    table = pickle.load(file)

In [54]:
# Order the rows by year, then sector (in place), and carve out the two subsets.
table.sort_values(by=["year_x", "sector"], axis=0, inplace=True)

in_train_window = (table["year_x"] >= start_year) & (table["year_x"] <= end_year)
in_predict_year = table["year_x"] == predict_year

# Both subsets get a fresh 0..n-1 index.
train = table.loc[in_train_window].reset_index(drop=True)
test = table.loc[in_predict_year].reset_index(drop=True)

In [77]:
def _load_pickle(file_path):
    """Unpickle and return the object stored at ``file_path``."""
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file; only use this on artifacts from a trusted source.
    with open(file_path, "rb") as handle:
        return pickle.load(handle)


def _remap_corpus(corpus, source_id2word, target_token2id):
    """Re-express a bag-of-words corpus in the id space of ``target_token2id``.

    Each ``(id, count)`` pair is translated id -> word via ``source_id2word``
    and word -> id via ``target_token2id``. Words missing from the target
    vocabulary are dropped. Every document comes back as a list of
    ``(id, count)`` pairs sorted by id.
    """
    remapped = []
    for doc in corpus:
        counts = {}
        for token_id, count in doc:
            target_id = target_token2id.get(source_id2word.get(token_id))
            # Non-positive counts contribute nothing, so skip them outright.
            if target_id is not None and count > 0:
                counts[target_id] = counts.get(target_id, 0) + count
        remapped.append(sorted(counts.items()))
    return remapped


def _corpus_to_frame(corpus, num_terms, column_names):
    """Densify a bag-of-words style corpus into a documents x terms DataFrame."""
    # corpus2csc returns a terms x documents sparse matrix. Transposing while
    # still sparse avoids an extra dense copy.
    dense = corpus2csc(corpus, num_terms=num_terms).T.toarray()
    return pd.DataFrame(dense, columns=column_names)


def create_baseline(label):
    """Build raw-frequency and TF-IDF baseline features for one section label.

    Loads the train corpus and dictionary for ``start_year``-``end_year`` and
    the test corpus and id->word map for ``predict_year`` from ``path``. The
    test corpus is re-indexed with the training dictionary (words unseen in
    training are dropped), and both corpora are densified into DataFrames with
    one row per document and one column per training-vocabulary id.

    Parameters
    ----------
    label : str
        Section label used in the artifact file names and column names.

    Returns
    -------
    tuple of two lists
        ``([train_freq, train_tfidf], [test_freq, test_tfidf])``. Columns are
        named ``"freq_<label>_<word>"`` / ``"tfidf_<label>_<word>"``. All four
        frames share the same columns in the same order.
    """
    train_corp = _load_pickle(path + corpus_base.format(start_year, end_year, label))
    test_corp = _load_pickle(path + corpus_base.format(predict_year, predict_year, label))
    train_dict = Dictionary.load(path + dict_base.format(start_year, end_year, label))
    test_id2word = _load_pickle(path + id2word_base.format(predict_year, predict_year, label))

    # Dictionary.id2token is filled lazily and may be empty right after load,
    # so derive the reverse mapping from token2id, which is always populated.
    token2id = train_dict.token2id
    id2token = {token_id: token for token, token_id in token2id.items()}

    # Re-index the test corpus into the training vocabulary.
    test_corp = _remap_corpus(test_corp, test_id2word, token2id)

    # Fix the matrix width up front. Left to itself corpus2csc sizes each
    # matrix from the largest id it happens to see, so train and test frames
    # could end up with different column counts.
    max_dict_id = max(id2token, default=-1)
    max_corpus_id = max((token_id for doc in train_corp for token_id, _ in doc), default=-1)
    num_terms = max(max_dict_id, max_corpus_id) + 1

    def column_names(kind):
        # Ids with no dictionary entry are labelled "None".
        return ["{}_{}_{}".format(kind, label, id2token.get(term_id)) for term_id in range(num_terms)]

    freq_columns = column_names("freq")
    tfidf_columns = column_names("tfidf")

    # Frequency based features
    train_freq_features = _corpus_to_frame(train_corp, num_terms, freq_columns)
    test_freq_features = _corpus_to_frame(test_corp, num_terms, freq_columns)

    # TFIDF features: the model is fitted on the training corpus only and then
    # applied to both corpora.
    tfidf = TfidfModel(train_corp)
    train_tfidf_features = _corpus_to_frame(list(tfidf[train_corp]), num_terms, tfidf_columns)
    test_tfidf_features = _corpus_to_frame(list(tfidf[test_corp]), num_terms, tfidf_columns)

    return ([train_freq_features, train_tfidf_features], [test_freq_features, test_tfidf_features])

In [79]:
# Collect the frequency and TF-IDF frames for every section label.
train_dfs = []
test_dfs = []
for label in [risk_label, mda_label]:
    tr, tst = create_baseline(label)
    train_dfs.extend(tr)
    test_dfs.extend(tst)

# Features are attached to the metadata purely by row position. A row-count
# mismatch would silently drop or misalign rows, so fail loudly instead.
for split_name, meta_frame, feature_frames in (("train", train, train_dfs), ("test", test, test_dfs)):
    for feature_frame in feature_frames:
        if len(feature_frame) != len(meta_frame):
            raise ValueError(
                "{} features have {} rows but the {} table has {} rows".format(
                    split_name, len(feature_frame), split_name, len(meta_frame)
                )
            )

# One column-wise concat instead of repeatedly merging an ever-wider frame.
train_total = pd.concat([train] + train_dfs, axis=1)
test_total = pd.concat([test] + test_dfs, axis=1)

In [80]:
# Preview the merged training frame: metadata columns followed by the
# freq_/tfidf_ feature columns.
train_total

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,cik,ticker_x,filing_date,item1a_risk,item7_mda,year_x,filing_year_x,perm_id,...,tfidf_item7_pushdown,tfidf_item7_clarify_principle,tfidf_item7_avail_exemption,tfidf_item7_asu_simplifying,tfidf_item7_interest_imputation,tfidf_item7_subtopic_simplifying,tfidf_item7_concept_extraordinary,tfidf_item7_acquirer_obtains,tfidf_item7_eliminating_concept,tfidf_item7_pushdown_accounting
0,3149,3149,2969,APD,2012-11-20,"[[item], [risk, factors, conjunction, evaluati...","[[item], [management, discussion, and, analysi...",2012,2012,2799,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000
1,3153,3153,3570,LNG,2012-02-24,"[[item], [risk, factors, the, following, impor...","[[item], [management, discussion, and, analysi...",2012,2012,3037,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000
2,3155,3155,4447,HES,2012-02-27,"[[item], [risk, factors, related, our, busines...","[[item], [management, discussion, analysis, fi...",2012,2012,2916,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000
3,3164,3164,5981,AVD,2012-03-09,"[[item], [risk, factors, the, regulatory, clim...","[[item, management, discussion, and, analysis,...",2012,2012,1985,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000
4,3165,3165,6176,AP,2012-03-15,"[[item], [risk, factors, from, time, time, imp...","[[item], [management, discussion, and, analysi...",2012,2012,2149,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10612,13195,13195,1379895,DYN,2015-02-25,"[[item], [risk, factors, please, note, risk, u...","[[item], [management, discussion, analysis, fi...",2015,2015,3053,...,0.026388,0.0,0.0,0.0,0.017213,0.0,0.0,0.009373,0.0,0.009207
10613,13389,13389,1466593,OTTR,2015-03-02,"[[item], [risk, factors, risk, factors, and, c...","[[item], [management, discussion, and, analysi...",2015,2015,1525,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000
10614,13595,13595,1561660,PEGI,2015-03-02,"[[item], [risk, factors], [risk, factors, you,...","[[item], [management, discussion, analysis, fi...",2015,2015,635,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000
10615,13673,13673,1587732,OGS,2015-02-19,"[[item], [risk, factorsour, investor, consider...","[[item], [management, discussion, and, analysi...",2015,2015,2750,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000


In [5]:
# Output file-name template, filled with (split, subset, first_year, last_year),
# e.g. "train_total_2012_2015_baselinefeatures.pkl".
out_base = "{}_{}_{}_{}_baselinefeatures.pkl"

In [91]:
# Persist the complete train / test feature frames (pickle protocol 0).
train_total_file = path + out_base.format("train", "total", start_year, end_year)
test_total_file = path + out_base.format("test", "total", predict_year, predict_year)

train_total.to_pickle(train_total_file, protocol=0)
test_total.to_pickle(test_total_file, protocol=0)

In [3]:
# Output dividend payer features
def get_dividend_payers(data):
    """Return the dividend-paying rows of ``data`` that have a known ``is_dps_cut``.

    The input frame is left untouched; the result is an independent copy in
    which ``is_dividend_payer`` is cast to bool and ``is_dps_cut`` to int.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``is_dividend_payer`` and ``is_dps_cut``.

    Returns
    -------
    pandas.DataFrame
        Rows where ``is_dividend_payer`` is truthy and ``is_dps_cut`` is not null.
    """
    # NOTE(review): astype(bool) maps NaN to True, so a missing payer flag is
    # treated as "payer" - confirm that this is intended.
    payer_mask = data["is_dividend_payer"].astype(bool)
    has_label = data["is_dps_cut"].notnull()

    # Work on an explicit copy so the caller's frame is never modified and the
    # column assignments below do not write into a slice of it.
    data_valid = data[payer_mask & has_label].copy()
    data_valid["is_dividend_payer"] = data_valid["is_dividend_payer"].astype(bool)
    data_valid["is_dps_cut"] = data_valid["is_dps_cut"].astype(int)
    return data_valid

# Output environmental score features
def get_environmental(data):
    """Return the rows of ``data`` whose ``d_environmental`` value is not null."""
    has_score = data["d_environmental"].notna()
    return data.loc[has_score]

In [6]:
# Dividend-payer export, currently commented out:
# get_dividend_payers(train_total).to_pickle(path + out_base.format("train", "dps", start_year, end_year), protocol=0)
# get_dividend_payers(test_total).to_pickle(path + out_base.format("test", "dps", predict_year, predict_year), protocol=0)

# Read the saved full feature frames back from disk.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts from a trusted source.
train_total_file = path + out_base.format("train", "total", start_year, end_year)
test_total_file = path + out_base.format("test", "total", predict_year, predict_year)

with open(train_total_file, "rb") as file:
    train_total = pickle.load(file)
with open(test_total_file, "rb") as file:
    test_total = pickle.load(file)

In [7]:
# Keep only the rows that pass get_environmental and save them as the "env" subsets.
train_env = get_environmental(train_total)
test_env = get_environmental(test_total)

train_env.to_pickle(path + out_base.format("train", "env", start_year, end_year), protocol=0)
test_env.to_pickle(path + out_base.format("test", "env", predict_year, predict_year), protocol=0)