In [20]:
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import heapq
from matplotlib import pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score
import joblib

In [21]:
job_type = "sentence_lda"
base = "/mnt/nfs/scratch1/hshukla/sentence_results/"
output_folder = "/mnt/nfs/scratch1/hshukla/prediction_results/"
model_base = "sen_lda_{}_{}.model"
if job_type == "sentence_lda":
    window_size = 7
else:
    window_size = None

# Settings
lda_risk_path = base + model_base.format("item1a_risk", window_size)
lda_mda_path = base +  model_base.format("item7_mda", window_size)
data_path = base + "df_sen_7_3.pkl"
output_folder_path = base + "predictions/"
is_pkl = True
is_corp_filter = True
is_vanilla = job_type == "vanilla_lda"
start_year = 2012
end_year = 2015
predict_year = 2016
train_range = list(range(start_year,end_year+1))

In [3]:
# Load data
if is_pkl:
    data = pd.read_pickle(data_path)
else:
    data = pd.read_csv(data_path)
data = data.sort_values(by=['year_x'])
lda_risk = LdaModel.load(lda_risk_path)
lda_mda = LdaModel.load(lda_mda_path)

In [4]:
# Find subset of valid data
print("Using {} model for [{},{}] inclusive predicting for {}".format(job_type, start_year, end_year, predict_year))
data["is_dividend_payer"] = data["is_dividend_payer"].astype(bool)
data_valid = data[data["is_dividend_payer"] & data["is_dps_cut"].notnull()]
data_valid["is_dps_cut"] = data_valid["is_dps_cut"].astype(int)

# train/test
data_train = data_valid[(data_valid.year_x >= start_year) & (data_valid.year_x <= end_year)]
data_test = data_valid[data_valid.year_x == predict_year]
print("# train rows: {}".format(len(data_train)))
print("# test rows: {}".format(len(data_test)))

Using sentence_lda model for [2012,2015] inclusive predicting for 2016
# train rows: 5155
# test rows: 1307


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_valid["is_dps_cut"] = data_valid["is_dps_cut"].astype(int)


In [5]:
# The dictionary is defined over the entire training dataset
if is_vanilla:
    risk_docs = []
    mda_docs = []
else:
    risk_docs = [sentence_grp for doc in data_train["item1a_risk"].to_list() for sentence_grp in doc]
    mda_docs = [sentence_grp for doc in data_train["item7_mda"].to_list() for sentence_grp in doc]

risk_dict = Dictionary(risk_docs)
mda_dict = Dictionary(risk_docs)
if is_corp_filter: # Used filtering in sent-lda to speed things up
    risk_dict.filter_extremes(no_below=10)
    mda_dict.filter_extremes(no_below=10)
del risk_docs
del mda_docs

In [6]:
train_values = data_train["is_dps_cut"].value_counts() / sum(data_train["is_dps_cut"].value_counts())
test_values = data_test["is_dps_cut"].value_counts() / sum(data_test["is_dps_cut"].value_counts())
class_weight = {0: 1.0 / train_values[0], 1: 1.0 / train_values[1]}
print("Train class membership = (0,{:.3f}) and (1,{:.3f})".format(train_values[0], train_values[1]))
print("Test class membership = (0,{:.3f}) and (1,{:.3f})".format(test_values[0], test_values[1]))
print("Using class weights:")
print(class_weight)

Train class membership = (0,0.925) and (1,0.075)
Test class membership = (0,0.921) and (1,0.079)
Using class weights:
{0: 1.0811661073825503, 1: 13.320413436692506}


In [7]:
rf = RandomForestClassifier(random_state=5, warm_start=True, n_jobs=-1, verbose=1, class_weight=class_weight)

In [8]:
"""
window = 1: Only one topics
window = 5: Two topics
window = 7: Three topics
"""
if window_size == 1:
    #Yes, it is slower to use heapify for a single max element - easier implementation tho
    num_topics = 1
elif window_size == 5:
    num_topics = 2
elif window_size == 7:
    num_topics = 3
else:
    print("ERROR")
print("Num topics per sentence: {}".format(num_topics))

Num topics per sentence: 3


In [9]:
def parse_weights(weights_arr, num_topics):
    result = np.zeros((30,1))
    if isinstance(weights_arr[0], list): # we have more than 1 set of weights
        for sentence_grp in weights_arr:
            top_topics = heapq.nlargest(num_topics, sentence_grp, key=lambda x: x[1])
            for (idx_topic, weight) in top_topics:
                result[idx_topic] += weight
    else:
        """
        Just a single set of weights -> Use it! Edge case for very short docs.
        If we were to only use top x and normalize,
        then it would seem like these documents strongly related to a topic
        -> This isn't actually hit ever I dont think
        """ 
        print("Single")
        result = np.array([topic_weight[1] for topic_weight in weights_arr], dtype=np.float64)[:,None] # grab only the weight
    return result / np.linalg.norm(result, ord=1) # Normalize before returning

In [11]:
print("For training we have {} documents".format(len(data_train)))

risk_docs = [sentence_grp for doc in data_train["item1a_risk"].to_list() for sentence_grp in doc]
risk_corpus = [risk_dict.doc2bow(doc) for doc in risk_docs]
del risk_docs

mda_docs = [sentence_grp for doc in data_train["item7_mda"].to_list() for sentence_grp in doc]
mda_corpus = [mda_dict.doc2bow(doc) for doc in mda_docs]
del mda_docs


documents_weights = np.zeros((len(data_train), 60))
idx_risk = 0
idx_mda = 0
for idx_slice in range(len(data_train)):
    row = data_train.iloc[idx_slice]
    n_risk = len(row["item1a_risk"])
    n_mda = len(row["item7_mda"])

    row_risk_results = [item for item in lda_risk[risk_corpus[idx_risk:idx_risk+n_risk]]]
    row_mda_results = [item for item in lda_mda[mda_corpus[idx_mda:idx_mda+n_mda]]]

    weights_risk = parse_weights(row_risk_results, num_topics)
    weights_mda = parse_weights(row_mda_results, num_topics)
    weights = np.concatenate((weights_risk, weights_mda), axis=0)

    documents_weights[idx_slice,:] = weights.squeeze()

    idx_risk += n_risk
    idx_mda += n_mda

For training we have 5155 documents


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.3s finished


RandomForestClassifier(class_weight={0: 1.0811661073825503,
                                     1: 13.320413436692506},
                       n_jobs=-1, random_state=5, verbose=1, warm_start=True)

In [None]:
rf.fit(X=documents_weights,y=data_train["is_dps_cut"].to_list())

In [12]:
print("For testing we have {} documents".format(len(data_test)))
test_risk_docs = [sentence_grp for doc in data_test["item1a_risk"].to_list() for sentence_grp in doc]
test_risk_corpus = [risk_dict.doc2bow(doc) for doc in test_risk_docs]

test_mda_docs = [sentence_grp for doc in data_test["item7_mda"].to_list() for sentence_grp in doc]
test_mda_corpus = [mda_dict.doc2bow(doc) for doc in test_mda_docs]

del test_risk_docs
del test_mda_docs

In [13]:
# Find testing features
test_documents_weights = np.zeros((len(data_test), 60))
idx_risk = 0
idx_mda = 0
for idx_slice in range(len(data_test)):
    row = data_test.iloc[idx_slice]
    n_risk = len(row["item1a_risk"])
    n_mda = len(row["item7_mda"])
    
    row_risk_results = [item for item in lda_risk[test_risk_corpus[idx_risk:idx_risk+n_risk]]]
    row_mda_results = [item for item in lda_mda[test_mda_corpus[idx_mda:idx_mda+n_mda]]]
    
    weights_risk = parse_weights(row_risk_results, num_topics)
    weights_mda = parse_weights(row_mda_results, num_topics)
    weights = np.concatenate((weights_risk, weights_mda), axis=0)
    
    test_documents_weights[idx_slice,:] = weights.squeeze()
    
    idx_risk += n_risk
    idx_mda += n_mda

In [14]:
y_pred = rf.predict(test_documents_weights)

[Parallel(n_jobs=48)]: Using backend ThreadingBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done 100 out of 100 | elapsed:    0.1s finished


In [15]:
y_actual = data_test["is_dps_cut"].to_list()

In [16]:
accuracy = accuracy_score(y_actual, y_pred)
precision = precision_score(y_actual, y_pred)
recall = recall_score(y_actual, y_pred)
f1 = f1_score(y_actual, y_pred)
print("Accuracy: {:.4f}".format(accuracy))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("F1-score: {:.4f}".format(f1))'

Accuracy: 0.9212
Precision: 0.0000
Recall: 0.0000
F1-score: 0.0000


  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
feature_columns = ["risk_topic_" + str(i) for i in range(30)] + ["mda_topic_" + str(i) for i in range(30)]

# Copy and add training features
training_data = data_train.copy().reset_index()
training_features = pd.DataFrame(data=documents_weights, columns=feature_columns)
training_data_output = pd.concat([training_data, training_features], axis=1)

# Copy and add predictions + training features
testing_data = data_test.copy().reset_index()
testing_data["dps_cut_prediction"] = y_pred
testing_features =  pd.DataFrame(data=test_documents_weights, columns=feature_columns)
testing_data_output = pd.concat([testing_data, testing_features], axis=1)

# Write to disk
print("Writing to disk")
print("Writing training")
training_data_output.to_csv(output_folder + "training_{}_{}_{}_{}.csv".format(start_year, end_year, predict_year, window_size))
print("Writing testing")
testing_data_output.to_csv(output_folder + "testing_{}_{}_{}_{}.csv".format(start_year, end_year, predict_year, window_size))
print("Writing forest")
joblib.dump(rf, output_folder + 'rf_sentencelda_{}_{}_{}_{}.pkl'.format(start_year, end_year, predict_year, window_size))

Writing to disk
Writing training
Writing testing
Writing forest


['/mnt/nfs/scratch1/hshukla/prediction_results/rf_sentencelda_2012_2015_2016_7.pkl']