In [2]:
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import numpy as np
import heapq
from matplotlib import pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import r2_score,  mean_absolute_percentage_error,  mean_squared_error,  mean_absolute_error
import joblib
from sklearn.tree import export_graphviz
import pydot
import imblearn
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.metrics import cohen_kappa_score
import pickle

import sys
import os.path


In [3]:
base = "/mnt/nfs/scratch1/hshukla/vanilla_results/"
output_folder = "/mnt/nfs/scratch1/hshukla/vanilla_results/"

file_base = "{}_{}_{}_{}_baselinefeatures.pkl" #train_total_2012_2015_baselinefeatures.pkl
is_dps = True
is_freq = False

if is_dps:
    target_metric = "dps"
    target_column = "is_dps_cut"
else:
    target_metric = "env"
    target_column = "d_environmental"

if is_freq:
    feature_regex = "freq*"
    feature = "freq"
else:
    feature_regex = "tfidf*"
    feature = "tfidf"
    
start_year = 2012
end_year = 2015
pred_year = 2016
RANDOM_STATE = 5
exper_num = 1

original_stdout = sys.stdout
logfile_path = "{}_{}_baseline.txt".format(target_metric,feature)
if not os.path.exists(logfile_path):
    file = open(logfile_path, 'a+')
    file.write("Init File\n")
    file.close()
out_file = open(logfile_path, 'a+')
sys.stdout = out_file

In [4]:
with open(base + file_base.format("train", target_metric, start_year, end_year), "rb") as file:
    train_df = pickle.load(file)
with open(base + file_base.format("test", target_metric, pred_year, pred_year), "rb") as file:
    test_df = pickle.load(file)

In [5]:
# Error during processing - test doesn't have some of the token in training (column omitted instead of 0s). Quick fix
def modify_test():
    tr_cols = set(list(train_df.filter(regex="freq*"))).union(set(list(train_df.filter(regex="tfidf*"))))
    tst_cols = set(list(test_df.filter(regex="freq*").columns)).union(set(list(test_df.filter(regex="tfidf*"))))

    extra_in_training = list(tr_cols.difference(tst_cols))
    extra_in_training.sort(key = lambda name: train_df.columns.get_loc(name)) #Sort so we insert starting at the leftmost col
    
    for col in extra_in_training:
        test_df.insert(train_df.columns.get_loc(col), col, [0]*len(test_df))
modify_test()
(train_df.columns == test_df.columns).all()

True

In [6]:
train_documents_weights = train_df.filter(regex=feature_regex).to_numpy().tolist()
train_documents_labels = train_df.loc[:,target_column].to_list()

test_documents_weights = test_df.filter(regex=feature_regex).to_numpy().tolist()
y_actual = test_df.loc[:,target_column].to_list()

In [7]:
def print_metrics_classif(y_real, y_predicted):
    accuracy = accuracy_score(y_real, y_predicted)
    precision = precision_score(y_real, y_predicted)
    recall = recall_score(y_real, y_predicted)
    f1 = f1_score(y_real, y_predicted)
    print("Accuracy: {:.4f}".format(accuracy))
    print("Precision: {:.4f}".format(precision))
    print("Recall: {:.4f}".format(recall))
    print("F1-score: {:.4f}".format(f1))
    print(classification_report(y_real, y_predicted, target_names=["no_cut", "yes_cut"]))
    
    cm = confusion_matrix(y_real, y_predicted)
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print(cm.diagonal())
    
def train_and_validate_classification(max_depth,samp_strat=None, class_weight=None):
    print("Max_Depth: {}".format(max_depth))
    if class_weight:
        print("Using class weighting")
        print(class_weight)
        rf = RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1, max_depth=max_depth, class_weight=class_weight)
    else:
        rf = RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1, max_depth=max_depth)
    
    if samp_strat:
        print("SMOTE sampling: {}".format(samp_strat))
        smote = SMOTE(sampling_strategy=samp_strat, random_state=RANDOM_STATE)
        x_vals, y_vals = smote.fit_resample(train_documents_weights, train_documents_labels)
    else:
        x_vals = train_documents_weights
        y_vals = train_documents_labels
    
    # Output class counts
    counter = Counter(y_vals)
    print(counter)
    
    # Fit the model
    rf.fit(X=x_vals, y=y_vals)
    
    # Get training metrics first
    print("- - - Train - - -")
    y_pred_train = rf.predict(x_vals)
    print_metrics_classif(y_vals, y_pred_train)
    print()
    
    # Prediction metrics
    print("- - - Test - - -")
    y_pred_test = rf.predict(test_documents_weights)
    print("Cohen-Kappa: {:.4f}".format(cohen_kappa_score(y_actual, y_pred_test)))
    print_metrics_classif(y_actual, y_pred_test)
    
        
    print("\n")
    print("----------------------------------------------------------------")
    
    
    
def print_metrics_reg(y_real, y_predicted):
    r2 = r2_score(y_real, y_predicted)
    mape = mean_absolute_percentage_error(y_real, y_predicted)
    mse = mean_squared_error(y_real, y_predicted)
    mae = mean_absolute_error(y_real, y_predicted)
    print("R2: {:.4f}".format(r2))
    print("mape: {:.4f}".format(mape))
    print("mse: {:.4f}".format(mse))
    print("mae: {:.4f}".format(mae))
    
def train_and_validate_regression(max_depth, criteria):
    print("Max_Depth: {}".format(max_depth))
    print("Criteria: {}".format(criteria))
    rf = RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=-1, max_depth=max_depth, criterion=criteria)
    
    # Fit the model
    rf.fit(X=train_documents_weights, y=train_documents_labels)
    
    # Get training metrics first
    print("- - - Train - - -")
    y_pred_train = rf.predict(train_documents_weights)
    print_metrics_reg(train_documents_labels, y_pred_train)
    print()
    
    # Prediction metrics
    print("- - - Test - - -")
    y_pred_test = rf.predict(test_documents_weights)
    print_metrics_reg(y_actual, y_pred_test)
    
    print("\n")
    print("----------------------------------------------------------------")

In [8]:
if is_dps:
    display("Here")
    depths = [3,5,7]
    sample_strategies = [.1,.3,.5,.7,1]
    for depth in depths:
        for samp_strat in sample_strategies:
            train_and_validate_classification(depth,samp_strat=samp_strat)

    for depth in depths:
        train_and_validate_classification(depth,class_weight=class_weights)
else:
    display("here2")
    depths = [3,5,7]
    criteria = ["mse", "mae"]
    for depth in depths:
        for crit in criteria:
            train_and_validate_regression(depth,crit)

'Here'

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

NameError: name 'class_weights' is not defined