In [1]:
# Load data
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.preprocessing
from sklearn.linear_model import LogisticRegression
import project_env
from imp import reload
import math
import os
import run_logreg
import project_env
from sklearn.metrics import precision_recall_curve

#reload(project_env)
#reload(run_logreg)

%matplotlib inline


In [2]:
class Logistic_Regression_Specs():
    def __init__(self, split_data, stationid, target, empty=True, squares=False, num_append=0, C=1e5, penalty='l2'):
        self.stationid = stationid
        self.target = target
        self.split_data = split_data
        self.empty = empty
        self.squares = squares
        self.num_append = num_append
        self.penalty = penalty
        self.C = C

def construct_key(spec):
    key = ''
    if spec.target != 'y_60m':
        key = key + spec.target + ' '
    if spec.squares == True:
        key = key + 'squares; '
    if spec.num_append > 0:
        key = key + 'append: ' + str(spec.num_append) + '; '
    key = key + 'penalty: ' + spec.penalty + '; '
    key = key + 'c: ' + str(spec.C) + '; '
    return key

def run_models(list_of_specs):
    '''Creates a dictionary of models based on list of specs objects'''
    
    logregs = {}
    scalers = {}
    predictions = {}
    specs = {}
    
    for spec in list_of_specs:
        logregs[construct_key(spec)], scalers[construct_key(spec)], predictions[construct_key(spec)] = run_logreg.do_logreg(spec, plot = False) 
        specs[(construct_key(spec))] = spec
    return logregs, scalers, predictions, specs

def pr_curve(predictions, true_value, target_recall=0.95):
    curve = precision_recall_curve(true_value, predictions)
    precision, recall, thresholds = curve
    mp, mr, mt = project_env.max_precision_for_recall(curve, target_recall=target_recall)
    return mp, mr, mt


In [3]:
# test loading data
data = project_env.load_split_bucket(519, target='y_60m', log=False)
print('done loading')

done loading


In [4]:
station_id = '519'

data = project_env.load_split_bucket(station_id, target='y_30m', log=False)
data_empty = project_env.binarize(data, -1)
gold_labels_30_e = data_empty['test'][1]

data = project_env.load_split_bucket(station_id, target='y_60m', log=False)
data_empty = project_env.binarize(data, -1)
gold_labels_60_e = data_empty['test'][1]

spec_30_e = Logistic_Regression_Specs(data, station_id, 'y_30m', empty=True, squares=False, num_append=0, C=0.01, penalty='l1')
spec_60_e = Logistic_Regression_Specs(data, station_id, 'y_60m', empty=True, squares=False, num_append=0, C=100, penalty='l1')

logregs_30_e, scalers_30_e, predictions_30_e = run_logreg.do_logreg(spec_30_e, plot=False, merge_train_dev=True)
logregs_60_e, scalers_60_e, predictions_60_e = run_logreg.do_logreg(spec_60_e, plot=False, merge_train_dev=True)


Training set X shape: (5204, 22)
Trained on train set of 5204 examples
Evaluating on dev set of 968 examples
Accuracy: 0.855371900826
[[720  81]
 [ 59 108]]
Training set X shape: (5204, 22)
Trained on train set of 5204 examples
Evaluating on dev set of 968 examples
Accuracy: 0.780991735537
[[626 175]
 [ 37 130]]


In [5]:
print('predictions_60_e = ' + str(predictions_60_e.shape))
print('gold_labels_60_e = ' + str(gold_labels_60_e.shape))

print('predictions_30_e = ' + str(predictions_30_e.shape))
print('gold_labels_30_e = ' + str(gold_labels_30_e.shape))


predictions_60_e = (968,)
gold_labels_60_e = (968,)
predictions_30_e = (968,)
gold_labels_30_e = (981,)
