In [31]:
# Load data
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.preprocessing
from sklearn.linear_model import LogisticRegression
import project_env
from imp import reload
import math
import os
import run_logreg
import project_env
from sklearn.metrics import precision_recall_curve

reload(project_env)
reload(run_logreg)

%matplotlib inline

**The `project_env` package**

I wrote this python package so loading and working with data is quicker and easier. There are several convenience methods:
* `load_split_bucket(station_id)` - Load data for a bike station that's already pre-split into train, dev and test. Includes doing data cleaning and thresholding. The output is a dictionary:
```
  {
    'train': (DataFrame, Series),
    'dev': (DataFrame, Series),
    'test': (DataFrame, Series)
  }
```
  Each `(DataFrame, Series)` tuple is the feature values and target variables, respectively.
* `merge_training(split, df)` - Given two outputs of `load()`, append the training set of the second argument to the training set of the first. This is useful when trying to load data from multiple stations, but testing on one station only.
* `binarize(data, target)` - Given output of `load()` and either 1 or -1, binarize the target variable to 0 or 1. Whatever class is in the second argument will become '1' in the new data.

**The `run_logreg` package**

We wrote this python package so running many logistic regression models with different parameters is cleaner and easier. Here are the methods included:

* `do_logreg` - Takes the result of split_data and performs a logistic regression given the input parameters. It takes a parameters called "squares" that will perform some basic feature engineering by squaring some of the variables, a penalty function (l1 or l2, defaults to l2), and a c parameter (defaults to 100,000).

* `distance` - Calculates the distance betweent two stations, based on the distance formula

* `closest_stations` - Identifies the stations closest to the input station, using the `distance` function

* `add_closest_stations` - Takes split_data for one station and its station_id, splits, merges the closest stations' data and binarizes all

* `format_plot` - Formats the plot according to the desired target_recall and whether this is a plot of empty or full

### Logistic Regression Model

In [9]:
data = project_env.load_split_bucket(519, target='y_60m', log=False)
print('done loading')

done loading


In [10]:
class Logistic_Regression_Specs():
    def __init__(self, split_data, stationid, target, empty=True, squares=False, num_append=0, C=1e5, penalty='l2'):
        self.stationid = stationid
        self.target = target
        self.split_data = split_data
        self.empty = empty
        self.squares = squares
        self.num_append = num_append
        self.penalty = penalty
        self.C = C

In [11]:
def construct_key(spec):
    key = ''
    if spec.target != 'y_60m':
        key = key + spec.target + ' '
    if spec.squares == True:
        key = key + 'squares; '
    if spec.num_append > 0:
        key = key + 'append: ' + str(spec.num_append) + '; '
    key = key + 'penalty: ' + spec.penalty + '; '
    key = key + 'c: ' + str(spec.C) + '; '
    return key

In [12]:
def run_models(list_of_specs):
    '''Creates a dictionary of models based on list of specs objects'''
    
    logregs = {}
    scalers = {}
    predictions = {}
    specs = {}
    
    for spec in list_of_specs:
        logregs[construct_key(spec)], scalers[construct_key(spec)], predictions[construct_key(spec)] = run_logreg.do_logreg(spec, plot = False) 
        specs[(construct_key(spec))] = spec
    return logregs, scalers, predictions, specs

In [73]:
def pr_curve(predictions, true_value, target_recall=0.95):
    curve = precision_recall_curve(true_value, predictions)
    precision, recall, thresholds = curve
    mp, mr, mt = project_env.max_precision_for_recall(curve, target_recall=target_recall)
    return mp, mr, mt

target_vars = ['y_10m','y_15m','y_30m','y_45m','y_60m','y_90m','y_120m']
c_list = [.01,.1,1,10,100,1000]
penalties = ['l1','l2']
target_list = {}

df_dict = {}

#Returns a dictionary within a dictionary.  Outer dictionary is keyed on target var. inner dictionary keyed on a 
# "model number" and contains the specs of the model as well as a list of the [mp,mr,mt]
# You can probably used this to find the max MR for each target variable
for target in target_vars:
    data = project_env.load_split_bucket(519, target=target, log=False)
    data_empty = data_binarized = project_env.binarize(data, -1)
    gold_labels = data_empty['dev'][1]
    models = {}
    model_id=0
    rows = 2*3*int(len(c_list))*int(len(penalties))
    
    
    columns=['specs', 'mp','mr','mt']
    df_dict[target] = pd.DataFrame(data=np.zeros((rows,len(columns))), \
                                 columns=columns) 
    
    
    
    for squares in [True, False]:
        for num_append in [0,1,10]:
            for c in c_list:
                for penalty in penalties:

                    spec = Logistic_Regression_Specs(data, 519, target, empty=True, squares=squares, num_append=num_append,C=c,penalty=penalty)
                    
                    
                    
                    logregs_e, scalers_e, predictions_e = run_logreg.do_logreg(spec, plot = False)
                    #print(predictions_e)
                    mp, mr, mt = pr_curve(predictions_e,gold_labels,target_recall=0.95)
                
                    df_dict[target]['specs'].loc[model_id] = [target,squares,num_append,c,penalty]
                    df_dict[target]['mp'].loc[model_id] = mp
                    df_dict[target]['mr'].loc[model_id] = mr
                    df_dict[target]['mt'].loc[model_id] = mt
                    #model_specs = [target,squares,num_append,c,penalty]
                    #models.setdefault(model_id,[]).append([model_specs,mp,mr,mt])
                    model_id = model_id + 1 
    
    #target_list.setdefault(target,[]).append(models)
    

X shape: (4289, 44)
Evaluating on dev set of 1002 examples
Accuracy: 0.87624750499
[[512 102]
 [ 22 366]]
X shape: (4289, 44)
Evaluating on dev set of 1002 examples
Accuracy: 0.885229540918
[[548  66]
 [ 49 339]]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


X shape: (4289, 44)
Evaluating on dev set of 1002 examples
Accuracy: 0.898203592814
[[551  63]
 [ 39 349]]
X shape: (4289, 44)
Evaluating on dev set of 1002 examples
Accuracy: 0.894211576846
[[551  63]
 [ 43 345]]
X shape: (4289, 44)
Evaluating on dev set of 1002 examples
Accuracy: 0.88622754491
[[537  77]
 [ 37 351]]
X shape: (4289, 44)
Evaluating on dev set of 1002 examples
Accuracy: 0.88622754491
[[535  79]
 [ 35 353]]
X shape: (4289, 44)
Evaluating on dev set of 1002 examples
Accuracy: 0.878243512974
[[531  83]
 [ 39 349]]
X shape: (4289, 44)
Evaluating on dev set of 1002 examples
Accuracy: 0.878243512974
[[531  83]
 [ 39 349]]
X shape: (4289, 44)


  np.exp(prob, prob)


Evaluating on dev set of 1002 examples
Accuracy: 0.877245508982
[[530  84]
 [ 39 349]]
X shape: (4289, 44)
Evaluating on dev set of 1002 examples
Accuracy: 0.87624750499
[[529  85]
 [ 39 349]]
X shape: (4289, 44)
Evaluating on dev set of 1002 examples
Accuracy: 0.877245508982
[[530  84]
 [ 39 349]]
X shape: (4289, 44)
Evaluating on dev set of 1002 examples
Accuracy: 0.87624750499
[[528  86]
 [ 38 350]]
X shape: (7463, 44)
Evaluating on dev set of 1002 examples
Accuracy: 0.892215568862
[[535  79]
 [ 29 359]]
X shape: (7463, 44)
Evaluating on dev set of 1002 examples
Accuracy: 0.877245508982
[[532  82]
 [ 41 347]]
X shape: (7463, 44)
Evaluating on dev set of 1002 examples
Accuracy: 0.895209580838
[[546  68]
 [ 37 351]]
X shape: (7463, 44)
Evaluating on dev set of 1002 examples
Accuracy: 0.895209580838
[[549  65]
 [ 40 348]]
X shape: (7463, 44)
Evaluating on dev set of 1002 examples
Accuracy: 0.899201596806
[[550  64]
 [ 37 351]]
X shape: (7463, 44)
Evaluating on dev set of 1002 examples


In [75]:
for target in target_vars:
    df_dict[target].to_csv(target+".csv")