In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import datetime as dt
import pylab as pl

from sklearn.cross_validation import cross_val_score 
from sklearn.cross_validation import KFold
import sklearn.preprocessing as pp
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import Lasso
from sklearn.decomposition import RandomizedPCA
from sklearn.linear_model import Ridge
from sklearn import metrics
from sklearn import cross_validation
from sklearn.datasets import load_digits
from sklearn.learning_curve import learning_curve
import sklearn.decomposition
import sklearn.ensemble as sk
from sklearn import linear_model
from sklearn.svm import SVR
from sklearn.grid_search import GridSearchCV

import random
import sys
from scipy import stats
% matplotlib inline
%load_ext autoreload
%autoreload 2
from regression_functions import *
from reg_plots import *


# Import Data

In [None]:
# Identifier of the pod under analysis; it is passed to the
# plot_fitted_and_ref_vs_time calls in the plotting cells.
pod_number = 'F9'

In [None]:
# Load the pod's data file. The first CSV column is parsed as dates and used
# as the index. The path is built from pod_number so it always matches the
# pod selected in the notebook, and the public pd.read_csv entry point is
# used instead of the internal pd.io.parsers module path.
df_all = pd.read_csv('data/{}_all.csv'.format(pod_number), parse_dates=[0], index_col=0)

## Declare whether to process raw or filtered data.

In [None]:
# Dataset selector: 1 = filtered, 0 = raw.
which_data = 0
# Unpack the three values the helper returns for the selected dataset.
(ref_column,
 leave_out_pod,
 pod_ozone) = declare_filt_or_raw_dataset(which_data)

#### Call the scaling function and create a dataframe with scaled data.

In [None]:
# Run the scaling helper on the full dataset and unpack its four results.
(df_scaled,
 features,
 minmax_scale_fit,
 standard_scale_fit) = scale_features_and_create_day_column(df_all, ref_column)
# Display the length of the scaled frame as the cell output.
len(df_scaled)

### Declare whether you'd like to use holdout dates from a previous run.

The MSE for the cross-validation set is high in this plot, and the training and cross-validation scores converge, which tells us that the model has reasonably high bias. To reduce the bias, create more features and add polynomial effects.

### Plot the residuals and comparison curves.

In [None]:
# Diagnostic plots for the base-feature cross-validation results.
num_good_feat = len(base_features)
fitted_vs_ref_plot(df_cv_lin_base, num_good_feat, ref_column)
# Fitted/reference time plot, called with 1, 2 and 3 as the third argument.
for time_plot_arg in (1, 2, 3):
    plot_fitted_and_ref_vs_time(df_cv_lin_base, pod_number, time_plot_arg, ref_column)
resid = plot_resid_vs_conc(df_cv_lin_base, ref_column)

#### Plot the graphs for the holdout data.

In [None]:
# Diagnostic plots for the base-feature holdout results.
num_good_feat = len(base_features)
fitted_vs_ref_plot(df_H_lin_base, num_good_feat, ref_column)
# Fitted/reference time plot, called with 1, 2 and 3 as the third argument.
for time_plot_arg in (1, 2, 3):
    plot_fitted_and_ref_vs_time(df_H_lin_base, pod_number, time_plot_arg, ref_column)
resid = plot_resid_vs_conc(df_H_lin_base, ref_column)

# Linear Reg, Base, Low Ozone

In [None]:
# Linear regression on the base features, restricted to the training and
# holdout rows whose 'O3_ppb' value is below 50.
lin_regr = linear_model.LinearRegression()
# Run the day-wise cross-validation and unpack its eight results.
(MSE_CV_low, MSE_T_low, MSE_H_low, high_MSE_cv_low,
 X_pred_cv_base_low, y_cv_low,
 df_cv_lin_base_low, df_H_lin_base_low) = cross_validation_by_day(
    lin_regr,
    base_features,
    df_tr[df_tr['O3_ppb'] < 50],
    df_hold[df_hold['O3_ppb'] < 50],
    days_tr,
    ref_column,
    cutoff_value)

In [None]:
# Diagnostic plots for the low-ozone cross-validation results.
num_good_feat = len(base_features)
fitted_vs_ref_plot(df_cv_lin_base_low, num_good_feat, ref_column)
# Fitted/reference time plot, called with 1, 2 and 3 as the third argument.
for time_plot_arg in (1, 2, 3):
    plot_fitted_and_ref_vs_time(df_cv_lin_base_low, pod_number, time_plot_arg, ref_column)
resid = plot_resid_vs_conc(df_cv_lin_base_low, ref_column)

# Linear Reg, Base, High Ozone

In [None]:
# Linear regression on the base features, restricted to the training and
# holdout rows whose 'O3_ppb' value is 50 or above.
# The bound is written as '>= 50' so it is the exact complement of the
# low-ozone subset ('< 50'); the former '> 49' also selected values between
# 49 and 50, which already belong to the low-ozone subset.
lin_regr = linear_model.LinearRegression()
# Run the day-wise cross-validation and unpack its eight results.
(MSE_CV_high, MSE_T_high, MSE_H_high, high_MSE_cv_high,
 X_pred_cv_base_high, y_cv_high,
 df_cv_lin_base_high, df_H_lin_base_high) = cross_validation_by_day(
    lin_regr,
    base_features,
    df_tr[df_tr['O3_ppb'] >= 50],
    df_hold[df_hold['O3_ppb'] >= 50],
    days_tr,
    ref_column,
    cutoff_value)

In [None]:
# Diagnostic plots for the high-ozone cross-validation results.
num_good_feat = len(base_features)
fitted_vs_ref_plot(df_cv_lin_base_high, num_good_feat, ref_column)
# Fitted/reference time plot, called with 1, 2 and 3 as the third argument.
for time_plot_arg in (1, 2, 3):
    plot_fitted_and_ref_vs_time(df_cv_lin_base_high, pod_number, time_plot_arg, ref_column)
resid = plot_resid_vs_conc(df_cv_lin_base_high, ref_column)

## Linear Regression, High & Low Combined (Cross Validation)

In [None]:
# Plot score and RMSE with the helper. The two list arguments are passed
# as-is (presumably axis limits -- confirm against plot_error_vs_features).
plot_error_vs_features(
    score,
    RMSE,
    [0, 10],
    [1, 20])

# Check Residuals

### Plot cross-validation fitted data versus reference data, and choose number of features.

In [None]:
# For each candidate feature count i (starting at 10, at most 19), compute
# the fitted cross-validation/holdout values for the first i entries of
# fs_features, plot fitted vs. reference values, and draw a learning curve.
# The upper bound is capped at len(fs_features): slicing past the end of the
# list silently returns the same features, which would produce duplicate
# plots labelled with feature counts that were never actually used.
for i in range(10, min(20, len(fs_features) + 1)):
    num_good_feat = i
    df_cv_1, df_H_1 = find_fitted_cv_values_for_best_features(
        df_tr, df_hold, fs_features, num_good_feat,
        linear_model.LinearRegression(), chunks_tr, ref_column)
    fitted_vs_ref_plot(df_cv_1, i, ref_column)
    plot_learning_curve(
        lin_regr,
        "Learning Curve- Number of features = " + str(i),
        df_tr[fs_features[:i]].values,
        df_tr[ref_column].values,
        (0, 10),
        5,
        np.array([0.1, 0.3, 0.5, 0.7, 0.8, 0.85, 0.9, 0.95, 0.97, 1.0]))

In [None]:
# Ordered list of selected feature names; other cells slice it from the front
# (fs_features[:n]), so the order of the entries matters.
# The names are used as DataFrame column keys and must stay exactly as
# written (including the leading '03' in several of them).
fs_features = [
    'ln_o3_rh_int_lead_5',
    'temp_rh_int_lead_90',
    '03_mult_rh',
    'ln_temp_mult_ln_rh',
    '03_sq_mult_rh_sq',
    'ln_03_mult_ln_rh',
    '03_sq_mult_rh_sq_&_temp_sq',
    'temp_rh_int_lag_60',
    'days from start',
    'ln_temp_rh_slope_lag_90',
    'o3_rh_int_lead_5',
    'o3_rh_int_lag_5',
    'O3_cu',
    'Rh_slope_lead_45',
    'e2v03_slope_lag_45',
    'ln_o3_temp_slope_lag_5',
]

### Enter the chosen number of features, and perform a linear regression.

In [None]:
# Keep the first num_good_feat entries of fs_features and run the day-wise
# cross-validation on them with lin_regr.
num_good_feat = 16
best_features = fs_features[:num_good_feat]
(MSE_CV, MSE_T, MSE_H, high_MSE_cv,
 X_pred_cv_best, y_cv_best,
 df_cv_lin_best, df_H_lin_best) = cross_validation_by_day(
    lin_regr,
    best_features,
    df_tr,
    df_hold,
    days_tr,
    ref_column,
    cutoff_value)

#### Print Best Features

In [None]:
# Bare expression: the notebook displays the list of chosen features as the cell output.
best_features

### Plot the cross-validation data and residuals below.

In [None]:
# Compute fitted cross-validation and holdout values for the chosen number of
# features, then draw the diagnostic plots for the cross-validation frame.
(df_lin_regr_best_feat_cv,
 df_lin_regr_best_feat_H) = find_fitted_cv_values_for_best_features(
    df_tr, df_hold, fs_features, num_good_feat,
    linear_model.LinearRegression(), chunks_tr, ref_column)
fitted_vs_ref_plot(df_lin_regr_best_feat_cv, num_good_feat, ref_column)
# Fitted/reference time plot, called with 1, 2 and 3 as the third argument.
for time_plot_arg in (1, 2, 3):
    plot_fitted_and_ref_vs_time(df_lin_regr_best_feat_cv, pod_number, time_plot_arg, ref_column)
resid = plot_resid_vs_conc(df_lin_regr_best_feat_cv, ref_column)

#### Make the same plots for the holdout data

In [None]:
# Factor passed to find_best_lambda and fit_svm_and_find_MSE
# (presumably a weight in the custom MSE scoring -- confirm in regression_functions).
cust_mse_fact = 0.5

# Linear Regression with Base Features

In [None]:
# Base feature set: the pod ozone column plus the 'Temp' and 'Rh' columns.
base_features = [
    pod_ozone,
    'Temp',
    'Rh',
]

In [None]:
# Correlation matrix of the reference column and the base features.
# sns.corrplot was deprecated and later removed from seaborn, so the
# correlations are computed with pandas and drawn with sns.heatmap.
# The colour scale is fixed to [-1, 1] so it is centred on zero correlation.
f, ax = plt.subplots(figsize=(9, 9))
sns.heatmap(df_tr[[ref_column] + base_features].corr(), annot=True,
            vmin=-1, vmax=1, cmap='RdBu_r', square=True, ax=ax)

In [None]:
# Linear regression on the base features over the full training and holdout sets.
lin_regr = linear_model.LinearRegression()
# Run the day-wise cross-validation and unpack its eight results.
(MSE_CV, MSE_T, MSE_H, high_MSE_cv,
 X_pred_cv_base, y_cv,
 df_cv_lin_base, df_H_lin_base) = cross_validation_by_day(
    lin_regr,
    base_features,
    df_tr,
    df_hold,
    days_tr,
    ref_column,
    cutoff_value)

In [None]:
# Two histogram calls with the first two arguments swapped; each call passes
# the label that names its first argument.
plot_hist(
    X_pred_cv_base, y_cv,
    'Predicted Ozone Concentration')
plot_hist(
    y_cv, X_pred_cv_base,
    'Reference Ozone Concentration')

#### Plot the learning curve for a linear regression with the base features.

In [None]:
# Learning curve for the base-feature linear regression.
# The result is deliberately NOT assigned to `plt`: that name is the
# matplotlib.pyplot alias used by other cells (e.g. plt.subplots), and
# rebinding it would break them whenever the helper returns anything other
# than the pyplot module itself.
learning_curve_plot = plot_learning_curve(
    lin_regr,
    "Learning Curve (Linear Regression- Base Features)",
    df_tr[base_features].values,
    df_tr[ref_column].values,
    (0, 14),
    days_tr,
    np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95, 1.0]))

#### Use the best lambda value found above to find holdout values.

In [None]:
# Day-wise cross-validation of a Lasso model on all_features, using
# best_lambda_lasso as the regularisation strength.
(MSE_CV_lasso, MSE_T_lasso, MSE_H_lasso, high_MSE_cv_lasso,
 X_pred_cv_lasso, y_cv_lasso,
 df_cv_lasso, df_H_lasso) = cross_validation_by_day(
    Lasso(alpha=best_lambda_lasso),
    all_features,
    df_tr,
    df_hold,
    days_tr,
    ref_column,
    cutoff_value)

In [None]:
# Diagnostic plots for the Lasso cross-validation results.
# NOTE(review): the feature count passed here is num_good_feat_ridge -- confirm
# that this is the intended value for the Lasso plots.
fitted_vs_ref_plot(df_cv_lasso, num_good_feat_ridge, ref_column)
# Fitted/reference time plot, called with 1 and 2 as the third argument.
for time_plot_arg in (1, 2):
    plot_fitted_and_ref_vs_time(df_cv_lasso, pod_number, time_plot_arg, ref_column)
resid = plot_resid_vs_conc(df_cv_lasso, ref_column)

# Lasso - Best Features

In [None]:
# Lambda search for a Lasso model on best_features; unpack the four results.
# The numeric arguments are passed positionally (presumably the search range
# and related settings -- confirm against find_best_lambda).
(best_lambda_lasso_best,
 lambda_lasso_best,
 coefs_lasso_best,
 mean_score_lambda_lasso_best) = find_best_lambda(
    Lasso,
    best_features,
    df_fits_best,
    ref_column,
    'custom_mse_scoring_function',
    days_tr,
    X,
    y,
    0.0000000000001,
    100,
    3,
    cust_mse_fact,
    cutoff_value)

In [None]:
# Plot the lambda-search results for the best-features Lasso.
plot_lambda(
    lambda_lasso_best,
    coefs_lasso_best,
    mean_score_lambda_lasso_best)

In [None]:
# Day-wise cross-validation of a Lasso model on best_features.
# The penalty is best_lambda_lasso_best, the value returned by the
# find_best_lambda call that was run on best_features; best_lambda_lasso
# belongs to the separate search run in the "Lasso- All Features" section.
# NOTE: the sixth result is still bound to y_cv_lasso (not a *_best name) so
# that any cell reading that variable keeps working.
(MSE_CV_lasso_best, MSE_T_lasso_best, MSE_H_lasso_best, high_MSE_cv_lasso_best,
 X_pred_cv_lasso_best, y_cv_lasso,
 df_cv_lasso_best, df_H_lasso_best) = cross_validation_by_day(
    Lasso(alpha=best_lambda_lasso_best),
    best_features,
    df_tr,
    df_hold,
    days_tr,
    ref_column,
    cutoff_value)

In [None]:
# Diagnostic plots for the best-features Lasso cross-validation results.
fitted_vs_ref_plot(df_cv_lasso_best, num_good_feat_ridge, ref_column)
# Fitted/reference time plot, called with 1 and 2 as the third argument.
for time_plot_arg in (1, 2):
    plot_fitted_and_ref_vs_time(df_cv_lasso_best, pod_number, time_plot_arg, ref_column)
resid = plot_resid_vs_conc(df_cv_lasso_best, ref_column)

# Support Vector Machine - Linear

In [None]:
# Fit the SVM helper on best_features and unpack its three results.
(RMSE_CV_day,
 df_svm_fit,
 df_svm_H) = fit_svm_and_find_MSE(
    best_features,
    df_tr,
    days_tr,
    ref_column,
    cutoff_value,
    df_hold,
    cust_mse_fact)

In [None]:
# Diagnostic plots for the SVM fit.
fitted_vs_ref_plot(df_svm_fit, num_good_feat_ridge, ref_column)
# Fitted/reference time plot, called with 1 and 2 as the third argument.
for time_plot_arg in (1, 2):
    plot_fitted_and_ref_vs_time(df_svm_fit, pod_number, time_plot_arg, ref_column)
resid = plot_resid_vs_conc(df_svm_fit, ref_column)

### SVM - best features, holdout

# Lasso - All Features

In [None]:
# Lambda search for a Lasso model on the first num_good_feat_ridge entries of
# fs_features; unpack the four results.
# The numeric arguments are passed positionally (presumably the search range
# and related settings -- confirm against find_best_lambda).
(best_lambda_lasso,
 lambda_lasso,
 coefs_lasso,
 mean_score_lambda_lasso) = find_best_lambda(
    Lasso,
    fs_features[:num_good_feat_ridge],
    df_fits,
    ref_column,
    'custom_mse_scoring_function',
    days_tr,
    X,
    y,
    0.000001,
    100,
    3,
    cust_mse_fact,
    cutoff_value)

In [None]:
# Plot the lambda-search results for the Lasso model.
plot_lambda(
    lambda_lasso,
    coefs_lasso,
    mean_score_lambda_lasso)