# In [12]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


def one_hot_encode(df,cat_vars=None,num_vars=None):
    '''One-hot encode the categorical columns and combine them with the numeric columns.

    Args:
        df: source DataFrame.
        cat_vars: column label(s) to one-hot encode with ``pd.get_dummies``.
            ``None`` (the default) means there are no categorical columns.
        num_vars: column label(s) to convert with ``pd.to_numeric``.
            ``None`` (the default) means there are no numeric columns.

    Returns:
        A DataFrame with the dummy columns first, followed by the numeric
        columns. If both ``cat_vars`` and ``num_vars`` are ``None``, an empty
        DataFrame that keeps ``df``'s index is returned.

    Raises:
        KeyError: if a requested column is not present in ``df``.
    '''
    # Only select the groups that were actually requested: indexing with the
    # default ``None`` (``df[None]``) raises KeyError.
    parts = []
    if cat_vars is not None:
        parts.append(pd.get_dummies(df[cat_vars]))
    if num_vars is not None:
        parts.append(df[num_vars].apply(pd.to_numeric))
    if not parts:
        # pd.concat refuses an empty list, so build the empty result directly.
        return pd.DataFrame(index=df.index)
    return pd.concat(parts,axis=1)

def train_model(model,feature_df,target_df,num_procs,mean_mse,cv_std):
    '''Cross-validate ``model`` and record its error statistics in place.

    Runs 2-fold ``cross_val_score`` with the 'neg_mean_squared_error' scorer,
    using ``num_procs`` as ``n_jobs``. Results are stored under the key
    ``model`` itself:

    * ``mean_mse[model]`` -- mean of the fold scores with the sign flipped.
    * ``cv_std[model]``   -- standard deviation of the fold scores.

    Nothing is returned; the two mappings are mutated.
    '''
    fold_scores = cross_val_score(
        model,
        feature_df,
        target_df,
        scoring='neg_mean_squared_error',
        cv=2,
        n_jobs=num_procs,
    )
    # The scorer reports negated MSE, so multiply by -1 to store a positive value.
    mean_mse[model] = np.mean(fold_scores) * -1.0
    # Spread is unaffected by the sign, so the raw scores are used directly.
    cv_std[model] = np.std(fold_scores)

def print_summary(model,mean_mse,cv_std):
    '''Print the model followed by its stored average MSE and CV standard deviation.

    ``mean_mse`` and ``cv_std`` are looked up with ``model`` as the key.
    '''
    print('\n Model:\n', model)

    average_error = mean_mse[model]
    print('Average MSE:\n', average_error)

    score_spread = cv_std[model]
    print('Standard deviation during CV:\n', score_spread)

def save_results(model,mean_mse,predictions,feature_importances):
    '''Write the model description, feature importances and predictions to disk.

    Three files are created (or overwritten) in the current working directory:

    * ``model.txt``               -- ``str(model)``
    * ``feature_importances.csv`` -- ``feature_importances.to_csv(...)``
    * ``predictions.csv``         -- ``predictions`` via ``np.savetxt``, comma-delimited

    NOTE: ``mean_mse`` is accepted but not used here; no error summary is written.
    '''
    with open('model.txt','w') as model_out:
        model_out.write(str(model))

    feature_importances.to_csv('feature_importances.csv')

    np.savetxt('predictions.csv', predictions, delimiter=',')