# 0. Linear_Regression_Function_Rev02

# Linear Regression Function

Example from http://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html


In [5]:
# Importing Python Libraries to use in the code



# General Import
import numpy as np
import pandas as pd
import math
import random
from scipy import stats


# System Interation
import re
import glob
import os
import sys


# PLotting
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="ticks") # Set Seaborn formatting style


# Datetime functions
import time
import datetime
from dateutil.parser import parse


# Database
from sqlalchemy import create_engine, Table, Column, Integer, String, MetaData, ForeignKey
import pyodbc
import sqlite3


# Scikit Learn
from sklearn import linear_model
from sklearn.datasets import make_classification, make_multilabel_classification, make_regression
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif, f_regression, mutual_info_regression, SelectKBest
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import StratifiedKFold, KFold, cross_val_score, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVC, SVR





# Functions

In [11]:
# Regression Score
# Working

def func_regression(func_df, func_target_name, func_k, func_poly, func_cv_splits):
    
    print("Parameters: K=", func_k, ", Poly=", func_poly, ", CV Splits=", func_cv_splits)
    print("")
    
    # Data Prep ####################################################
    
    func_df = func_df.dropna() # Drop all NA values
    func_df_numeric = func_df._get_numeric_data().copy() # Get all numeric columns from DF
    
    x_cols = list(func_df_numeric) # Define x cols
    x_cols.remove(func_target_name) # remove target names from x col names
    
    
    X = func_df_numeric[x_cols].values # create x df   

    y = func_df_numeric[func_target_name].values # create y df
    y = y.ravel() # Reshape y from (m, m) to (n, ) to allow for modelling
    
    
    
    # Data Modelling ##################################################
    
    # Select 4 best parameters
    X_SelectKBest = SelectKBest(f_regression, k=func_k).fit_transform(X, y)
    
    X_fit = SelectKBest(f_regression, k=func_k).fit(X, y)
    func_arr_scores = pd.DataFrame(list(zip(list(func_df[x_cols]), list(X_fit.scores_))), columns=['col_name', 'col_score'])
    
    
    # Build Polynomical DF
    poly = PolynomialFeatures(degree=func_poly)
    X_SelectKBest_Poly = poly.fit_transform(X_SelectKBest)
    
    # Train Model
    clf = linear_model.LinearRegression()
    
    
    # Perform n-fold cross validation
    scores = cross_val_score(clf, X_SelectKBest_Poly, y)
    # scoring =
    print("CV MAE Scores (Avg): ", abs(round(np.mean(scores), 4)))
    print("")
    print("CV MAE Scores (Avg): ", abs(np.round(scores, 4)))
    
    
    
    # Predict Using Model ######################################################
    
    # Predict Scores
    clf.fit(X_SelectKBest_Poly, y)
    y_predicted = clf.predict(X_SelectKBest_Poly)
    
    
    # Add predicted columns to the df    
    func_df.insert(func_df.shape[1], func_target_name+'_pred', y_predicted)
    
     
    
    # Print Coefficients #########################################################
    
    lst_featutes = list(func_df_numeric[x_cols].columns[list(X_fit.get_support())])
    lst_params = poly.get_feature_names()
    lst_coefs = clf.coef_
    var_intercept = clf.intercept_
    
    for counter, name in enumerate(lst_featutes):
        
        lst_params = [item.replace("x"+str(counter), str(name)) for item in lst_params]
        
    lst_final = list(zip(lst_params, lst_coefs))
    
    print('')
    print('Intercept:', var_intercept)
    print('')
    print('Coefficients')
    print(pd.DataFrame(lst_final, columns=['param', 'coef']))
    
    
    # Print Errors #########################################################
    var_mae = mean_absolute_error(func_df['target'], func_df['target_pred'])
    print('')
    print('Print Errors')
    
    
    print('')
    var_mae = mean_absolute_error(func_df['target'], func_df['target_pred'])
    print('MAE (Mean Absolute Error):', round(var_mae, 4))
    
    
    var_mse = mean_squared_error(func_df['target'], func_df['target_pred'])
    print('MAE (Mean Absolute Error):', round(var_mse, 4))
    

    var_r2_error = r2_score(func_df['target'], func_df['target_pred'])
    print('R2 Error:', round(var_r2_error,4))
    
    
    
    
    # The standard error of the regression provides the absolute measure of the typical distance that the data points fall from the regression line. S is in the units of the dependent variable.


    # R-squared provides the relative measure of the percentage of the dependent variable variance that the model explains. R-squared can range from 0 to 100%.
    
    

    
    return func_df
    

# Create Test Data

In [7]:
## Create Test Data

var_n_rows = 1000 # number of rows

# Create regression dataset
X_reg, y_reg = make_regression(n_samples=var_n_rows, n_features=10, n_informative=3, 
                               n_targets=1, tail_strength=0)

# Convert data in pandas dataframe
arr_test_data = pd.DataFrame(X_reg)


# Introduce some randomness into the target
# arr_test_data['target'] = y_reg

arr_test_data['target'] = y_reg + np.random.rand(var_n_rows) * 131.32



# Model - Train and Predict

In [12]:
# func_regression(func_df, func_target_name, func_k, func_poly, func_cv_splits)

# func_df = dataframe to predict
# func_target_name = name of target in dataframe
# func_k = select k most informative columns to use
# func_poly = number of polynomials to use
# func_cv_splits = number of cross-validation splits to use


arr_data_output = func_regression(arr_test_data, 'target', 3, 2, 10)


Parameter: K= 3 , Poly= 2 , CV Splits= 10

CV MAE Scores (Avg):  0.7548

CV MAE Scores (Avg):  [0.7441 0.7516 0.7688]

Intercept: 65.48353516815948

Coefficients
  param       coef
0     1   0.000000
1     1  27.987604
2     6  25.180953
3     8  51.356795
4   1^2  -0.817101
5   1 6  -1.561236
6   1 8   0.986759
7   6^2   0.328101
8   6 8  -0.297469
9   8^2  -0.781924

Print Errors

MAE (Mean Absolute Error): 32.2796
MAE (Mean Absolute Error): 1382.6029
R2 Error: 0.7626
