In [1]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

In [2]:
joined_df = pd.read_csv('data/joined_data.csv')
joined_without_state_df = pd.read_csv('data/joined_data(without_state).csv')

industry_list = list(joined_df['Industry'].unique())
county_list = list(joined_df['County'].unique())
joined_without_state_df

regressions_df = joined_without_state_df.drop(columns='Year')
regressions_df = regressions_df[regressions_df['Industry'] != 'All ']
regressions_df

Unnamed: 0,Industry,County,WorkingPopulation,Population,EarningsEuro,pctIndustryEmployement
0,"Agriculture, forestry and fishing",Carlow,1581.0,46014.0,809.06,3.435911
1,"Agriculture, forestry and fishing",Carlow,1413.0,50349.0,1187.51,2.806411
2,"Agriculture, forestry and fishing",Carlow,1397.0,54612.0,1236.43,2.558046
3,"Agriculture, forestry and fishing",Carlow,1615.0,56932.0,1406.40,2.836717
4,"Agriculture, forestry and fishing",Dublin,2711.0,1122821.0,27289.04,0.241445
...,...,...,...,...,...,...
1659,All industries,Donegal,58353.0,159192.0,3230.88,36.655736
1660,All industries,Monaghan,21101.0,52593.0,914.77,40.121309
1661,All industries,Monaghan,25378.0,55997.0,1296.83,45.320285
1662,All industries,Monaghan,23005.0,60483.0,1274.55,38.035481


#Avail of a StandardScaler to get the numeric variables into the correct form for regressions analysis

In [7]:

def run_regression(target_feature, dataframe, regressor, removed_feature=None):
    
    #Define lists for features
    catagorical_columns = ['Industry', 'County']
    numeric_columns = ['WorkingPopulation', 'Population', 'EarningsEuro', 'pctIndustryEmployement']
    
    print(target_feature)
    
    if removed_feature:
        print(f'Removed feature is:{removed_feature}')
    
    #We remove the target variable so that we can just give that list to the column tranformer
    numeric_columns.remove(target_feature)
    
    if removed_feature:
        try:
            numeric_columns.remove(removed_feature)
        except ValueError:
            pass
        
        try:
            catagorical_columns.remove(removed_feature)
        except ValueError:
            pass
    
    #Drop target value from the x dataframe
    X = dataframe.drop(columns=target_feature, axis=1)
    
    #Have the y dataframe only consist of the target value
    y = dataframe[[target_feature]]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=13, shuffle=True)

    #Define how the data should be Standardised/Encoded
    ct = ColumnTransformer([('Standard', StandardScaler(), list(numeric_columns)),
                       ('OneHotEncoder', OneHotEncoder(), list(catagorical_columns))])
  
    #Define the model pipeline
    pipe = Pipeline([('transformer', ct), ('model', regressor)])

    pipe.fit(X_train, y_train)

    
    #Cross validate the model
    cv = KFold(n_splits=5, shuffle=True) 
    
    #Calc the various cv metics 
    r2score = cross_val_score(pipe, X, y, cv=cv, scoring='r2')
    meansqerror = cross_val_score(pipe, X, y, cv=cv, scoring='neg_mean_squared_error')
    meansqerrorabsolute = cross_val_score(pipe, X, y, cv=cv, scoring='neg_root_mean_squared_error')
    meanabsolutepercent = cross_val_score(pipe, X, y, cv=cv, scoring='neg_mean_absolute_percentage_error')
    
    #Print the mean of the various cv metrics
    print('Mean KFold Cross Validation; R2Sscore:', round(np.mean(r2score*100), 7))
    print('Mean KFold Cross Validation; Mean Square Error:', round(np.mean(meansqerror*-1), 7))
    print('Mean KFold Cross Validation; Mean Square Error Absolute:', round(np.mean(meansqerrorabsolute*-1), 7))
    print('Mean KFold Cross Validation; Mean Square Error Absolute Percent:', round(np.mean(meanabsolutepercent*-1), 7))
    

    #Return the fitted pipeline so we can do analysis on the output
    return pipe

regressions_models = [LinearRegression(), DecisionTreeRegressor(), SVR()]
for index, model in enumerate(regressions_models):
    print(model)
    run_regression(target_feature='EarningsEuro', dataframe=regressions_df, regressor=model, removed_feature=None)

LinearRegression()
EarningsEuro
Mean KFold Cross Validation; R2Sscore: 99.4257993
Mean KFold Cross Validation; Mean Square Error: 281033.1146945
Mean KFold Cross Validation; Mean Square Error Absolute: 521.4074675
Mean KFold Cross Validation; Mean Square Error Absolute Percent: 0.0987934
DecisionTreeRegressor()
EarningsEuro
Mean KFold Cross Validation; R2Sscore: 100.0
Mean KFold Cross Validation; Mean Square Error: 0.0
Mean KFold Cross Validation; Mean Square Error Absolute: 0.0
Mean KFold Cross Validation; Mean Square Error Absolute Percent: 0.0
SVR()
EarningsEuro


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Mean KFold Cross Validation; R2Sscore: -5.2963897
Mean KFold Cross Validation; Mean Square Error: 51543541.8338347
Mean KFold Cross Validation; Mean Square Error Absolute: 7116.3024696
Mean KFold Cross Validation; Mean Square Error Absolute Percent: 0.6819616


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [10]:
for outer_index, model in enumerate(regressions_models):
    print(f'Model type is {model}')
    for index, feature in enumerate(list(regressions_df.columns)):
        
        if feature != 'EarningsEuro':
            run_regression(target_feature='EarningsEuro', dataframe=regressions_df, regressor=model, removed_feature=feature)

Model type is LinearRegression()
EarningsEuro
Removed feature is:Industry
Mean KFold Cross Validation; R2Sscore: 99.4156336
Mean KFold Cross Validation; Mean Square Error: 277816.4995743
Mean KFold Cross Validation; Mean Square Error Absolute: 521.1938799
Mean KFold Cross Validation; Mean Square Error Absolute Percent: 0.099465
EarningsEuro
Removed feature is:County
Mean KFold Cross Validation; R2Sscore: 97.9031226
Mean KFold Cross Validation; Mean Square Error: 1054068.1435863
Mean KFold Cross Validation; Mean Square Error Absolute: 1030.4319144
Mean KFold Cross Validation; Mean Square Error Absolute Percent: 0.1947332
EarningsEuro
Removed feature is:WorkingPopulation
Mean KFold Cross Validation; R2Sscore: 99.4166359
Mean KFold Cross Validation; Mean Square Error: 278520.706195
Mean KFold Cross Validation; Mean Square Error Absolute: 518.4267483
Mean KFold Cross Validation; Mean Square Error Absolute Percent: 0.0992193
EarningsEuro
Removed feature is:Population
Mean KFold Cross Valida

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Mean KFold Cross Validation; R2Sscore: -5.2971028
Mean KFold Cross Validation; Mean Square Error: 51351807.5601116
Mean KFold Cross Validation; Mean Square Error Absolute: 7056.994568
Mean KFold Cross Validation; Mean Square Error Absolute Percent: 0.6681466
EarningsEuro
Removed feature is:County


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Mean KFold Cross Validation; R2Sscore: -5.2077581
Mean KFold Cross Validation; Mean Square Error: 51579261.2936039
Mean KFold Cross Validation; Mean Square Error Absolute: 7078.0913289
Mean KFold Cross Validation; Mean Square Error Absolute Percent: 0.6771586
EarningsEuro
Removed feature is:WorkingPopulation


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Mean KFold Cross Validation; R2Sscore: -5.5611398
Mean KFold Cross Validation; Mean Square Error: 51583544.2691734
Mean KFold Cross Validation; Mean Square Error Absolute: 7056.0862639
Mean KFold Cross Validation; Mean Square Error Absolute Percent: 0.679714
EarningsEuro
Removed feature is:Population


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Mean KFold Cross Validation; R2Sscore: -6.0533527
Mean KFold Cross Validation; Mean Square Error: 51824460.5352209
Mean KFold Cross Validation; Mean Square Error Absolute: 7187.5851423
Mean KFold Cross Validation; Mean Square Error Absolute Percent: 0.6776886
EarningsEuro
Removed feature is:pctIndustryEmployement


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Mean KFold Cross Validation; R2Sscore: -5.7025651
Mean KFold Cross Validation; Mean Square Error: 51590060.1967454
Mean KFold Cross Validation; Mean Square Error Absolute: 7051.7289088
Mean KFold Cross Validation; Mean Square Error Absolute Percent: 0.6706964
