In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV

In [2]:
joined_df = pd.read_csv('data/joined_data.csv')
joined_without_state_df = pd.read_csv('data/joined_data(without_state).csv')

industry_list = list(joined_df['Industry'].unique())
county_list = list(joined_df['County'].unique())
joined_df

Unnamed: 0,Year,Industry,County,WorkingPopulation,Population,EarningsEuro,pctIndustryEmployement
0,2002,"Agriculture, forestry and fishing",State,97281.0,3917203.0,80847.45,2.483430
1,2006,"Agriculture, forestry and fishing",State,89277.0,4239848.0,112227.39,2.105665
2,2011,"Agriculture, forestry and fishing",State,94247.0,4588252.0,114239.67,2.054094
3,2016,"Agriculture, forestry and fishing",State,89116.0,4761865.0,132583.20,1.871452
4,2002,"Agriculture, forestry and fishing",Carlow,1581.0,46014.0,809.06,3.435911
...,...,...,...,...,...,...,...
1723,2016,All industries,Donegal,58353.0,159192.0,3230.88,36.655736
1724,2002,All industries,Monaghan,21101.0,52593.0,914.77,40.121309
1725,2006,All industries,Monaghan,25378.0,55997.0,1296.83,45.320285
1726,2011,All industries,Monaghan,23005.0,60483.0,1274.55,38.035481


#Avail of a StandardScaler to get the numeric variables into the correct form for regressions analysis

In [4]:

def run_linear_regression(target_variable, dataframe):
    
    print(target_variable)
    
    catagorical_columns = ['Industry', 'County']
    numeric_columns = ['WorkingPopulation', 'Population', 'EarningsEuro', 'pctIndustryEmployement']
    
    #We remove the target variable so that we can just give that list to the column tranformer
    numeric_columns.remove(target_variable)
    
    #Drop target value from the x dataframe
    X = dataframe.drop(columns=target_variable, axis=1)
    
    #Have the y dataframe only consist of the target value
    y = dataframe[[target_variable]]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.4, random_state=13, shuffle=True)

    #Define how the data should be Standardised/Encoded
    ct = ColumnTransformer([('Standard', StandardScaler(), numeric_columns),
                       ('OneHotEncoder', OneHotEncoder(), list(catagorical_columns))])
  
    #Define the model pipeline
    pipe = Pipeline([('transformer', ct), ('model', LinearRegression())])

    pipe.fit(X_train, y_train)
    
    #Cross validate the model
    cv = KFold(n_splits=5, shuffle=True) 
    
    #Calc the various cv metics 
    r2score = cross_val_score(pipe, X, y, cv=cv, scoring='r2')
    meansqerror = cross_val_score(pipe, X, y, cv=cv, scoring='neg_mean_squared_error')
    meansqerrorabsolute = cross_val_score(pipe, X, y, cv=cv, scoring='neg_root_mean_squared_error')
    meanabsolutepercent = cross_val_score(pipe, X, y, cv=cv, scoring='neg_mean_absolute_percentage_error')
    
    #Print the mean of the various cv metrics
    print('Mean KFold Cross Validation; R2Sscore:', np.mean(r2score*100))
    print('Mean KFold Cross Validation; Correlation:', np.mean(np.sqrt(r2score)*100))
    print('Mean KFold Cross Validation; Mean Square Error:', np.mean(meansqerror*-1))
    print('Mean KFold Cross Validation; Mean Square Error Absolute:', np.mean(meansqerrorabsolute*-1))
    print('Mean KFold Cross Validation; Mean Square Error Absolute Percent:', np.mean(meanabsolutepercent*-1))

regressions_df = joined_without_state_df.drop(columns='Year')

run_linear_regression('EarningsEuro', regressions_df)

EarningsEuro
Mean KFold Cross Validation; R2Sscore: 99.3884948258596
Mean KFold Cross Validation; Correlation: 99.69375921006728
Mean KFold Cross Validation; Mean Square Error: 284404.6073630514
Mean KFold Cross Validation; Mean Square Error Absolute: 513.8450759574355
Mean KFold Cross Validation; Mean Square Error Absolute Percent: 0.10021534113071748
