In [38]:
import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

In [41]:
# Load the joined dataset from CSV.
joined_df = pd.read_csv('data/joined_data.csv')

# Distinct industries and counties, each in order of first appearance.
industry_list, county_list = (
    list(joined_df[column].unique()) for column in ('Industry', 'County')
)

# Keep the frame as the cell's last expression so the notebook renders it.
joined_df

Unnamed: 0,CensusYear,Industry,County,WorkingPopulation,Population,MeanIncome,pctIndustryEmployement
0,2011,"Agriculture, forestry and fishing",Carlow,1397.0,54612.0,34815.0,2.558046
1,2016,"Agriculture, forestry and fishing",Carlow,1615.0,56932.0,35234.0,2.836717
2,2011,"Agriculture, forestry and fishing",Dublin,2949.0,1273069.0,44243.0,0.231645
3,2016,"Agriculture, forestry and fishing",Dublin,2010.0,1347359.0,46976.0,0.149181
4,2011,"Agriculture, forestry and fishing",Kildare,3028.0,210312.0,42084.0,1.439766
...,...,...,...,...,...,...,...
827,2016,All industries,Cavan,30509.0,76176.0,35327.0,40.050672
828,2011,All industries,Donegal,53277.0,161137.0,31776.0,33.063170
829,2016,All industries,Donegal,58353.0,159192.0,31683.0,36.655736
830,2011,All industries,Monaghan,23005.0,60483.0,32502.0,38.035481


#Use a StandardScaler to bring the numeric variables onto a common scale for regression analysis

In [50]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
from numpy import mean, absolute, sqrt
from sklearn.model_selection import KFold, cross_val_score


def run_regression(target_variable: str, dataframe: pd.DataFrame,
                   test_size: float = 0.4, random_state: int = 13) -> float:
    """Fit a linear regression for one numeric column and report its test R^2.

    The other numeric columns are standardised and the categorical columns are
    one-hot encoded inside a single pipeline, so both transformers are fitted
    on the training split only.

    Parameters
    ----------
    target_variable : str
        Column to predict. Must be one of 'WorkingPopulation', 'Population',
        'MeanIncome' or 'pctIndustryEmployement'.
    dataframe : pandas.DataFrame
        Frame holding those four numeric columns plus 'CensusYear',
        'Industry' and 'County'. Any other column is ignored by the model.
    test_size : float, default 0.4
        Fraction of rows held out for scoring.
    random_state : int, default 13
        Seed for the shuffled train/test split.

    Returns
    -------
    float
        R^2 of the fitted pipeline on the held-out rows (also printed).

    Raises
    ------
    ValueError
        If ``target_variable`` is not one of the numeric columns.
    """
    categorical_columns = ['CensusYear', 'Industry', 'County']
    numeric_columns = ['WorkingPopulation', 'Population', 'MeanIncome', 'pctIndustryEmployement']

    # Fail early with a readable message instead of the bare
    # "list.remove(x): x not in list" that list.remove would produce.
    if target_variable not in numeric_columns:
        raise ValueError(
            f"target_variable must be one of {numeric_columns}, got {target_variable!r}"
        )

    # Only the remaining numeric columns are scaled and used as predictors.
    numeric_features = [name for name in numeric_columns if name != target_variable]

    # Predictors: everything except the target. Columns not named in the
    # ColumnTransformer below are dropped by its default remainder setting.
    X = dataframe.drop(columns=[target_variable])

    # 1-D target, which is the shape scikit-learn regressors expect.
    y = dataframe[target_variable]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=True
    )

    ct = ColumnTransformer([
        ('Standard', StandardScaler(), numeric_features),
        # handle_unknown='ignore': a category that ends up only in the test
        # split is encoded as all zeros instead of raising during scoring.
        ('OneHotEncoder', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ])

    pipe = Pipeline([('transformer', ct), ('model', LinearRegression())])
    pipe.fit(X_train, y_train)

    score = pipe.score(X_test, y_test)
    print(f"The score for the model targeting {target_variable} is {score}")
    return score
    

# NOTE(review): `regressions_df` is built here, but the loop below passes the
# unfiltered `joined_df` to run_regression. Confirm which frame is intended.
regressions_df = joined_df[joined_df['Industry']=='All industries'].reset_index(drop=True)

# Disabled per-industry variant, kept for reference:
# for outer_index, industry in enumerate(industry_list):
#     print(f'Filtering by {industry}')
#     regressions_df = joined_df[joined_df['Industry']==industry].reset_index(drop=True)

# Fit one model per numeric column, treating that column as the target each time.
for target_variable in ['WorkingPopulation', 'Population', 'MeanIncome', 'pctIndustryEmployement']:
    run_regression(target_variable, joined_df)

The score for the model targeting WorkingPopulation is 0.40455896353069043
The score for the model targeting Population is 0.9995469542066205
The score for the model targeting MeanIncome is 0.9954502101322321
The score for the model targeting pctIndustryEmployement is 0.9915416946171871
