In [8]:
from sklearn import tree
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_percentage_error

error = list()
feature_importance = list()

In [9]:
csvfiles = ["emotional_abs_merged.csv", "social_abs_merged.csv", "health_abs_merged.csv", "language_abs_merged.csv", "communication_abs_merged.csv"]

for file in csvfiles:
    df = pd.read_csv(os.path.join("merged", file))
    # Prepare data
    sample_df = df.drop(columns = ["Unnamed: 0", "SA2_MAINCODE_2016", "name", "SA2_name", 
                              "total", "at_risk", "on_track", "vulnerable_percentage",
                              "Index of Relative Socio-economic Disadvantage_decile",
                              "Index of Relative Socio-economic Adv and Disadv_decile",
                              "Index of Economic Resources_decile",
                              "Index of Education and Occupation_decile"
                               ])
    target = sample_df["vulnerable"]
    data = sample_df.drop("vulnerable", axis=1)
    feature_names = data.columns
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.20, random_state=42)
    
    #Scale data
    X_train_df = X_train.drop(columns=["code"])
    X_test_df = X_test.drop(columns=["code"])

    X_scaler = StandardScaler().fit(X_train_df)

    X_train_scaled = X_scaler.transform(X_train_df)
    X_test_scaled = X_scaler.transform(X_test_df)
    
    # Create model
    rf = RandomForestRegressor(n_estimators=100)
    rf = rf.fit(X_train_scaled, y_train)
    
    predict_values = rf.predict(X_test_scaled)
    
    output_df = pd.DataFrame(y_test)
    output_df["code"] = X_test["code"]
    output_df["predicted"] = predict_values
    output_df["percent_vulnerable"] = (output_df["vulnerable"] / X_test["Usual Resident Population"]) * 100
    
    output_df.to_csv("predictions/" + file[:-4] + "_predictions.csv", index=False)
    
    # Error
    error.append([mean_absolute_percentage_error(y_test, predict_values)])
    
    # Feature Importance
    feature_importance.append([sorted(zip(rf.feature_importances_, feature_names), reverse=True)])    

In [10]:
file = "oneormore_abs_merged.csv"

df = pd.read_csv(os.path.join("merged", file))
# Prepare data
sample_df = df.drop(columns = ["Unnamed: 0", "SA2_MAINCODE_2016", "name", "SA2_name", 
                          "total", "vulnerable_percentage",
                          "Index of Relative Socio-economic Disadvantage_decile",
                          "Index of Relative Socio-economic Adv and Disadv_decile",
                          "Index of Economic Resources_decile",
                          "Index of Education and Occupation_decile"
                           ])
target = sample_df["vulnerable"]
data = sample_df.drop("vulnerable", axis=1)
feature_names = data.columns

# Split data
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.20, random_state=42)

#Scale data
X_train_df = X_train.drop(columns=["code"])
X_test_df = X_test.drop(columns=["code"])

X_scaler = StandardScaler().fit(X_train_df)

X_train_scaled = X_scaler.transform(X_train_df)
X_test_scaled = X_scaler.transform(X_test_df)

# Create model
rf = RandomForestRegressor(n_estimators=100)
rf = rf.fit(X_train_scaled, y_train)

predict_values = rf.predict(X_test_scaled)

output_df = pd.DataFrame(y_test)
output_df["code"] = X_test["code"]
output_df["predicted"] = predict_values
output_df["percent_vulnerable"] = (output_df["vulnerable"] / X_test["Usual Resident Population"]) * 100

output_df.to_csv("predictions/" + file[:-4] + "_predictions.csv", index=False)

print(mean_absolute_percentage_error(y_test, predict_values))

# Error
error.append([mean_absolute_percentage_error(y_test, predict_values)])

# Feature Importance
feature_importance.append([sorted(zip(rf.feature_importances_, feature_names), reverse=True)])

0.2789863908749421


In [11]:
error_dict = dict(zip(csvfiles, error))
feature_importance_dict = dict(zip(csvfiles, feature_importance))

error_df = pd.DataFrame.from_dict(error_dict)
feature_importance_df = pd.DataFrame.from_dict(feature_importance_dict)

error_df.to_csv("predictions/ml_error.csv")
feature_importance_df.to_csv("predictions/ml_feature_importance.csv")

{'emotional_abs_merged.csv': [0.4808469558528847], 'social_abs_merged.csv': [0.4898734724390494], 'health_abs_merged.csv': [297447045156563.5], 'language_abs_merged.csv': [601178182816434.1], 'communication_abs_merged.csv': [1376739932599655.5]}
