In [1]:
import pandas as pd
import numpy as np

import os
import random

from numpy.random import seed

from keras.layers.experimental import preprocessing
from keras.models import Sequential
from keras.layers import Dense
from keras import backend as K

from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import tensorflow as tf

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

In [39]:
# Directory the cleaned input CSV is read from.
OUTPUT_PATH = './Data/output/'
# Directory the model prediction CSVs are written to.
# NOTE(review): this path uses 'Output' while OUTPUT_PATH uses 'output'; on a
# case-sensitive filesystem these are two different directories - confirm intent.
MODEL_OUTPUT_PATH = './Data/Output/Statistical Model Output/'

### Import dataset

In [3]:
# Load the cleaned barangay table and append the total school count
# (private + public + SUC) as a new column.
dataset_raw = (
    pd.read_csv(OUTPUT_PATH + 'CleanData_Population_Schools_SDGs.csv')
    .assign(
        No_of_Schools=lambda df: (
            df['SCH_TYPE_PRIVATE'] + df['SCH_TYPE_PUBLIC'] + df['SCH_TYPE_SUC']
        )
    )
)
len(dataset_raw)

2482

In [4]:
# Keep only rows without any missing value. reset_index(drop=False) stores the
# previous row labels as a new first column, so every original column sits one
# position further right in `dataset` than in `dataset_raw`.
dataset = dataset_raw.dropna(axis=0, how='any').reset_index(drop=False)
len(dataset)

1756

### Preprocess data

In [5]:
# Columns from position 85 onward are used as the model features.
# Take an explicit copy: adding a column to a bare iloc slice of `dataset`
# raises SettingWithCopyWarning.
SDG = dataset.iloc[:, 85:].copy()
# Carry the barangay code along so predictions can be matched back to a barangay.
SDG['PSGC_BRGY'] = dataset['PSGC_BRGY']

#### 1. Elementary

In [6]:
# Elementary-level frame: the shared features plus the elementary school count
# (target) and the population aged 6 to 12.
data_elem = SDG.assign(
    NUM_SCHOOLS=(
        dataset['SCH_CAT_CES']
        + dataset['SCH_CAT_ES']
        + dataset['SCH_CAT_PS']
        + dataset['SCH_CAT_PES']
    ),
    ELEM_POPN=dataset['both_age_6_to_12'],
)
len(data_elem)

1756

In [7]:
# Split off the barangays with no elementary school (they are predicted later)
# and keep the rest, without the barangay code, as the training frame.
no_elem_school = data_elem['NUM_SCHOOLS'] == 0
zero_schools_elem = data_elem[no_elem_school]
data_elem = data_elem[~no_elem_school].drop(columns=['PSGC_BRGY'])
len(data_elem)

1703

#### 2. High School

In [8]:
# High-school-level frame: the shared features plus the high school count
# (target) and the population aged 13 to 16.
data_hs = SDG.assign(
    NUM_SCHOOLS=(
        dataset['SCH_CAT_JHS']
        + dataset['SCH_CAT_SHS']
        + dataset['SCH_CAT_PJHS']
        + dataset['SCH_CAT_PSHS']
    ),
    HS_POPN=dataset['both_age_13_to_16'],
)
len(data_hs)

1756

In [9]:
# Split off the barangays with no high school (they are predicted later)
# and keep the rest, without the barangay code, as the training frame.
no_high_school = data_hs['NUM_SCHOOLS'] == 0
zero_schools_hs = data_hs[no_high_school]
data_hs = data_hs[~no_high_school].drop(columns=['PSGC_BRGY'])
len(data_hs)

366

### Modelling

In [27]:
def rfr_model(data, feature):
    """Tune, fit and evaluate a random-forest regressor for ``NUM_SCHOOLS``.

    The frame is split 80/20 with a fixed seed, ``max_depth`` and
    ``n_estimators`` are selected by 5-fold grid search on the training part
    (scored by negative mean squared error), and the winning forest, refit on
    the whole training part, is scored on the held-out part.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the target column ``NUM_SCHOOLS``; every other column is
        treated as a candidate predictor.
    feature : str
        ``'all'`` to train on every predictor column, otherwise the name of
        the single column to train on.

    Returns
    -------
    sklearn.ensemble.RandomForestRegressor
        The fitted forest with the selected hyper-parameters.

    Raises
    ------
    KeyError
        If ``NUM_SCHOOLS`` or the requested ``feature`` is not a column of
        ``data``.

    Notes
    -----
    Prints ``Model accuracy: <value>``, the share of held-out rows whose
    rounded prediction equals the true school count exactly.
    """
    target = 'NUM_SCHOOLS'
    # The original passed random_state=False; False is the integer 0 in Python,
    # so an explicit 0 keeps the same seed while being readable.
    forest_seed = 0

    training_data, testing_data = train_test_split(data, test_size=0.2, random_state=42)
    X_train = training_data.drop(columns=[target])
    y_train = training_data[target]
    X_test = testing_data.drop(columns=[target])
    y_test = testing_data[target]

    if feature != 'all':
        X_train = X_train[[feature]]
        X_test = X_test[[feature]]

    # Seed the estimator used inside the search as well: an unseeded forest
    # makes the cross-validation scores, and therefore the selected
    # hyper-parameters, change from run to run.
    gsc = GridSearchCV(
        estimator=RandomForestRegressor(random_state=forest_seed),
        param_grid={
            'max_depth': range(3, 7),
            'n_estimators': (10, 50, 100, 1000),
        },
        cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)
    gsc.fit(X_train, y_train)

    # GridSearchCV (refit=True by default) has already refit the best
    # configuration on all of X_train, so a second manual fit is unnecessary.
    rfr = gsc.best_estimator_

    y_pred_round = np.round(rfr.predict(X_test))
    # accuracy_score expects (y_true, y_pred).
    acc = accuracy_score(y_test, y_pred_round)
    print('Model accuracy:', acc)

    return rfr

In [28]:
def predict(model, X_predict):
    """Return ``model.predict(X_predict)`` rounded with ``np.round``."""
    return np.round(model.predict(X_predict))

In [29]:
# Raw rows that have at least one missing value, i.e. the rows that dropna()
# removed when building the training data.
has_missing_value = dataset_raw.isna().any(axis=1)
dataset_null = dataset_raw[has_missing_value]
len(dataset_null)

726

In [36]:
def save_prediction(model, level, feature):
    """Predict school counts for barangays that have no usable training label.

    Two groups are scored with ``model`` and stacked into one frame: the rows
    of ``dataset_null`` (raw rows with at least one missing value) and the rows
    of ``zero_schools_elem`` / ``zero_schools_hs`` (rows whose ``NUM_SCHOOLS``
    is 0 for the chosen level).

    Despite the name, nothing is written to disk here; the caller saves the
    returned frame.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    level : str
        ``'elem'`` or ``'hs'``; selects the population column and the
        zero-school frame.
    feature : str
        ``'all'`` to predict from every feature column, otherwise the single
        column name to predict from. Should match what ``model`` was fitted on.

    Returns
    -------
    pandas.DataFrame
        The predictor column(s) plus ``Pred_school_num`` (rounded prediction)
        and ``PSGC_BRGY``, for both groups concatenated.

    Raises
    ------
    ValueError
        If ``level`` is neither ``'elem'`` nor ``'hs'``.
    """
    if level == 'elem':
        popn_column, age_column = 'ELEM_POPN', 'both_age_6_to_12'
        _zero_schools = zero_schools_elem.copy()
    elif level == 'hs':
        popn_column, age_column = 'HS_POPN', 'both_age_13_to_16'
        _zero_schools = zero_schools_hs.copy()
    else:
        # Previously an unknown level surfaced later as an UnboundLocalError.
        raise ValueError("level must be 'elem' or 'hs', got %r" % (level,))

    # `dataset` was built with reset_index(), which adds one leading column, so
    # position 84 in the raw frame lines up with position 85 used for training.
    # Copy the slice so the column assignments below act on an independent frame.
    X_predict_1 = dataset_null.iloc[:, 84:].copy()
    X_predict_1[popn_column] = dataset_null[age_column]

    X_predict_2 = _zero_schools.drop(['NUM_SCHOOLS', 'PSGC_BRGY'], axis=1)

    if feature != 'all':
        X_predict_1 = X_predict_1[[feature]].copy()
        X_predict_2 = X_predict_2[[feature]].copy()

    # The right-hand side is evaluated first, so the model only sees the
    # predictor columns, not the prediction column being added.
    X_predict_1['Pred_school_num'] = predict(model, X_predict_1)
    X_predict_2['Pred_school_num'] = predict(model, X_predict_2)

    X_predict_1['PSGC_BRGY'] = dataset_null['PSGC_BRGY']
    X_predict_2['PSGC_BRGY'] = _zero_schools['PSGC_BRGY']

    predicted_df = pd.concat([X_predict_1, X_predict_2])

    print('Number of barangays predicted:', len(predicted_df))
    return predicted_df

In [38]:
# Elementary level: fit on the elementary-age population only, predict for the
# unlabelled barangays, and export the predictions.
elem_feature = 'ELEM_POPN'
elem_model = rfr_model(data_elem, elem_feature)
elem_df_pop = save_prediction(elem_model, 'elem', elem_feature)
elem_csv_path = MODEL_OUTPUT_PATH + 'Model_RF_Output_Elem_pop.csv'
elem_df_pop.to_csv(elem_csv_path, index=False)

Model accuracy: 0.7976539589442815
Number of barangays predicted: 779


In [37]:
# High-school level: fit on the high-school-age population only, predict for
# the unlabelled barangays, and export the predictions.
hs_feature = 'HS_POPN'
hs_model = rfr_model(data_hs, hs_feature)
hs_df_pop = save_prediction(hs_model, 'hs', hs_feature)
hs_csv_path = MODEL_OUTPUT_PATH + 'Model_RF_Output_HS_pop.csv'
hs_df_pop.to_csv(hs_csv_path, index=False)

Model accuracy: 0.8513513513513513
Number of barangays predicted: 2116
