### Team : Uganda Bureau of Statistics (UBOS)
*** Ivan Atwiine *** Edgar Niyimpa *** Bob Tumushiime *** Bonita Natamba *** Alban Manishimwe

#### A Geospartial, AI and Data Driven Solution for Equitable Education through Bridging the Teacher to Pupil Gap in Uganda

**Abstract**: The goal of this project  is to determine the nearest education facilities for a given population. It will also leverage on usage of Artificial Intelligence to help policy makers in Uganda determine where to set up new schools or how to improve accessibility to education services.

In [59]:
#%pip install geopandas pygeos

In [60]:
#pip install scikit-learn

##### Let's install the required libraries that we are going to work with

In [61]:
import pandas as pd
import urllib.request
import geopandas as gpd
import zipfile

In [62]:
# We need to load the boundaries data for Uganda for us to be able to map schools data onto the map.
uganda_boundaries = gpd.read_file('uganda_boundaries_dataset/DISTRICTS_2018_UTM_36N.shp')

In [63]:
uganda_boundaries['geometry'] = uganda_boundaries['geometry'].centroid
uganda_boundaries['Longitude'] = uganda_boundaries['geometry'].x
uganda_boundaries['Latitude'] = uganda_boundaries['geometry'].y

In [64]:
uganda_boundaries.head()

Unnamed: 0,F15Regions,DName2016,DNama2017,dc2017,Male,Female,PopnRtn,TotalPopn,Popn,DName2018,geometry,Longitude,Latitude
0,SOUTH BUGANDA,MASAKA,MASAKA,105,145552,151452,96,297004,0,MASAKA,POINT (372569.060 -54007.263),372569.060097,-54007.263475
1,BUKEDI,PALLISA,BUTEBO,233,70352,74619,94,144971,0,BUTEBO,POINT (604311.840 132523.890),604311.840104,132523.889881
2,LANGO,ALEBTONG,ALEBTONG,323,110989,116552,95,227541,0,ALEBTONG,POINT (528350.940 250840.912),528350.940289,250840.911509
3,TESO,BUKEDEA,BUKEDEA,219,98684,104916,94,203600,0,BUKEDEA,POINT (624997.884 150903.413),624997.884423,150903.412822
4,BUKEDI,BUSIA,BUSIA,202,156447,167215,94,323662,0,BUSIA,POINT (611790.200 45829.827),611790.200208,45829.826598


In [65]:
uganda_boundaries.columns

Index(['F15Regions', 'DName2016', 'DNama2017', 'dc2017', 'Male', 'Female',
       'PopnRtn', 'TotalPopn', 'Popn', 'DName2018', 'geometry', 'Longitude',
       'Latitude'],
      dtype='object')

In [66]:
# lets drop some columns that may not be of interest to us:

uganda_boundaries.drop(['DNama2017', 'dc2017','Male','Female','PopnRtn','TotalPopn','Popn','DName2018','geometry'], axis=1, inplace=True)

In [67]:
# we need the regions and and districts and geometry
uganda_boundaries.head()

Unnamed: 0,F15Regions,DName2016,Longitude,Latitude
0,SOUTH BUGANDA,MASAKA,372569.060097,-54007.263475
1,BUKEDI,PALLISA,604311.840104,132523.889881
2,LANGO,ALEBTONG,528350.940289,250840.911509
3,TESO,BUKEDEA,624997.884423,150903.412822
4,BUKEDI,BUSIA,611790.200208,45829.826598


In [68]:
# lets rename the columns for the dataframe
uganda_boundaries.rename(columns={'F15Regions': 'Regions', 'DName2016': 'District'}, inplace=True)
uganda_boundaries.head()

Unnamed: 0,Regions,District,Longitude,Latitude
0,SOUTH BUGANDA,MASAKA,372569.060097,-54007.263475
1,BUKEDI,PALLISA,604311.840104,132523.889881
2,LANGO,ALEBTONG,528350.940289,250840.911509
3,TESO,BUKEDEA,624997.884423,150903.412822
4,BUKEDI,BUSIA,611790.200208,45829.826598


In [69]:
# lets load the data of teacher and we do some manipulation on it
teachers_data = pd.read_csv('teacher_and_pupils/teachers-2010-2015.csv')

In [70]:
teachers_data.head()

Unnamed: 0,Year,District ID,Name of district,Title of respondent,Type of school,Number of Male teachers,Number of female teachers,Number of teacher's who provide special needs education,Total Number of Teachers,Number of Absent Teachers
0,2010,105,Masaka,HEADTEACHER,Day,5.0,14.0,0.0,19.0,4.0
1,2010,105,Masaka,KIWEESA ANTHONY,Day,5.0,3.0,0.0,8.0,2.0
2,2010,105,Masaka,TEACHER,Day,8.0,3.0,0.0,11.0,8.0
3,2010,105,Masaka,SENIOR EDUC.ASSISTANT,Day,2.0,10.0,1.0,13.0,3.0
4,2010,105,Masaka,Deputy headteacher,Both,5.0,13.0,2.0,20.0,0.0


In [71]:
#lets drop the variables that do not need for our analysis
teachers_data.drop(['District ID','Title of respondent'], axis=1, inplace=True)

In [72]:
teachers_data.head()

Unnamed: 0,Year,Name of district,Type of school,Number of Male teachers,Number of female teachers,Number of teacher's who provide special needs education,Total Number of Teachers,Number of Absent Teachers
0,2010,Masaka,Day,5.0,14.0,0.0,19.0,4.0
1,2010,Masaka,Day,5.0,3.0,0.0,8.0,2.0
2,2010,Masaka,Day,8.0,3.0,0.0,11.0,8.0
3,2010,Masaka,Day,2.0,10.0,1.0,13.0,3.0
4,2010,Masaka,Both,5.0,13.0,2.0,20.0,0.0


In [73]:
#lets rename the District column since we shall be using it to join the datasets
teachers_data.rename(columns={'Name of district': 'District'}, inplace=True)

In [74]:
teachers_data['District']

0        Masaka
1        Masaka
2        Masaka
3        Masaka
4        Masaka
          ...  
11697    SHEEMA
11698    SHEEMA
11699    SHEEMA
11700    SHEEMA
11701    SHEEMA
Name: District, Length: 11702, dtype: object

In [75]:
# lets now load the pupils dataset
pupils_data = pd.read_csv('teacher_and_pupils/pupil-enrollment-2010-2015.csv')

In [76]:
pupils_data.head()

Unnamed: 0,Year,District ID,Name of district,Title of respondent,Type of school,Total P1 Boys Enrollment,Total P1 Girls Enrollment,Total P2 Boys Enrollment,Total P2 Girls Enrollment,Total P3 Boys Enrollment,...,Total P4 Girls Enrollment,Total P5 Boys Enrollment,Total P5 Girls Enrollment,Total P6 Boys Enrollment,Total P6 Girls Enrollment,Total P7 Boys Enrollment,Total P7 Girls Enrollment,Total Boys Enrollment,Total Girls Enrollment,Total Enrollment
0,2010,105,Masaka,HEADTEACHER,Day,58.0,60.0,42.0,51.0,55.0,...,62.0,45.0,61.0,47.0,49.0,14.0,40.0,317.0,383.0,700.0
1,2010,105,Masaka,KIWEESA ANTHONY,Day,26.0,30.0,45.0,36.0,46.0,...,48.0,33.0,41.0,37.0,32.0,25.0,20.0,258.0,247.0,505.0
2,2010,105,Masaka,TEACHER,Day,65.0,55.0,52.0,46.0,75.0,...,52.0,30.0,55.0,18.0,30.0,14.0,35.0,294.0,323.0,617.0
3,2010,105,Masaka,SENIOR EDUC.ASSISTANT,Day,62.0,76.0,61.0,48.0,48.0,...,52.0,47.0,50.0,40.0,45.0,22.0,25.0,332.0,350.0,682.0
4,2010,105,Masaka,Deputy headteacher,Both,125.0,116.0,52.0,55.0,49.0,...,56.0,44.0,52.0,40.0,50.0,36.0,43.0,412.0,425.0,827.0


In [77]:
pupils_data.columns

Index(['Year', 'District ID', 'Name of district', 'Title of respondent',
       'Type of school', 'Total P1 Boys Enrollment',
       'Total P1 Girls Enrollment', 'Total P2 Boys Enrollment',
       'Total P2 Girls Enrollment', 'Total P3 Boys Enrollment',
       'Total P3 Girls Enrollment', 'Total P4 Boys Enrollment',
       'Total P4 Girls Enrollment', 'Total P5 Boys Enrollment',
       'Total P5 Girls Enrollment', 'Total P6 Boys Enrollment',
       'Total P6 Girls Enrollment', 'Total P7 Boys Enrollment',
       'Total P7 Girls Enrollment', 'Total Boys Enrollment',
       'Total Girls Enrollment', 'Total Enrollment'],
      dtype='object')

In [78]:
# we need to drop the columns that we may not need to use
pupils_data.drop(['District ID', 'Title of respondent',
       'Type of school', 'Total P1 Boys Enrollment',
       'Total P1 Girls Enrollment', 'Total P2 Boys Enrollment',
       'Total P2 Girls Enrollment', 'Total P3 Boys Enrollment',
       'Total P3 Girls Enrollment', 'Total P4 Boys Enrollment',
       'Total P4 Girls Enrollment', 'Total P5 Boys Enrollment',
       'Total P5 Girls Enrollment', 'Total P6 Boys Enrollment',
       'Total P6 Girls Enrollment', 'Total P7 Boys Enrollment',
       'Total P7 Girls Enrollment',], axis=1, inplace=True)


In [79]:
# lets rename the dictrict name for our data
pupils_data.rename(columns={'Name of district': 'District'}, inplace=True)

In [80]:
pupils_data.columns

Index(['Year', 'District', 'Total Boys Enrollment', 'Total Girls Enrollment',
       'Total Enrollment'],
      dtype='object')

In [81]:
# let merge the schools and pupils dataset unto the boundaries

In [82]:
dataset = pd.merge(pupils_data,teachers_data, how="inner", on=['District','Year'])

In [83]:
final_dataset = uganda_boundaries.merge(dataset, on='District')

In [84]:
#final_dataset.to_csv('final_dataset.csv')

In [85]:
final_dataset.columns

Index(['Regions', 'District', 'Longitude', 'Latitude', 'Year',
       'Total Boys Enrollment', 'Total Girls Enrollment', 'Total Enrollment',
       'Type of school', ' Number of Male teachers',
       'Number of female teachers',
       ' Number of teacher's who provide special needs education',
       'Total Number of Teachers', 'Number of Absent Teachers'],
      dtype='object')

In [86]:
# lets deal with null values in our dataset
final_dataset.columns

Index(['Regions', 'District', 'Longitude', 'Latitude', 'Year',
       'Total Boys Enrollment', 'Total Girls Enrollment', 'Total Enrollment',
       'Type of school', ' Number of Male teachers',
       'Number of female teachers',
       ' Number of teacher's who provide special needs education',
       'Total Number of Teachers', 'Number of Absent Teachers'],
      dtype='object')

In [87]:
# we shall fill the missing values of the type of school with mode
final_dataset['Type of school'].mode()


0    Mixed day
Name: Type of school, dtype: object

In [88]:
# fill the missing values with mode
final_dataset['Type of school'].fillna('Mixed day', inplace=True)

In [89]:
 final_dataset[' Number of Male teachers'].mean()

7.725836554820307

In [90]:
# lets fill the null value with mean
final_dataset[' Number of Male teachers'].fillna(8, inplace=True)


In [91]:
 final_dataset['Number of female teachers'].mean()

5.728683813939341

In [92]:
final_dataset['Number of female teachers'].fillna(6, inplace=True)

In [93]:
 final_dataset[" Number of teacher's who provide special needs education"].mean()

0.7442544176595206

In [94]:
final_dataset[" Number of teacher's who provide special needs education"].fillna(1, inplace = True)

In [95]:
 final_dataset[" Number of teacher's who provide special needs education"].mean()

0.8137194477310455

In [96]:
final_dataset.isnull().sum()

Regions                                                          0
District                                                         0
Longitude                                                        0
Latitude                                                         0
Year                                                             0
Total Boys Enrollment                                            0
Total Girls Enrollment                                           0
Total Enrollment                                                 0
Type of school                                                   0
 Number of Male teachers                                         0
Number of female teachers                                        0
 Number of teacher's who provide special needs education         0
Total Number of Teachers                                      4473
Number of Absent Teachers                                   222283
dtype: int64

In [97]:
 final_dataset["Total Number of Teachers"].mean()

13.429502777650246

In [98]:
final_dataset["Total Number of Teachers"].fillna(1, inplace = True)

In [99]:
 final_dataset["Number of Absent Teachers"].fillna(0, inplace=True)

In [100]:
 # our dataset now has no null values
 final_dataset.isnull().sum()

Regions                                                     0
District                                                    0
Longitude                                                   0
Latitude                                                    0
Year                                                        0
Total Boys Enrollment                                       0
Total Girls Enrollment                                      0
Total Enrollment                                            0
Type of school                                              0
 Number of Male teachers                                    0
Number of female teachers                                   0
 Number of teacher's who provide special needs education    0
Total Number of Teachers                                    0
Number of Absent Teachers                                   0
dtype: int64

#### Model Generation

In [101]:
# We begin by importing necessary libraries for the model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt



In [102]:
le = LabelEncoder()
final_dataset['Regions'] = le.fit_transform(final_dataset['Regions'])
final_dataset['District'] = le.fit_transform(final_dataset['District'])
final_dataset['Type of school'] = le.fit_transform(final_dataset['Type of school'])

In [103]:
X = final_dataset[['District','Regions','Total Enrollment', 'Type of school']]
y = final_dataset['Total Number of Teachers']

In [104]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [105]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [106]:
model = RandomForestRegressor(n_estimators=100, random_state=42)

In [107]:
model.fit(X_train, y_train)

In [108]:
y_pred = model.predict(X_test)


In [109]:
mse = mean_squared_error(y_test, y_pred)

In [110]:
rmse = sqrt(mse)




In [111]:
r2 = r2_score(y_test, y_pred)

In [112]:
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R-squared: {r2}')

MSE: 32.76727793162045
RMSE: 5.724270951974622
R-squared: 0.2509569201059709


### After Evaluation of Linear and Random forest models:
### Linear Regression perfoms better

In [113]:
# Lets save the model, so that we use it later
import joblib # We use joblib to save the model

joblib.dump(model, 'model.pkl')

['model.pkl']

In [114]:
model = joblib.load('model.pkl')

In [115]:
model_columns = list(X_train.columns)
joblib.dump(model_columns, 'model_columns.pkl')
print("Models columns dumped!")

Models columns dumped!


In [116]:
X_train.columns

Index(['District', 'Regions', 'Total Enrollment', 'Type of school'], dtype='object')