In [None]:
# Code snippet 0
# Installing AI based ML package
!pip install -U --pre pycaret

In [None]:
# Code snippet 1
# Importing the basic Data Science packages
import numpy as np
import pandas as pd

In [None]:
# Code snippet 2
# Web location of the training data (raw CSV hosted on GitHub).
# The address is written as two adjacent string literals (repository root,
# then file name); Python joins them into a single string.
url = (
    'https://raw.githubusercontent.com/nvamsimohan/DallasDSA/main/'
    '2021%20Texas%20High%20Schools%20Scores.csv'
)

In [None]:
# Code snippet 3
# Read the CSV file located at `url` into the pandas DataFrame `df`,
# using read_csv's default parsing options.
df = pd.read_csv(url)

In [None]:
# Code snippet 4
# Display the size of the raw data as a (rows, columns) tuple
df.shape

(1821, 25)

In [None]:
# Code snippet 4.1
# Display the column names of the raw data, to decide which ones to keep
df.columns

Index(['School', 'School URL', 'District', 'District URL', 'City', 'City URL',
       'Zip', 'County', 'Phone', 'Is Title I', 'Is Charter', 'Is Magnet',
       'Is Virtual', 'Number Students', 'Number Full-time Teachers',
       'Student/Teacher Ratio', 'Percent Free/Disc Lunch',
       'Percent Homes Rented', 'Percent of  Unemployment',
       'Percent Parents with Masters and above education',
       'Percent parents with less than High School education',
       'Percent Home Incomes with less than Median income',
       'Home Prices Ranking', 'Community Health Ranking', 'Standard Score'],
      dtype='object')

In [None]:
# Code snippet 5
# Columns kept for modelling. The school name, the URL columns and the phone
# number are left out of the analysis.
# The % Food stamps and % crime fields are also left out because of their
# high proportion of missing data.
cols = [
    'District',
    'City',
    'Zip',
    'County',
    'Is Title I',
    'Is Charter',
    'Is Magnet',
    'Is Virtual',
    'Number Students',
    'Number Full-time Teachers',
    'Student/Teacher Ratio',
    'Percent Free/Disc Lunch',
    'Percent Homes Rented',
    'Percent of  Unemployment',
    'Percent Parents with Masters and above education',
    'Percent parents with less than High School education',
    'Percent Home Incomes with less than Median income',
    'Home Prices Ranking',
    'Community Health Ranking',
    'Standard Score',  # target column
]

# Restrict the raw frame to the selected columns and show what remains.
df1 = df[cols]
df1.columns

Index(['District', 'City', 'Zip', 'County', 'Is Title I', 'Is Charter',
       'Is Magnet', 'Is Virtual', 'Number Students',
       'Number Full-time Teachers', 'Student/Teacher Ratio',
       'Percent Free/Disc Lunch', 'Percent Homes Rented',
       'Percent of  Unemployment',
       'Percent Parents with Masters and above education',
       'Percent parents with less than High School education',
       'Percent Home Incomes with less than Median income',
       'Home Prices Ranking', 'Community Health Ranking', 'Standard Score'],
      dtype='object')

In [None]:
# Code snippet 6
# Importing all Regression ML libraries
from pycaret.regression import *

In [None]:
# Code snippet 7
# Names of the features to be treated as categorical, gathered in one list
cat_cols = [
    'District',
    'City',
    'Zip',
    'County',
    'Is Title I',
    'Is Charter',
    'Is Magnet',
    'Is Virtual',
]

In [None]:
# Code snippet 8
# Names of the features to be treated as numeric, gathered in one list
num_cols = [
    'Number Students',
    'Number Full-time Teachers',
    'Student/Teacher Ratio',
    'Percent Free/Disc Lunch',
    'Percent Homes Rented',
    'Percent of  Unemployment',
    'Percent Parents with Masters and above education',
    'Percent parents with less than High School education',
    'Percent Home Incomes with less than Median income',
    'Home Prices Ranking',
    'Community Health Ranking',
]

In [None]:
# Code snippet 9
# Set up the PyCaret regression experiment on df1 with 'Standard Score' as the
# target, passing the categorical and numerical feature lists explicitly.
# session_id fixes the experiment's random seed so that Restart & Run All
# produces the same results on every run; without it a new seed is drawn each
# time. 2960 is the random_state reported for the LGBMRegressor in this
# notebook's saved create_model log - presumably the seed of the original run
# (TODO confirm).
regression_model_setup = setup(
    df1,
    target='Standard Score',
    categorical_features=cat_cols,
    numeric_features=num_cols,
    session_id=2960,
)

In [None]:
# Code snippet 10
# Run the PyCaret AutoML model comparison, limited by a time budget.
# NOTE(review): PyCaret documents budget_time in minutes - confirm for the
# installed version. The return value is not stored here.
compare_models(budget_time = 3)

In [None]:
# Code snippet 11
# Create a LightGBM regression model ('lightgbm') with PyCaret and keep it
# as `model` for the prediction step.
# NOTE(review): the estimator id is hard-coded; it is "the best algorithm"
# only if the model comparison ranked LightGBM first - verify after re-running.
model = create_model('lightgbm')

INFO:logs:Initializing create_model()
INFO:logs:create_model(self=<pycaret.regression.oop.RegressionExperiment object at 0x7f2900262d10>, estimator=lightgbm, fold=None, round=4, cross_validation=True, predict=True, fit_kwargs=None, groups=None, refit=True, probability_threshold=None, experiment_custom_tags=None, verbose=True, system=True, add_to_model_list=True, metrics=None, display=None, model_only=True, return_train_score=False, kwargs={})
INFO:logs:Checking exceptions


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,9.2107,156.6041,12.5142,0.7679,0.5089,0.5757
1,9.3612,157.1181,12.5347,0.7356,0.5105,0.6365
2,7.8656,109.3776,10.4584,0.8139,0.4211,0.4152
3,9.9284,190.9528,13.8186,0.6882,0.4724,0.521
4,8.9097,134.868,11.6133,0.7815,0.3893,0.3504
5,11.1167,222.0023,14.8997,0.6681,0.4988,0.6144
6,10.4672,191.8099,13.8495,0.6721,0.5858,1.0126
7,9.4643,178.6248,13.3651,0.7184,0.4091,0.3822
8,10.7981,207.7324,14.4129,0.6485,0.488,0.6072
9,9.2482,187.6103,13.6971,0.6936,0.4242,0.2805


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

INFO:logs:Importing libraries
INFO:logs:Copying training dataset
INFO:logs:Defining folds
INFO:logs:Declaring metric variables
INFO:logs:Importing untrained model
INFO:logs:Light Gradient Boosting Machine Imported successfully
INFO:logs:Starting cross validation
INFO:logs:Cross validating with KFold(n_splits=10, random_state=None, shuffle=False), n_jobs=-1
INFO:logs:Calculating mean and std
INFO:logs:Creating metrics dataframe
INFO:logs:Finalizing model


INFO:logs:Uploading results into container
INFO:logs:Uploading model into container now
INFO:logs:master_model_container: 20
INFO:logs:display_container: 4
INFO:logs:LGBMRegressor(random_state=2960)
INFO:logs:create_model() successfully completed......................................


In [None]:
# Code snippet 12
# Fetch the client's dataset (raw CSV on GitHub) into the pred_data
# dataframe, then list its columns.
# The address is written as two adjacent string literals that Python joins.
url1 = (
    'https://raw.githubusercontent.com/nvamsimohan/DallasDSA/main/'
    '2021%20-%20Texas%20High%20Schools%20Client%20-%20Dataset.csv'
)
pred_data = pd.read_csv(url1)
pred_data.columns

Index(['School', 'District', 'City', 'Zip', 'County', 'Phone', 'Is Title I',
       'Is Charter', 'Is Magnet', 'Is Virtual', 'Number Students',
       'Number Full-time Teachers', 'Student/Teacher Ratio',
       'Percent Free/Disc Lunch', 'Percent Homes Rented',
       'Percent of  Unemployment',
       'Percent Parents with Masters and above education',
       'Percent parents with less than High School education',
       'Percent Home Incomes with less than Median income',
       'Home Prices Ranking', 'Community Health Ranking', 'Standard Score'],
      dtype='object')

In [None]:
# Code snippet 13
# Columns taken from the client dataset: every column it contains except the
# target 'Standard Score'. Besides the predictors this keeps 'School' and
# 'Phone', which are not among the columns selected for training.
cols1 = [
    'School',
    'District',
    'City',
    'Zip',
    'County',
    'Phone',
    'Is Title I',
    'Is Charter',
    'Is Magnet',
    'Is Virtual',
    'Number Students',
    'Number Full-time Teachers',
    'Student/Teacher Ratio',
    'Percent Free/Disc Lunch',
    'Percent Homes Rented',
    'Percent of  Unemployment',
    'Percent Parents with Masters and above education',
    'Percent parents with less than High School education',
    'Percent Home Incomes with less than Median income',
    'Home Prices Ranking',
    'Community Health Ranking',
]

# Client rows restricted to the columns listed above.
pred_data1 = pred_data[cols1]

In [None]:
# Code snippet 14
# Predict the Standard Scores for the client rows in pred_data1 using the
# trained `model`.
# NOTE(review): predict_model presumably returns the input rows with a
# prediction column appended - confirm the column name before delivery.
predictions = predict_model(model, data = pred_data1)

INFO:logs:Initializing predict_model()
INFO:logs:predict_model(self=<pycaret.regression.oop.RegressionExperiment object at 0x7f2900262d10>, estimator=LGBMRegressor(random_state=2960), probability_threshold=None, encoded_labels=False, raw_score=False, drift_report=False, round=4, verbose=True, ml_usecase=None, preprocess=True, replace_labels_in_column=<function _SupervisedExperiment.predict_model.<locals>.replace_labels_in_column at 0x7f29091cd5f0>)
INFO:logs:Checking exceptions
INFO:logs:Preloading libraries


INFO:logs:Set up data.


In [None]:
# Code snippet 15
# Write the predictions to a CSV file to send to the client.
# index=False leaves out the DataFrame's row index, which would otherwise be
# written as an extra unnamed first column in the client's file.
predictions.to_csv("TX School Standard Score Predictions.csv", index=False)