# Model prep

This notebook prepares some of the data for modelling.

# Imports

## Modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

## Data

In [4]:
divide_data = pd.read_pickle('../data/CLEANED_df.pkl')
divide_data.head()

Unnamed: 0_level_0,reading_score,math_score,comp_pct,broadband_pct,poverty_ratio,white_pct,college_pct,median_income,median_rent,unemployment_pct,home_lang_not_eng,disability_pct,composite_score
LEAID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
102650,48,48,81.2,72.9,16.5,89.2,83.4,48161,750,7.8,3.0,18.4,48.0
102670,17,15,55.9,46.2,49.0,29.9,78.1,23561,574,12.3,1.4,25.2,16.0
102700,41,43,83.5,68.1,32.3,49.4,84.2,40218,825,9.2,4.9,16.6,42.0
102730,37,29,72.9,62.8,29.4,56.0,80.6,37458,456,10.6,4.3,24.5,33.0
102760,48,53,81.5,70.5,27.9,88.8,82.8,36306,492,7.3,1.4,23.7,50.5


In [5]:
divide_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9988 entries, 0102650 to 5502880
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   reading_score      9988 non-null   int64  
 1   math_score         9988 non-null   int64  
 2   comp_pct           9988 non-null   float64
 3   broadband_pct      9988 non-null   float64
 4   poverty_ratio      9988 non-null   float64
 5   white_pct          9988 non-null   float64
 6   college_pct        9988 non-null   float64
 7   median_income      9988 non-null   int64  
 8   median_rent        9988 non-null   int64  
 9   unemployment_pct   9988 non-null   float64
 10  home_lang_not_eng  9988 non-null   float64
 11  disability_pct     9988 non-null   float64
 12  composite_score    9988 non-null   float64
dtypes: float64(9), int64(4)
memory usage: 1.1+ MB


In [6]:
train_set, test_set = train_test_split(divide_data, test_size=0.2, random_state=31)

In [7]:
digital_divide = train_set.copy()

In [9]:
divide_labels = digital_divide['composite_score']
divide = digital_divide.drop(['reading_score','math_score','composite_score'],axis=1)

In [13]:
num_pipeline = Pipeline([
    ##Eventually add imputer
    ##Eventually add feature engineering functions
    ('std_scaler', StandardScaler())
])

In [14]:
num_attribs = list(divide)
## cat_attibs = []

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs)
    ##('cat', cat_pipeline[OneHotEncoder()], cat_attribs)
])

In [15]:
divide_prepped = full_pipeline.fit_transform(divide)

# Linear Regression

In [16]:
lin_reg = LinearRegression()
lin_reg.fit(divide_prepped, divide_labels)

LinearRegression()

In [17]:
divide_predictions = lin_reg.predict(divide_prepped)
print(divide_predictions)
print(divide_labels)
error = mean_squared_error(divide_labels, divide_predictions)
pre_err = np.sqrt(error)
RMSE = pre_err

[51.12871257 55.34207479 43.49517234 ... 49.80034032 52.40252435
 50.0721725 ]
LEAID
2623490    32.5
3621930    59.5
1302430    39.5
5401170    37.5
4900142    45.5
           ... 
3626430    53.0
0103450    41.5
0643560    42.0
2723550    65.0
3100122    57.5
Name: composite_score, Length: 7990, dtype: float64


In [19]:
print(RMSE)

12.858917422236514


# Decision Tree Regressor

In [22]:
dt_reg = DecisionTreeRegressor()
dt_reg.fit(divide_prepped, divide_labels)

DecisionTreeRegressor()

In [23]:
divide_predictions_tree = dt_reg.predict(divide_prepped)
print(divide_predictions_tree)
print(divide_labels)
error_tree = mean_squared_error(divide_labels, divide_predictions)
pre_err_tree = np.sqrt(error)
RMSE_tree = pre_err
print(RMSE_tree)

[32.5 59.5 39.5 ... 42.  65.  57.5]
LEAID
2623490    32.5
3621930    59.5
1302430    39.5
5401170    37.5
4900142    45.5
           ... 
3626430    53.0
0103450    41.5
0643560    42.0
2723550    65.0
3100122    57.5
Name: composite_score, Length: 7990, dtype: float64
12.858917422236514


# Random Forest Regressor

In [25]:
forest_reg = RandomForestRegressor()
forest_reg.fit(divide_prepped, divide_labels)

RandomForestRegressor()

In [26]:
prediction_forest = forest_reg.predict(divide_prepped)
print(prediction_forest)
print(divide_labels)
error_forest = mean_squared_error(divide_labels, prediction_forest)
RMSE_forest = np.sqrt(error_forest)
print(RMSE_forest)

[39.125 58.485 39.525 ... 43.335 60.025 53.995]
LEAID
2623490    32.5
3621930    59.5
1302430    39.5
5401170    37.5
4900142    45.5
           ... 
3626430    53.0
0103450    41.5
0643560    42.0
2723550    65.0
3100122    57.5
Name: composite_score, Length: 7990, dtype: float64
4.760302655342733


In [None]:
for feature, score in 

# CART Algorithm

# Model Evaluation