# Model prep

This notebook merges the 2018 broadband/assessment data with the 2018 poverty data and prepares the combined table for modelling.

# Imports

## Modules

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

## Data

In [6]:
# Load the combined 2018 broadband/assessment table (indexed by leaid) and preview it.
broadband_path = '../data/prepped/combined_broadband_assessment_2018.pkl'
broadband_assessment = pd.read_pickle(broadband_path)
broadband_assessment.head()

Unnamed: 0_level_0,rla_score,math_score,DP02_0152PE,DP02_0151PE
leaid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100005,38,45,75.4,83.4
100006,36,43,72.9,80.8
100007,65,70,91.1,95.5
100008,74,76,89.9,96.1
100011,41,40,76.7,88.0


In [7]:
# Check dtypes and null counts; the two DP02_* percentage columns load as object dtype
# and need a numeric conversion before modelling.
broadband_assessment.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10211 entries, 0100005 to 5606240
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   rla_score    10211 non-null  int64 
 1   math_score   10211 non-null  int64 
 2   DP02_0152PE  10211 non-null  object
 3   DP02_0151PE  10211 non-null  object
dtypes: int64(2), object(2)
memory usage: 398.9+ KB


In [18]:
# Load the 2018 poverty estimates and give the ratio column and the index readable names.
poverty_2018 = (
    pd.read_pickle('../data/poverty_2018.pkl')
    .rename(columns={'SAEPOVRAT5_17RV_PT': 'poverty_ratio'})
    .rename_axis('leaid')
)
poverty_2018.head()


Unnamed: 0_level_0,poverty_ratio
leaid,Unnamed: 1_level_1
100001,12.4
100003,15.2
100005,30.1
100006,26.6
100007,7.3


In [19]:
# Parse the poverty ratio as numbers while keeping `poverty_2018` a one-column DataFrame.
# Selecting with a list keeps the result a DataFrame, so this cell can be re-run safely;
# rebinding the name to a bare Series would make a second run raise KeyError on
# poverty_2018['poverty_ratio'].
poverty_2018 = poverty_2018[['poverty_ratio']].apply(pd.to_numeric)

In [20]:
# bpa = broadband, poverty, assessment.
# Inner-join on leaid so only districts present in both tables are kept, then swap the
# census question codes for descriptive column names.
census_code_names = {'DP02_0151PE': 'comp_pct', 'DP02_0152PE': 'broadband_pct'}
bpa_2018 = (
    broadband_assessment
    .merge(poverty_2018, on='leaid', how='inner')
    .rename(columns=census_code_names)
)
bpa_2018.head()

Unnamed: 0_level_0,rla_score,math_score,broadband_pct,comp_pct,poverty_ratio
leaid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
100005,38,45,75.4,83.4,30.1
100006,36,43,72.9,80.8,26.6
100007,65,70,91.1,95.5,7.3
100008,74,76,89.9,96.1,8.3
100011,41,40,76.7,88.0,16.9


In [8]:
# Persist the merged table for later notebooks.
# NOTE(review): in top-to-bottom order this cell runs before the cells that convert
# comp_pct/broadband_pct with pd.to_numeric and add composite_score, so the saved file
# would not include those changes — confirm whether that is intended.
bpa_2018.to_pickle('../data/full_working_set.pkl')

In [27]:
bpa_2018.info()
## bpa = broadband, poverty, assessment
## NOTE(review): the output shown for this cell lists composite_score, which is only
## created in a later cell — the cells appear to have been run out of order.

<class 'pandas.core.frame.DataFrame'>
Index: 10203 entries, 0100005 to 5606240
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   rla_score        10203 non-null  int64  
 1   math_score       10203 non-null  int64  
 2   broadband_pct    10203 non-null  float64
 3   comp_pct         10203 non-null  float64
 4   poverty_ratio    10203 non-null  float64
 5   composite_score  10203 non-null  float64
dtypes: float64(4), int64(2)
memory usage: 558.0+ KB


In [25]:
# Convert the percentage columns (loaded as object dtype) to numbers.
# downcast='integer' only takes effect when every value is integral; the info() output
# in this notebook shows both columns ending up as float64.
for pct_col in ('comp_pct', 'broadband_pct'):
    bpa_2018[pct_col] = pd.to_numeric(bpa_2018[pct_col], downcast='integer')

In [26]:
# Composite score = simple average of the reading/language-arts and math scores.
score_total = bpa_2018['rla_score'].add(bpa_2018['math_score'])
bpa_2018['composite_score'] = score_total.div(2)
bpa_2018.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10203 entries, 0100005 to 5606240
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   rla_score        10203 non-null  int64  
 1   math_score       10203 non-null  int64  
 2   broadband_pct    10203 non-null  float64
 3   comp_pct         10203 non-null  float64
 4   poverty_ratio    10203 non-null  float64
 5   composite_score  10203 non-null  float64
dtypes: float64(4), int64(2)
memory usage: 558.0+ KB


In [None]:
# Hold out a test set before any modelling so evaluation data is never used for fitting.
# `train_set` is not defined in any visible cell of this notebook, so the split is
# created here to keep the cell runnable on a fresh kernel.
# NOTE(review): test_size=0.2 and random_state=42 are placeholder choices — confirm.
train_set, test_set = train_test_split(bpa_2018, test_size=0.2, random_state=42)

# Work on a copy so the training split itself stays untouched.
digital_divide = train_set.copy()

In [None]:
# Separate the target (composite score) from the predictors. The two raw scores are
# dropped as well because the target is computed directly from them.
target_col = 'composite_score'
score_cols = ['rla_score', 'math_score', target_col]
divide_labels = digital_divide[target_col]
divide = digital_divide.drop(columns=score_cols)

In [None]:
# Preprocessing applied to the numeric features; currently only standardisation.
# TODO: add an imputer step
# TODO: add feature engineering functions
numeric_steps = [('std_scaler', StandardScaler())]
num_pipeline = Pipeline(steps=numeric_steps)

In [None]:
# Every remaining feature column is numeric, so all of them go through num_pipeline.
# TODO: add a categorical branch once there are categorical features, e.g.
#       cat_attribs = [] and ('cat', <pipeline with OneHotEncoder()>, cat_attribs)
num_attribs = divide.columns.tolist()
column_steps = [('num', num_pipeline, num_attribs)]
full_pipeline = ColumnTransformer(transformers=column_steps)

In [None]:
# Fit the preprocessing pipeline on the feature frame and transform it in one step.
divide_prepped = full_pipeline.fit_transform(divide)

In [None]:
# Baseline model: ordinary least squares on the scaled features.
# fit() returns the estimator itself, so construction and fitting can be chained.
lin_reg = LinearRegression().fit(divide_prepped, divide_labels)

In [None]:
# Predict on the same data the model was fitted on and compare against the true labels.
divide_predictions = lin_reg.predict(divide_prepped)
print(divide_predictions, divide_labels, sep='\n')
# Mean squared error of the in-sample predictions; the square root is taken in the next cell.
error = mean_squared_error(divide_labels, divide_predictions)

In [None]:
# Root-mean-squared error, i.e. the MSE expressed in the units of the composite score.
RMSE = pre_err = np.sqrt(error)

In [None]:
# Report the RMSE of the linear model on the data it was fitted on.
print(RMSE)

In [None]:
# Download one ACS 5-year profile table per state from the Census API and stack them.
# NOTE(review): `requests`, `json`, `state_list`, `year`, `group`, `geofor` and `my_key`
# are not defined in the visible cells of this notebook — they must be in scope before
# this cell runs.
state_dfs = []

for state in state_list['state']:
    state = str(state)
    get_acs_data = requests.get('https://api.census.gov/data/{year}/acs/acs5/profile?get=group({group})&for={geofor}:*&in=state:{stateID}&key={key}'
                                .format(year=year, group=group, geofor=geofor, stateID=state, key=my_key))
    # Fail loudly on an HTTP error instead of trying to parse an error page as data.
    get_acs_data.raise_for_status()
    acs_content = json.loads(get_acs_data.content)
    state_census_info = pd.DataFrame(data=acs_content)
    # The first row of the payload holds the column names.
    state_census_info.columns = state_census_info.iloc[0]
    # .copy() so the column assignment below writes to an independent frame, not a slice view.
    state_census_info = state_census_info[1:].copy()
    # Remove the '9700000US' prefix only. str.lstrip() treats its argument as a *set of
    # characters*, so it would also eat leading 0/7/9 digits of the district id itself
    # (e.g. ids of a state whose FIPS code is 09) and corrupt the leaid.
    state_census_info['GEO_ID'] = (state_census_info['GEO_ID']
                                   .str.replace(r'^9700000US', '', regex=True)
                                   .str.zfill(7))
    state_census_info = state_census_info.rename(columns={'GEO_ID': 'leaid'})
    state_dfs.append(state_census_info)

all_census_info = pd.concat(state_dfs)
# info() prints its report itself and returns None, so it must not be wrapped in print().
all_census_info.info()
print(all_census_info.head())
print(all_census_info.tail())

In [1]:
## Columns to keep from census_combined_assessment_2018.pkl:

question_numbers = [
    'DP05_0037PE',
    'DP02_0066PE',
    'DP03_0062E',
    'DP04_0134E',
    'DP03_0009PE',
    'DP02_0112PE',
    'DP02_0071PE',
]

In [None]:
# Load the combined census/assessment table.
# NOTE(review): this path has no directory, year suffix or .pkl extension, while the
# comment in the preceding cell names census_combined_assessment_2018.pkl — confirm the
# intended file.
census_combined_assessment = pd.read_pickle('census_combined_assessment')