In [1]:
#import decision regression tools:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, text
import plotly.express as px
import config

In [2]:
# create database engine
db_url = f"postgresql://{config.DB_USERNAME}:{config.DB_PASSWORD}@{config.DB_HOST}/{config.DB_NAME}"
engine = create_engine(db_url)

In [3]:
# load all the data into a datafame by joining the tables
query = text('''
Select * 
FROM "RawRecords" as r
JOIN "CalculatedStats" AS cs ON cs."RecordId"=r."Id";
''')
df = pd.read_sql_query(query,con=engine)
df.set_index('Id', inplace=True)
df

Unnamed: 0_level_0,State,Year,Enrolled,TotalRevenue,FederalRevenue,StateRevenue,LocalRevenue,TotalExpenditure,InstructionExpenditure,SupportServicesExpenditure,...,AvgMath4Score,AvgMath8Score,AvgReading4Score,AvgReading8Score,RecordId,FederalFundingPercent,StateFundingPercent,LocalFundingPercent,RevenuePerStudent,InstructionalExpensePercent
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2003_ALABAMA,ALABAMA,2003,727900.0,5196054.0,567704.0,2966981.0,1661369.0,5298932.0,2817111.0,1521462.0,...,223.0,262.0,207.0,253.0,2003_ALABAMA,0.109257,0.571007,0.319737,7.138417,0.542164
2003_ALASKA,ALASKA,2003,133303.0,1425948.0,259423.0,813371.0,353154.0,1610289.0,763525.0,514052.0,...,233.0,279.0,212.0,256.0,2003_ALASKA,0.181930,0.570407,0.247663,10.697044,0.535451
2003_ARIZONA,ARIZONA,2003,875111.0,6529894.0,740579.0,2912629.0,2876686.0,6210287.0,2810907.0,1964229.0,...,229.0,271.0,209.0,255.0,2003_ARIZONA,0.113414,0.446045,0.440541,7.461789,0.430467
2003_ARKANSAS,ARKANSAS,2003,450158.0,3241275.0,379947.0,2394336.0,466992.0,3242799.0,1768713.0,972598.0,...,229.0,266.0,214.0,258.0,2003_ARKANSAS,0.117221,0.738702,0.144077,7.200305,0.545684
2003_CALIFORNIA,CALIFORNIA,2003,6226552.0,59815855.0,5795655.0,33617766.0,20402434.0,59749885.0,29561563.0,17030335.0,...,227.0,267.0,206.0,251.0,2003_CALIFORNIA,0.096892,0.562021,0.341087,9.606578,0.494209
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015_VIRGINIA,VIRGINIA,2015,1279867.0,15857524.0,1012205.0,6240349.0,8604970.0,16113212.0,8755896.0,5075509.0,...,247.0,288.0,229.0,267.0,2015_VIRGINIA,0.063831,0.393526,0.542643,12.389978,0.552160
2015_WASHINGTON,WASHINGTON,2015,1072359.0,13709442.0,1036422.0,8293812.0,4379208.0,13630138.0,6508964.0,4510672.0,...,245.0,287.0,226.0,267.0,2015_WASHINGTON,0.075599,0.604971,0.319430,12.784377,0.474780
2015_WEST_VIRGINIA,WEST_VIRGINIA,2015,279565.0,3478401.0,362959.0,1979466.0,1135976.0,3466981.0,1819903.0,1161944.0,...,235.0,271.0,216.0,260.0,2015_WEST_VIRGINIA,0.104347,0.569074,0.326580,12.442191,0.523201
2015_WISCONSIN,WISCONSIN,2015,861813.0,11637376.0,814385.0,5869265.0,4953726.0,11553677.0,5723474.0,3691809.0,...,243.0,289.0,223.0,270.0,2015_WISCONSIN,0.069980,0.504346,0.425674,13.503366,0.491818


## Tweaking Database

In [4]:
model_df = df[[
    "AvgMath4Score",
    "AvgMath8Score",
    "AvgReading4Score",
    "AvgReading8Score",
    "FederalFundingPercent",
    "StateFundingPercent",
    "LocalFundingPercent",
    "RevenuePerStudent",
    "InstructionalExpensePercent"
]]
model_df.index = df.index
model_df

Unnamed: 0_level_0,AvgMath4Score,AvgMath8Score,AvgReading4Score,AvgReading8Score,FederalFundingPercent,StateFundingPercent,LocalFundingPercent,RevenuePerStudent,InstructionalExpensePercent
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2003_ALABAMA,223.0,262.0,207.0,253.0,0.109257,0.571007,0.319737,7.138417,0.542164
2003_ALASKA,233.0,279.0,212.0,256.0,0.181930,0.570407,0.247663,10.697044,0.535451
2003_ARIZONA,229.0,271.0,209.0,255.0,0.113414,0.446045,0.440541,7.461789,0.430467
2003_ARKANSAS,229.0,266.0,214.0,258.0,0.117221,0.738702,0.144077,7.200305,0.545684
2003_CALIFORNIA,227.0,267.0,206.0,251.0,0.096892,0.562021,0.341087,9.606578,0.494209
...,...,...,...,...,...,...,...,...,...
2015_VIRGINIA,247.0,288.0,229.0,267.0,0.063831,0.393526,0.542643,12.389978,0.552160
2015_WASHINGTON,245.0,287.0,226.0,267.0,0.075599,0.604971,0.319430,12.784377,0.474780
2015_WEST_VIRGINIA,235.0,271.0,216.0,260.0,0.104347,0.569074,0.326580,12.442191,0.523201
2015_WISCONSIN,243.0,289.0,223.0,270.0,0.069980,0.504346,0.425674,13.503366,0.491818


In [5]:
#dropping DISTRICT_OF_COLUMBIA data
df = df.loc[df['State'] != 'DISTRICT_OF_COLUMBIA']
df.shape

(350, 22)

In [6]:
# combining grade scores
Avg_Score = (df.AvgMath4Score + df.AvgMath8Score + df.AvgReading4Score + df.AvgReading8Score)/4
df['Avg Score'] = Avg_Score
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Avg Score'] = Avg_Score


Unnamed: 0_level_0,State,Year,Enrolled,TotalRevenue,FederalRevenue,StateRevenue,LocalRevenue,TotalExpenditure,InstructionExpenditure,SupportServicesExpenditure,...,AvgMath8Score,AvgReading4Score,AvgReading8Score,RecordId,FederalFundingPercent,StateFundingPercent,LocalFundingPercent,RevenuePerStudent,InstructionalExpensePercent,Avg Score
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2003_ALABAMA,ALABAMA,2003,727900.0,5196054.0,567704.0,2966981.0,1661369.0,5298932.0,2817111.0,1521462.0,...,262.0,207.0,253.0,2003_ALABAMA,0.109257,0.571007,0.319737,7.138417,0.542164,236.25
2003_ALASKA,ALASKA,2003,133303.0,1425948.0,259423.0,813371.0,353154.0,1610289.0,763525.0,514052.0,...,279.0,212.0,256.0,2003_ALASKA,0.18193,0.570407,0.247663,10.697044,0.535451,245.0
2003_ARIZONA,ARIZONA,2003,875111.0,6529894.0,740579.0,2912629.0,2876686.0,6210287.0,2810907.0,1964229.0,...,271.0,209.0,255.0,2003_ARIZONA,0.113414,0.446045,0.440541,7.461789,0.430467,241.0
2003_ARKANSAS,ARKANSAS,2003,450158.0,3241275.0,379947.0,2394336.0,466992.0,3242799.0,1768713.0,972598.0,...,266.0,214.0,258.0,2003_ARKANSAS,0.117221,0.738702,0.144077,7.200305,0.545684,241.75
2003_CALIFORNIA,CALIFORNIA,2003,6226552.0,59815855.0,5795655.0,33617766.0,20402434.0,59749885.0,29561563.0,17030335.0,...,267.0,206.0,251.0,2003_CALIFORNIA,0.096892,0.562021,0.341087,9.606578,0.494209,237.75


## Logistic Regression

In [None]:
#scoreing breakdown on NAEP website
#https://nces.ed.gov/nationsreportcard/mathematics/achieve.aspx#grade8