In [25]:
from urllib.parse import quote

import numpy as np
import pandas as pd
import scipy.stats as stats

# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy import create_engine, text

In [2]:
import config

In [3]:
# Create the database engine.
# The username and password are percent-encoded before being placed in the
# URL so that reserved characters in them (such as "@", ":", "/" or "%") are
# not misread as URL delimiters. safe="" makes quote() escape "/" as well.
# NOTE(review): this assumes config holds the raw, un-encoded credentials;
# if they were already percent-encoded by hand, remove that manual encoding.
db_username = quote(str(config.DB_USERNAME), safe="")
db_password = quote(str(config.DB_PASSWORD), safe="")
db_url = f"postgresql://{db_username}:{db_password}@{config.DB_HOST}/{config.DB_NAME}"
engine = create_engine(db_url)

In [5]:
# Load all the data into a single DataFrame by joining "RawRecords" with
# "CalculatedStats" and "Clusters" on RecordId = Id.
# Because the query selects every column, each joined table contributes its
# own "RecordId" column, so that name appears twice in the result.
query = text("""
Select * 
FROM "RawRecords" as r
JOIN "CalculatedStats" AS cs ON cs."RecordId"=r."Id"
JOIN "Clusters" as c ON c."RecordId"=r."Id";
""")
# Use the record id ("Id") as the row index.
df = pd.read_sql_query(query, con=engine).set_index("Id")
df

Unnamed: 0_level_0,State,Year,Enrolled,TotalRevenue,FederalRevenue,StateRevenue,LocalRevenue,TotalExpenditure,InstructionExpenditure,SupportServicesExpenditure,...,AvgReading4Score,AvgReading8Score,RecordId,FederalFundingPercent,StateFundingPercent,LocalFundingPercent,RevenuePerStudent,InstructionalExpensePercent,RecordId,Cluster
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2003_ALABAMA,ALABAMA,2003,727900.0,5196054.0,567704.0,2966981.0,1661369.0,5298932.0,2817111.0,1521462.0,...,207.0,253.0,2003_ALABAMA,0.109257,0.571007,0.319737,7.138417,0.542164,2003_ALABAMA,0
2003_ALASKA,ALASKA,2003,133303.0,1425948.0,259423.0,813371.0,353154.0,1610289.0,763525.0,514052.0,...,212.0,256.0,2003_ALASKA,0.181930,0.570407,0.247663,10.697044,0.535451,2003_ALASKA,0
2003_ARIZONA,ARIZONA,2003,875111.0,6529894.0,740579.0,2912629.0,2876686.0,6210287.0,2810907.0,1964229.0,...,209.0,255.0,2003_ARIZONA,0.113414,0.446045,0.440541,7.461789,0.430467,2003_ARIZONA,0
2003_ARKANSAS,ARKANSAS,2003,450158.0,3241275.0,379947.0,2394336.0,466992.0,3242799.0,1768713.0,972598.0,...,214.0,258.0,2003_ARKANSAS,0.117221,0.738702,0.144077,7.200305,0.545684,2003_ARKANSAS,0
2003_CALIFORNIA,CALIFORNIA,2003,6226552.0,59815855.0,5795655.0,33617766.0,20402434.0,59749885.0,29561563.0,17030335.0,...,206.0,251.0,2003_CALIFORNIA,0.096892,0.562021,0.341087,9.606578,0.494209,2003_CALIFORNIA,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015_VIRGINIA,VIRGINIA,2015,1279867.0,15857524.0,1012205.0,6240349.0,8604970.0,16113212.0,8755896.0,5075509.0,...,229.0,267.0,2015_VIRGINIA,0.063831,0.393526,0.542643,12.389978,0.552160,2015_VIRGINIA,1
2015_WASHINGTON,WASHINGTON,2015,1072359.0,13709442.0,1036422.0,8293812.0,4379208.0,13630138.0,6508964.0,4510672.0,...,226.0,267.0,2015_WASHINGTON,0.075599,0.604971,0.319430,12.784377,0.474780,2015_WASHINGTON,1
2015_WEST_VIRGINIA,WEST_VIRGINIA,2015,279565.0,3478401.0,362959.0,1979466.0,1135976.0,3466981.0,1819903.0,1161944.0,...,216.0,260.0,2015_WEST_VIRGINIA,0.104347,0.569074,0.326580,12.442191,0.523201,2015_WEST_VIRGINIA,0
2015_WISCONSIN,WISCONSIN,2015,861813.0,11637376.0,814385.0,5869265.0,4953726.0,11553677.0,5723474.0,3691809.0,...,223.0,270.0,2015_WISCONSIN,0.069980,0.504346,0.425674,13.503366,0.491818,2015_WISCONSIN,1


In [20]:
# Split the records by cluster label: one frame for label 0, one for label 1.
# Cluster 2 is ignored because it is treated as outliers.
cluster0_df, cluster1_df = (df[df["Cluster"].eq(label)] for label in (0, 1))

## Compare average test scores between clusters 0 and 1

In [32]:
columns_to_compare = [
    "AvgMath4Score",
    "AvgReading4Score",
    "AvgMath8Score",
    "AvgReading8Score",
]

# For every score column, report the ratio of the two clusters' variances
# (np.var with its default ddof, i.e. population variance), both means, and a
# two-sample t-test. equal_var=True selects the pooled-variance test, which
# assumes equal population variances; the printed ratio is presumably there
# so that assumption can be judged.
for score_column in columns_to_compare:
    scores0 = cluster0_df[score_column].tolist()
    scores1 = cluster1_df[score_column].tolist()
    report_lines = [
        score_column,
        f"variance ratio: {np.var(scores0)/np.var(scores1)}",
        f"cluster 0 mean: {np.mean(scores0)}",
        f"cluster 1 mean: {np.mean(scores1)}",
        "2 sample t-test results:",
        stats.ttest_ind(a=scores0, b=scores1, equal_var=True),
        "",
        "-----------------",
    ]
    print(*report_lines, sep="\n")

AvgMath4Score
variance ratio: 1.2939641278753753
cluster 0 mean: 234.22302158273382
cluster 1 mean: 242.67298578199052
2 sample t-test results:
Ttest_indResult(statistic=-18.544671863473976, pvalue=7.048788531323798e-54)

-----------------
AvgReading4Score
variance ratio: 1.7691882866862252
cluster 0 mean: 214.44604316546761
cluster 1 mean: 224.20853080568722
2 sample t-test results:
Ttest_indResult(statistic=-21.498738873040637, pvalue=7.796941059523068e-66)

-----------------
AvgMath8Score
variance ratio: 1.5130335082545012
cluster 0 mean: 274.7985611510791
cluster 1 mean: 285.92890995260666
2 sample t-test results:
Ttest_indResult(statistic=-19.467684631021278, pvalue=1.258692123072285e-57)

-----------------
AvgReading8Score
variance ratio: 1.7329445486128527
cluster 0 mean: 258.37410071942446
cluster 1 mean: 267.73933649289097
2 sample t-test results:
Ttest_indResult(statistic=-22.048554038809293, pvalue=4.858697135680823e-68)

-----------------


## Compare funding and spending statistics between clusters 0 and 1

In [33]:
columns_to_compare = [
    "FederalFundingPercent",
    "StateFundingPercent",
    "LocalFundingPercent",
    "RevenuePerStudent",
    "InstructionalExpensePercent",
]

# Same report as for the score columns, applied to the calculated statistics:
# variance ratio (np.var with its default ddof), the mean of each cluster,
# and a pooled-variance (equal_var=True) two-sample t-test per column.
for stat_column in columns_to_compare:
    sample0, sample1 = (
        frame[stat_column].tolist() for frame in (cluster0_df, cluster1_df)
    )
    ratio_line = f"variance ratio: {np.var(sample0)/np.var(sample1)}"
    mean0_line = f"cluster 0 mean: {np.mean(sample0)}"
    mean1_line = f"cluster 1 mean: {np.mean(sample1)}"
    test_result = stats.ttest_ind(a=sample0, b=sample1, equal_var=True)
    print(
        stat_column,
        ratio_line,
        mean0_line,
        mean1_line,
        "2 sample t-test results:",
        test_result,
        "",
        "-----------------",
        sep="\n",
    )

FederalFundingPercent
variance ratio: 0.906816247861474
cluster 0 mean: 0.11828289242207154
cluster 1 mean: 0.08433021168954731
2 sample t-test results:
Ttest_indResult(statistic=10.39999690199583, pvalue=3.075141243037007e-22)

-----------------
StateFundingPercent
variance ratio: 1.232472475018321
cluster 0 mean: 0.5587468950804527
cluster 1 mean: 0.451465313756531
2 sample t-test results:
Ttest_indResult(statistic=8.925962726263654, pvalue=2.581314163063687e-17)

-----------------
LocalFundingPercent
variance ratio: 1.2962791282648258
cluster 0 mean: 0.3229702124974758
cluster 1 mean: 0.46420447455392166
2 sample t-test results:
Ttest_indResult(statistic=-11.822056244407724, pvalue=2.4320236531847827e-27)

-----------------
RevenuePerStudent
variance ratio: 0.5116004758247668
cluster 0 mean: 10.079479937073806
cluster 1 mean: 13.090331002731636
2 sample t-test results:
Ttest_indResult(statistic=-8.715870324460882, pvalue=1.196194626412474e-16)

-----------------
InstructionalExpense