In [1]:
import pandas as pd
import numpy as np
# Let DataFrame previews in this notebook show up to 40 columns.
pd.set_option("display.max_columns", 40)

In [2]:
!ls "raw data"

ACC HS Chronic Absenteeism.xlsx
ACC HS Graduation Rate.xlsx
Annual Regents Exams.xlsx
GRAD_RATE_AND_OUTCOMES_2022.csv
nys-ann_regents-reduc.csv


### Reading in Datasets

In [3]:
# Chronic-absenteeism workbook; ENTITY_CD is loaded as a string column.
nys_absenteeism_df = pd.read_excel(
    "raw data/ACC HS Chronic Absenteeism.xlsx",
    index_col=False,
    dtype={"ENTITY_CD": "string"},
)

In [4]:
# Annual Regents results.
# The header of this CSV is lower-case (see the column listing a few cells below),
# and read_csv matches dtype keys against the header exactly, so the keys must be
# lower-case too. With the previous upper-case keys the mapping matched no column
# and the identifier columns were left to type inference instead of being strings.
# NOTE(review): if the CSV stores codes with a leading zero, that zero is now kept;
# confirm the codes still line up with the Excel-derived frames before merging.
nys_annual_regents = pd.read_csv(
    "raw data/nys-ann_regents-reduc.csv",
    index_col=False,
    dtype={"entity_cd": "string", "institution_id": "string"},
)

In [3]:
# Graduation-rate workbook; ENTITY_CD is loaded as a string column.
nys_grad_rate_df = pd.read_excel(
    "raw data/ACC HS Graduation Rate.xlsx",
    index_col=False,
    dtype={"ENTITY_CD": "string"},
)

### Filtering & Cleaning Datasets

In [6]:
# Inspect the column names of the Regents frame as loaded from the CSV
# (the recorded output shows they are already lower-case).
nys_annual_regents.columns

Index(['institution_id', 'entity_cd', 'entity_name', 'year', 'subject',
       'subgroup_name', 'tested', 'num_level1', 'per_level1', 'num_level2',
       'per_level2', 'num_level3', 'per_level3', 'num_level4', 'per_level4',
       'num_level5', 'per_level5', 'num_prof', 'per_prof', 'total_exempt',
       'num_exempt_ntest', 'pct_exempt_ntest', 'num_exempt_test',
       'pct_exempt_test', 'assmnt_flag'],
      dtype='object')

In [7]:
# Keep only rows for the 2021-2022 academic year (year == 2022) and drop the
# 'All Students' aggregate so that only individual subgroups remain.
nys_annual_regents = nys_annual_regents.loc[
    lambda frame: frame["year"].eq(2022) & frame["subgroup_name"].ne("All Students")
]

In [23]:
# Narrow the frame to the identifying columns plus the two counts
# (students tested, students proficient) for each subgroup and subject.
ann_regs_cols = [
    "entity_cd",
    "entity_name",
    "year",
    "subject",
    "subgroup_name",
    "tested",
    "num_prof",
]
nys_annual_regents = nys_annual_regents[ann_regs_cols]

In [24]:
# Drop rows whose 'num_prof' contains an 's'. Per the original author, that marker
# flags subgroups with fewer than 5 students.
# Rows where 'num_prof' is missing are dropped as well, exactly as before.
nys_annual_regents = nys_annual_regents.loc[
    ~nys_annual_regents["num_prof"].str.contains("s", na=True)
]

Now we must do the same filtering and cleaning for our graduation-rate dataset (`nys_grad_rate_df`).

In [15]:
# Row/column count of the graduation-rate frame.
# NOTE(review): the recorded output has 6 columns, which matches the frame only after
# the column selection done in a later cell, so this cell appears to have been run out
# of order. Re-run the notebook top to bottom to refresh it.
nys_grad_rate_df.shape

(94896, 6)

In [5]:
# Preview the first rows of the raw graduation-rate frame.
# The column names are upper-case and there are more columns than we need, so the
# next step normalizes the names and keeps only the columns and rows of interest.
nys_grad_rate_df.head()

Unnamed: 0,INSTITUTION_ID,ENTITY_CD,ENTITY_NAME,YEAR,SUBGROUP_NAME,COHORT,COHORT_COUNT,GRAD_COUNT,GRAD_RATE,COHORT_LEVEL,OVERRIDE,WT_PERF_FLAG
0,800000055729,10100010000,ALBANY CITY SD,2021,American Indian or Alaska Native,5-Year,2,s,s,,,
1,800000055729,10100010000,ALBANY CITY SD,2021,All Students,5-Year,697,517,74.2,,,
2,800000055729,10100010000,ALBANY CITY SD,2021,Black or African American,6-Year,351,244,69.5,,,
3,800000055729,10100010000,ALBANY CITY SD,2022,Economically Disadvantaged,4-Year,434,333,76.7,s,,N
4,800000055729,10100010000,ALBANY CITY SD,2022,Hispanic or Latino,5-Year,108,78,72.2,s,,N


Very messy. We only need a subset of columns. For this, we'll leverage the documentation provided to us by the database.

In [13]:
# Reshape the graduation-rate frame into per-school, per-subgroup records.
nys_grad_rate_df = (
    nys_grad_rate_df
    # lower-case every column name
    .rename(columns=str.lower)
    # keep only the columns of interest
    [["entity_cd", "entity_name", "year", "subgroup_name", "cohort", "grad_rate"]]
    # drop district-level aggregations (codes ending in '0000') and the
    # all-public-schools aggregation (codes ending in '111111111111')
    .loc[
        lambda frame: ~(
            frame["entity_cd"].astype("str").str.endswith("0000")
            | frame["entity_cd"].astype("str").str.endswith("111111111111")
        )
    ]
    # drop the combined 'All Students' measure
    .loc[lambda frame: frame["subgroup_name"].ne("All Students")]
    # title-case the entity names
    .assign(entity_name=lambda frame: frame["entity_name"].str.title())
    # renumber the rows from 0
    .reset_index(drop=True)
)

In [27]:
# Preview the cleaned graduation-rate frame. It still mixes several cohorts and
# suppressed ('s') rates, so it needs to be reduced further.
nys_grad_rate_df.head()

Unnamed: 0,entity_cd,entity_name,year,subgroup_name,cohort,grad_rate
0,10100010034,Albany High School,2022,Hispanic or Latino,6-Year,75.9
1,10100010034,Albany High School,2022,English Language Learner,4-Year,78.7
2,10100010034,Albany High School,2022,Multiracial,6-Year,s
3,10100010034,Albany High School,2022,Hispanic or Latino,Combined,73.7
4,10100010034,Albany High School,2022,White,4-Year,86.6


In [16]:
# Build the 2022 graduation-rate table for the 'Combined' cohort only.
# Rows whose grad_rate is the literal 's' (restricted for privacy reasons) are
# left out, and the now-constant 'cohort' column is removed.
nys_grad_rate_22 = (
    nys_grad_rate_df
    .loc[
        lambda frame: frame["cohort"].eq("Combined")
        & frame["year"].eq(2022)
        & frame["grad_rate"].ne("s")
    ]
    .drop(columns="cohort")
    .reset_index(drop=True)
)

In [29]:
# Preview the result: one graduation rate per school and subgroup for 2022.
nys_grad_rate_22.head()

Unnamed: 0,entity_cd,entity_name,year,subgroup_name,grad_rate
0,10100010034,Albany High School,2022,Hispanic or Latino,73.7
1,10100010034,Albany High School,2022,White,86.9
2,10100010034,Albany High School,2022,Multiracial,77.4
3,10100010034,Albany High School,2022,English Language Learner,70.4
4,10100010034,Albany High School,2022,Economically Disadvantaged,75.5


One final dataframe to normalize: `nys_absenteeism_df`, which holds the chronic-absenteeism data. We need to transform it so that it keeps only records from the 2021-2022 AY and excludes the aggregate measures for all students.

In [30]:
# Preview the first five records of the raw chronic-absenteeism frame
# (upper-case column names, district-level rows and 'All Students' rows still present).
nys_absenteeism_df.head()

Unnamed: 0,INSTITUTION_ID,ENTITY_CD,ENTITY_NAME,YEAR,SUBJECT,SUBGROUP_NAME,ENROLLMENT,ABSENT_COUNT,ABSENT_RATE,LEVEL,OVERRIDE,DATA_REP_FLAG,PARTIAL_DATA_FLAG
0,800000055729,10100010000,ALBANY CITY SD,2021,HS_CA,All Students,2755,1031,37.4,,,Y,
1,800000055729,10100010000,ALBANY CITY SD,2021,HS_CA,Students with Disabilities,415,197,47.5,,,Y,
2,800000055729,10100010000,ALBANY CITY SD,2021,HS_CA,American Indian or Alaska Native,9,s,s,,,Y,
3,800000055729,10100010000,ALBANY CITY SD,2021,HS_CA,Asian or Native Hawaiian/Other Pacific Islander,219,40,18.3,,,Y,
4,800000055729,10100010000,ALBANY CITY SD,2021,HS_CA,Black or African American,1398,624,44.6,,,Y,


In [39]:
# Reshape the chronic-absenteeism frame into per-school, per-subgroup records for 2022.
# NOTE(review): unlike the graduation-rate table, suppressed 's' values in
# 'absent_rate' are NOT removed here (one is visible in the preview below).
nys_absenteeism_df = (
    nys_absenteeism_df
    # lower-case every column name
    .rename(columns=str.lower)
    # keep only the columns of importance
    [["entity_cd", "entity_name", "year", "subgroup_name", "absent_rate"]]
    # drop district-level aggregations (codes ending in '0000')
    .loc[lambda frame: ~frame["entity_cd"].astype("str").str.endswith("0000")]
    # keep the 2021-2022 academic year and drop the 'All Students' aggregate
    .loc[lambda frame: frame["year"].eq(2022) & frame["subgroup_name"].ne("All Students")]
    # title-case the entity names
    .assign(entity_name=lambda frame: frame["entity_name"].str.title())
    # renumber the rows from 0
    .reset_index(drop=True)
)
nys_absenteeism_df.head()

Unnamed: 0,entity_cd,entity_name,year,subgroup_name,absent_rate
0,10100010034,Albany High School,2022,Students with Disabilities,64.5
1,10100010034,Albany High School,2022,American Indian or Alaska Native,s
2,10100010034,Albany High School,2022,Asian or Native Hawaiian/Other Pacific Islander,41.8
3,10100010034,Albany High School,2022,Black or African American,67.8
4,10100010034,Albany High School,2022,Hispanic or Latino,68.3


## Cleaned Datasets

After filtering and cleaning, our new datasets are `nys_absenteeism_df`, `nys_grad_rate_22`, and `nys_annual_regents`.

In [40]:
# combined-cohort graduation rates by subgroup for 2022
nys_grad_rate_22.head()

Unnamed: 0,entity_cd,entity_name,year,subgroup_name,grad_rate
0,10100010034,Albany High School,2022,Hispanic or Latino,73.7
1,10100010034,Albany High School,2022,White,86.9
2,10100010034,Albany High School,2022,Multiracial,77.4
3,10100010034,Albany High School,2022,English Language Learner,70.4
4,10100010034,Albany High School,2022,Economically Disadvantaged,75.5


In [41]:
# Chronic-absenteeism rates by subgroup for the 2021-2022 AY.
# Suppressed values ('s') are still present in 'absent_rate' at this point.
nys_absenteeism_df.head()

Unnamed: 0,entity_cd,entity_name,year,subgroup_name,absent_rate
0,10100010034,Albany High School,2022,Students with Disabilities,64.5
1,10100010034,Albany High School,2022,American Indian or Alaska Native,s
2,10100010034,Albany High School,2022,Asian or Native Hawaiian/Other Pacific Islander,41.8
3,10100010034,Albany High School,2022,Black or African American,67.8
4,10100010034,Albany High School,2022,Hispanic or Latino,68.3


In [45]:
# Regents tested/proficient counts by school, subject and subgroup for 2022.
# The index still carries the original row labels because this frame was never re-indexed.
nys_annual_regents.head()

Unnamed: 0,entity_cd,entity_name,year,subject,subgroup_name,tested,num_prof
234654,10100010030,William S Hackett Middle School,2022,Regents Common Core Algebra I,Asian or Native Hawaiian/Other Pacific Islander,17,14
234655,10100010030,William S Hackett Middle School,2022,Regents Living Environment,Asian or Native Hawaiian/Other Pacific Islander,14,14
234658,10100010030,William S Hackett Middle School,2022,Regents Common Core Algebra I,Economically Disadvantaged,23,20
234659,10100010030,William S Hackett Middle School,2022,Regents Living Environment,Economically Disadvantaged,25,21
234661,10100010030,William S Hackett Middle School,2022,Regents Common Core Algebra I,Female,33,32


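Next, we need to merge the three dataframes on their 4 shared columns (`entity_cd`, `entity_name`, `year`, `subgroup_name`). Together with the 5 feature columns (`subject`, `tested`, `num_prof`, `grad_rate`, `absent_rate`), the final dataframe will have 9 columns.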

## Other Datasets For Future Modeling 

In [22]:
# Graduation-rate records for 2022 for every cohort other than 'Combined'.
# Rows whose grad_rate is the literal 's' (restricted for privacy reasons) are left out.
nys_grad_cohorts = (
    nys_grad_rate_df
    .loc[
        lambda frame: frame["cohort"].ne("Combined")
        & frame["year"].eq(2022)
        & frame["grad_rate"].ne("s")
    ]
    .reset_index(drop=True)
)

# exporting as csv to data folder
# only run once
#nys_grad_cohorts.to_csv('data/cohort_gradrates_22',index=False)

In [152]:
# Load grouping information from the graduation-outcomes file: the district, county
# and NRC each school belongs to. Only the listed columns are read, all as strings.
nys_cats = pd.read_csv(
    "raw data/GRAD_RATE_AND_OUTCOMES_2022.csv",
    usecols=[
        "subgroup_name",
        "aggregation_code",
        "aggregation_name",
        "lea_name",
        "nrc_code",
        "nrc_desc",
        "county_name",
        "county_code",
    ],
    dtype=str,
)

In [153]:
# Clean the grouping table.
# The eight copy-pasted filters (one per suffix '0000' ... '0007') are replaced by a
# single test on the last four characters of the code; the resulting rows are the same.
nys_cats = (
    nys_cats
    # drop aggregated records that aren't useful: aggregation codes ending in 0000-0007
    .loc[
        lambda frame: ~frame["aggregation_code"]
        .astype("str")
        .str[-4:]
        .isin([f"000{digit}" for digit in range(8)])
    ]
    # renaming columns
    .rename(
        columns={
            "aggregation_code": "beds_cd",
            "aggregation_name": "school",
            "lea_name": "district",
        }
    )
    # normalizing str values (title case); .assign avoids setting values on a filtered slice
    .assign(
        school=lambda frame: frame["school"].str.title(),
        district=lambda frame: frame["district"].str.title(),
        county_name=lambda frame: frame["county_name"].str.title(),
    )
    # renumber the rows from 0
    .reset_index(drop=True)
)

## Exporting Cleaned Data 

In [55]:
# Before exporting, check each column's dtype and non-null count.
nys_grad_rate_22.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4680 entries, 0 to 4679
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   entity_cd      4680 non-null   string
 1   entity_name    4680 non-null   object
 2   year           4680 non-null   int64 
 3   subgroup_name  4680 non-null   object
 4   grad_rate      4680 non-null   object
dtypes: int64(1), object(3), string(1)
memory usage: 182.9+ KB


We need to do some conversions: `grad_rate` has dtype `object`, and we need it to be `float64`.

In [62]:
# Cast grad_rate to float64 and entity_cd to a plain object column in one step.
nys_grad_rate_22 = nys_grad_rate_22.astype(
    {"grad_rate": "float64", "entity_cd": "object"}
)

In [63]:
# Re-check the dtypes after the conversion: grad_rate should now be float64.
nys_grad_rate_22.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4680 entries, 0 to 4679
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   entity_cd      4680 non-null   object 
 1   entity_name    4680 non-null   object 
 2   year           4680 non-null   int64  
 3   subgroup_name  4680 non-null   object 
 4   grad_rate      4680 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 182.9+ KB


In [68]:
#nys_grad_rate_22.to_csv('data/2022_NYS_grad-rate.csv',index=False)

## Test with Pigs Dataset

In [46]:
# Libraries for the mixed-model demo in this section.
# NOTE(review): these imports sit in the middle of the notebook; consider moving them
# to the import cell at the top so a fresh "Run All" surfaces missing packages early.
from scipy import stats 
from matplotlib import style 
import seaborn as sns 
import matplotlib.pyplot as plt
import statsmodels.api as sm 
import statsmodels.formula.api as smf
# apply the "fivethirtyeight" matplotlib style sheet
style.use("fivethirtyeight")

In [47]:
# Load the 'dietox' dataset from the R package 'geepack' as a DataFrame.
df = sm.datasets.get_rdataset(dataname="dietox", package="geepack").data

In [48]:
# Number of distinct pigs in the dataset (missing values, if any, count as one value).
df["Pig"].nunique(dropna=False)

72

In [66]:
# Last record for every (Pig, Time) pair, indexed by those two columns.
df.groupby(by=["Pig", "Time"]).last()

Unnamed: 0_level_0,Unnamed: 1_level_0,Evit,Cu,Litter,Start,Weight,Feed
Pig,Time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
4601,1,Evit000,Cu000,1,26.5,26.50000,
4601,2,Evit000,Cu000,1,26.5,27.59999,5.200005
4601,3,Evit000,Cu000,1,26.5,36.50000,17.600000
4601,4,Evit000,Cu000,1,26.5,40.29999,28.500000
4601,5,Evit000,Cu000,1,26.5,49.09998,45.200001
...,...,...,...,...,...,...,...
8442,8,Evit000,Cu175,24,25.7,73.19995,83.800003
8442,9,Evit000,Cu175,24,25.7,81.69995,99.800003
8442,10,Evit000,Cu175,24,25.7,90.29999,115.200001
8442,11,Evit000,Cu175,24,25.7,96.00000,133.200001


In [51]:
# Mixed linear model of Weight on Time, with the observations grouped by pig.
md = smf.mixedlm(formula="Weight ~ Time", data=df, groups=df["Pig"])

In [52]:
# Fit the model, passing 'lbfgs' as the only fitting method to try.
mdf = md.fit(method=['lbfgs'])

In [53]:
# Print the regression summary table for the fitted model.
print(mdf.summary())

         Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: Weight    
No. Observations: 861     Method:             REML      
No. Groups:       72      Scale:              11.3669   
Min. group size:  11      Log-Likelihood:     -2404.7753
Max. group size:  12      Converged:          Yes       
Mean group size:  12.0                                  
--------------------------------------------------------
             Coef.  Std.Err.    z    P>|z| [0.025 0.975]
--------------------------------------------------------
Intercept    15.724    0.788  19.952 0.000 14.179 17.268
Time          6.943    0.033 207.939 0.000  6.877  7.008
Group Var    40.395    2.149                            



In [54]:
# Fitted parameters as a Series.
# NOTE: 'Group Var' here (3.5537) is not the same number as in the summary table
# (40.395); the summary value equals this one multiplied by the reported Scale (11.3669).
mdf.params

Intercept    15.723523
Time          6.942505
Group Var     3.553717
dtype: float64

## Previous Attempts at Groupby 

Earlier attempts at grouping by school and subgroup and then fitting a `LinearRegression()` on each group. The code is kept below, commented out, for reference.

In [None]:
#shared_cols = ['entity_cd','entity_name','year','subgroup_name']

In [None]:
#nys_grad_rate_22.groupby(['entity_name','subgroup_name']).last()[95:103]

In [None]:
#shared_cols

In [None]:
#pd.merge(pd.merge(nys_annual_regents,nys_grad_rate_22,how='inner',on=shared_cols),nys_absenteeism_df,
#        how='inner',on=shared_cols)

In [None]:
#pd.merge(pd.merge(nys_absenteeism_df,nys_assessment_df, on = shared_cols),nys_grad_rate_df, on = shared_cols)

In [None]:
#nys_final_df = pd.merge(nys_absenteeism_df,nys_grad_rate_df,how='inner',on=['entity_cd','entity_name','year','subgroup_name'])

In [None]:
#nys_final_df

In [None]:
#nys_final_df = nys_final_df[nys_final_df.grad_count.str.contains('s')==False]
#nys_final_df = nys_final_df[nys_final_df.absent_count.str.contains('s') == False]

In [None]:
#nys_final_df

In [None]:
#copy_1[copy_1.cohort == '6-Year']

In [None]:
#copy_2[copy_2.membership_desc.str.contains('6')]

In [None]:
#nys_grad_rate_df