Business question: What are the most significant predictors of student achievement on a school level?

In [1]:
# Import all necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
import seaborn as sn

In [2]:
#View options
#'display.float_format', lambda x: '%8.0f' % x
pd.set_option('display.max_columns', None, 'display.max_rows', None)

# Data Preparation and Cleaning

In [3]:
#Dealing with characteristic and location data first (this is merge 1)
df = pd.read_csv('data/us_doe_school_characteristics.csv',encoding='cp1252')

In [4]:
df.shape

(99763, 20)

In [5]:
df1 = df.drop(['FIPST','ST','STATE_AGENCY_NO','UNION','SHARED_TIME','TITLEI_STATUS_TEXT','MAGNET_TEXT','NSLP_STATUS','NSLP_STATUS_TEXT','VIRTUAL_TEXT'], axis=1).set_index(['ST_SCHID'])

In [6]:
df1.index.dtype

dtype('O')

In [7]:
df1.head()

Unnamed: 0_level_0,SCHOOL_YEAR,STATENAME,SCH_NAME,ST_LEAID,LEAID,NCESSCH,SCHID,TITLEI_STATUS,VIRTUAL
ST_SCHID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AL-101-0010,2020-2021,ALABAMA,Albertville Middle School,AL-101,100005,10000500000.0,100870,SWELIGTGPROG,NOTVIRTUAL
AL-101-0020,2020-2021,ALABAMA,Albertville High School,AL-101,100005,10000500000.0,100871,SWELIGTGPROG,NOTVIRTUAL
AL-101-0110,2020-2021,ALABAMA,Albertville Intermediate School,AL-101,100005,10000500000.0,100879,SWELIGTGPROG,NOTVIRTUAL
AL-101-0200,2020-2021,ALABAMA,Albertville Elementary School,AL-101,100005,10000500000.0,100889,SWELIGTGPROG,NOTVIRTUAL
AL-101-0035,2020-2021,ALABAMA,Albertville Kindergarten and PreK,AL-101,100005,10000500000.0,101616,SWELIGTGPROG,NOTVIRTUAL


In [8]:
locations_df = pd.read_csv('data/us_doe_locations.csv', encoding = 'cp1252')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [9]:
locations_df.duplicated(['ST_SCHID']).sum()

0

In [10]:
df2 = locations_df[['ST_SCHID', 'LSTREET1','LCITY','LSTATE','LZIP','LEVEL']]

In [11]:
df2 = df2.set_index(['ST_SCHID'])

In [12]:
df1.shape

(99763, 9)

In [13]:
df2.shape

(101662, 5)

In [14]:
df2.dtypes

LSTREET1    object
LCITY       object
LSTATE      object
LZIP         int64
LEVEL       object
dtype: object

In [15]:
merged1 = df1.merge(df2, how='outer', left_index=True, right_index=True).reset_index()

In [16]:
merged1.shape

(101662, 15)

In [17]:
merged1.isna().sum()

ST_SCHID            0
SCHOOL_YEAR      1899
STATENAME        1899
SCH_NAME         1899
ST_LEAID         1899
LEAID            1899
NCESSCH          1899
SCHID            1899
TITLEI_STATUS    1899
VIRTUAL          1899
LSTREET1            3
LCITY               0
LSTATE              0
LZIP                0
LEVEL               0
dtype: int64

In [18]:
#About 2% of our data in this first merged dataframe contain nulls. Let's get rid of them
merged1 = merged1.dropna()

In [19]:
merged1['NCESSCH'] = merged1['NCESSCH'].astype(int)

In [20]:
enrolls = pd.read_csv('data/us_doe_enrollments.csv',encoding='cp1252')

In [21]:
enrolls.duplicated(['State School ID [Public School] 2020-21']).sum()

6

In [22]:
enrolls_dupes = enrolls.loc[enrolls.duplicated(['State School ID [Public School] 2020-21'])]

In [23]:
df4 = enrolls.drop_duplicates(subset = ['State School ID [Public School] 2020-21']).dropna()

In [24]:
df4.dtypes

ï»¿School Name                                                      object
State Name [Public School] Latest available year                    object
School ID - NCES Assigned [Public School] Latest available year    float64
State Name [Public School] 2020-21                                  object
School Name [Public School] 2020-21                                 object
Agency ID - NCES Assigned [Public School] Latest available year    float64
ANSI/FIPS State Code [Public School] Latest available year         float64
State Agency ID [Public School] 2020-21                             object
State School ID [Public School] 2020-21                             object
Total Students All Grades (Excludes AE) [Public School] 2020-21     object
Pupil/Teacher Ratio [Public School] 2020-21                         object
dtype: object

In [25]:
df4.shape

(99568, 11)

In [26]:
merged1.dtypes

ST_SCHID          object
SCHOOL_YEAR       object
STATENAME         object
SCH_NAME          object
ST_LEAID          object
LEAID            float64
NCESSCH            int64
SCHID            float64
TITLEI_STATUS     object
VIRTUAL           object
LSTREET1          object
LCITY             object
LSTATE            object
LZIP               int64
LEVEL             object
dtype: object

In [27]:
merged1.shape

(99760, 15)

In [28]:
enroll2 = df4[['State School ID [Public School] 2020-21','Total Students All Grades (Excludes AE) [Public School] 2020-21','Pupil/Teacher Ratio [Public School] 2020-21']]


In [29]:
enroll_final = enroll2.rename(columns={'State School ID [Public School] 2020-21': 'ST_SCHID', 'Total Students All Grades (Excludes AE) [Public School] 2020-21': 'Total Students', 'Pupil/Teacher Ratio [Public School] 2020-21':'Student:Teacher Ratio'})
                                       

In [30]:
enroll_final.head()

Unnamed: 0,ST_SCHID,Total Students,Student:Teacher Ratio
0,MA-0170-01700045,1139,11.89
1,NV-02-02093,352,35.2
2,NV-02-02364,163,16.3
3,WA-31025-1656,175,23.33
4,MN-010112-010112067,34,â€“


In [31]:
us_df = pd.merge(merged1, enroll_final, on='ST_SCHID',how='right')

In [32]:
us_df.head()

Unnamed: 0,ST_SCHID,SCHOOL_YEAR,STATENAME,SCH_NAME,ST_LEAID,LEAID,NCESSCH,SCHID,TITLEI_STATUS,VIRTUAL,LSTREET1,LCITY,LSTATE,LZIP,LEVEL,Total Students,Student:Teacher Ratio
0,MA-0170-01700045,2020-2021,MASSACHUSETTS,1 LT Charles W. Whitcomb School,MA-0170,2507320.0,250732000000.0,2502639.0,SWELIGSWPROG,SUPPVIRTUAL,25 Union Street,Marlborough,MA,1752.0,Middle,1139,11.89
1,NV-02-02093,2020-2021,NEVADA,100 Academy of Excellence ES,NV-02,3200060.0,320006000000.0,3200670.0,SWELIGSWPROG,NOTVIRTUAL,2341 Comstock Dr,N Las Vegas,NV,89032.0,Elementary,352,35.2
2,NV-02-02364,2020-2021,NEVADA,100 Academy of Excellence MS,NV-02,3200060.0,320006000000.0,3200756.0,SWELIGSWPROG,NOTVIRTUAL,2341 Comstock Dr,N Las Vegas,NV,89032.0,Middle,163,16.3
3,WA-31025-1656,2020-2021,WASHINGTON,10th Street School,WA-31025,5304860.0,530486000000.0,5302475.0,NOTTITLE1ELIG,NOTVIRTUAL,7204 27th Ave NE,Marysville,WA,98271.0,Middle,175,23.33
4,MN-010112-010112067,2020-2021,MINNESOTA,112 ALC Independent Study,MN-010112,2708190.0,270819000000.0,2704415.0,TGELGBNOPROG,NOTVIRTUAL,11 PEAVEY RD,CHASKA,MN,55317.0,High,34,â€“


In [33]:
maybe_final = us_df[['ST_SCHID','STATENAME','SCH_NAME','LEAID','ST_LEAID','TITLEI_STATUS','VIRTUAL','LSTREET1','LCITY','LSTATE','LZIP','LEVEL','Total Students', 'Student:Teacher Ratio']]

In [34]:
maybe_final.head()

Unnamed: 0,ST_SCHID,STATENAME,SCH_NAME,LEAID,ST_LEAID,TITLEI_STATUS,VIRTUAL,LSTREET1,LCITY,LSTATE,LZIP,LEVEL,Total Students,Student:Teacher Ratio
0,MA-0170-01700045,MASSACHUSETTS,1 LT Charles W. Whitcomb School,2507320.0,MA-0170,SWELIGSWPROG,SUPPVIRTUAL,25 Union Street,Marlborough,MA,1752.0,Middle,1139,11.89
1,NV-02-02093,NEVADA,100 Academy of Excellence ES,3200060.0,NV-02,SWELIGSWPROG,NOTVIRTUAL,2341 Comstock Dr,N Las Vegas,NV,89032.0,Elementary,352,35.2
2,NV-02-02364,NEVADA,100 Academy of Excellence MS,3200060.0,NV-02,SWELIGSWPROG,NOTVIRTUAL,2341 Comstock Dr,N Las Vegas,NV,89032.0,Middle,163,16.3
3,WA-31025-1656,WASHINGTON,10th Street School,5304860.0,WA-31025,NOTTITLE1ELIG,NOTVIRTUAL,7204 27th Ave NE,Marysville,WA,98271.0,Middle,175,23.33
4,MN-010112-010112067,MINNESOTA,112 ALC Independent Study,2708190.0,MN-010112,TGELGBNOPROG,NOTVIRTUAL,11 PEAVEY RD,CHASKA,MN,55317.0,High,34,â€“


In [35]:
pops = pd.read_csv("data/population_by_zip_2010.csv")

In [36]:
pops.head()

Unnamed: 0,population,minimum_age,maximum_age,gender,zipcode,geo_id
0,50,30.0,34.0,female,61747,8600000US61747
1,5,85.0,,male,64120,8600000US64120
2,1389,30.0,34.0,male,95117,8600000US95117
3,231,60.0,61.0,female,74074,8600000US74074
4,56,0.0,4.0,female,58042,8600000US58042


In [37]:
maybe_final.head()

Unnamed: 0,ST_SCHID,STATENAME,SCH_NAME,LEAID,ST_LEAID,TITLEI_STATUS,VIRTUAL,LSTREET1,LCITY,LSTATE,LZIP,LEVEL,Total Students,Student:Teacher Ratio
0,MA-0170-01700045,MASSACHUSETTS,1 LT Charles W. Whitcomb School,2507320.0,MA-0170,SWELIGSWPROG,SUPPVIRTUAL,25 Union Street,Marlborough,MA,1752.0,Middle,1139,11.89
1,NV-02-02093,NEVADA,100 Academy of Excellence ES,3200060.0,NV-02,SWELIGSWPROG,NOTVIRTUAL,2341 Comstock Dr,N Las Vegas,NV,89032.0,Elementary,352,35.2
2,NV-02-02364,NEVADA,100 Academy of Excellence MS,3200060.0,NV-02,SWELIGSWPROG,NOTVIRTUAL,2341 Comstock Dr,N Las Vegas,NV,89032.0,Middle,163,16.3
3,WA-31025-1656,WASHINGTON,10th Street School,5304860.0,WA-31025,NOTTITLE1ELIG,NOTVIRTUAL,7204 27th Ave NE,Marysville,WA,98271.0,Middle,175,23.33
4,MN-010112-010112067,MINNESOTA,112 ALC Independent Study,2708190.0,MN-010112,TGELGBNOPROG,NOTVIRTUAL,11 PEAVEY RD,CHASKA,MN,55317.0,High,34,â€“


In [38]:
maybe_final['LZIP'] = maybe_final['LZIP'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maybe_final['LZIP'] = maybe_final['LZIP'].astype(str)


In [39]:
maybe_final.head()

Unnamed: 0,ST_SCHID,STATENAME,SCH_NAME,LEAID,ST_LEAID,TITLEI_STATUS,VIRTUAL,LSTREET1,LCITY,LSTATE,LZIP,LEVEL,Total Students,Student:Teacher Ratio
0,MA-0170-01700045,MASSACHUSETTS,1 LT Charles W. Whitcomb School,2507320.0,MA-0170,SWELIGSWPROG,SUPPVIRTUAL,25 Union Street,Marlborough,MA,1752.0,Middle,1139,11.89
1,NV-02-02093,NEVADA,100 Academy of Excellence ES,3200060.0,NV-02,SWELIGSWPROG,NOTVIRTUAL,2341 Comstock Dr,N Las Vegas,NV,89032.0,Elementary,352,35.2
2,NV-02-02364,NEVADA,100 Academy of Excellence MS,3200060.0,NV-02,SWELIGSWPROG,NOTVIRTUAL,2341 Comstock Dr,N Las Vegas,NV,89032.0,Middle,163,16.3
3,WA-31025-1656,WASHINGTON,10th Street School,5304860.0,WA-31025,NOTTITLE1ELIG,NOTVIRTUAL,7204 27th Ave NE,Marysville,WA,98271.0,Middle,175,23.33
4,MN-010112-010112067,MINNESOTA,112 ALC Independent Study,2708190.0,MN-010112,TGELGBNOPROG,NOTVIRTUAL,11 PEAVEY RD,CHASKA,MN,55317.0,High,34,â€“


In [40]:
maybe_final['LZIP'] = maybe_final['LZIP'].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maybe_final['LZIP'] = maybe_final['LZIP'].astype(float)


In [41]:
maybe_final.isna().sum()

ST_SCHID                   0
STATENAME                962
SCH_NAME                 962
LEAID                    962
ST_LEAID                 962
TITLEI_STATUS            962
VIRTUAL                  962
LSTREET1                 962
LCITY                    962
LSTATE                   962
LZIP                     962
LEVEL                    962
Total Students             0
Student:Teacher Ratio      0
dtype: int64

In [42]:
maybe_final = maybe_final.dropna()

In [43]:
maybe_final['LZIP'] = maybe_final['LZIP'].astype(int)

In [44]:
maybe_final.head()

Unnamed: 0,ST_SCHID,STATENAME,SCH_NAME,LEAID,ST_LEAID,TITLEI_STATUS,VIRTUAL,LSTREET1,LCITY,LSTATE,LZIP,LEVEL,Total Students,Student:Teacher Ratio
0,MA-0170-01700045,MASSACHUSETTS,1 LT Charles W. Whitcomb School,2507320.0,MA-0170,SWELIGSWPROG,SUPPVIRTUAL,25 Union Street,Marlborough,MA,1752,Middle,1139,11.89
1,NV-02-02093,NEVADA,100 Academy of Excellence ES,3200060.0,NV-02,SWELIGSWPROG,NOTVIRTUAL,2341 Comstock Dr,N Las Vegas,NV,89032,Elementary,352,35.2
2,NV-02-02364,NEVADA,100 Academy of Excellence MS,3200060.0,NV-02,SWELIGSWPROG,NOTVIRTUAL,2341 Comstock Dr,N Las Vegas,NV,89032,Middle,163,16.3
3,WA-31025-1656,WASHINGTON,10th Street School,5304860.0,WA-31025,NOTTITLE1ELIG,NOTVIRTUAL,7204 27th Ave NE,Marysville,WA,98271,Middle,175,23.33
4,MN-010112-010112067,MINNESOTA,112 ALC Independent Study,2708190.0,MN-010112,TGELGBNOPROG,NOTVIRTUAL,11 PEAVEY RD,CHASKA,MN,55317,High,34,â€“


In [45]:
pops2 = pops.drop(columns = ['minimum_age','maximum_age','gender','geo_id'], axis=1)

In [46]:
pops2.head()

Unnamed: 0,population,zipcode
0,50,61747
1,5,64120
2,1389,95117
3,231,74074
4,56,58042


In [47]:
pops2.shape

(1622831, 2)

In [48]:
#Aggregating 
zipcode_pops = pops2.groupby(['zipcode']).sum().reset_index()

In [49]:
zipcode_pops.shape

(33119, 2)

In [50]:
zipcode_pops.dtypes

zipcode       int64
population    int64
dtype: object

In [51]:
maybe_final.shape

(98606, 14)

In [52]:
#merge zipcodes dataframe with schools dataframe
final = zipcode_pops.merge(maybe_final, how='inner', left_on='zipcode', right_on='LZIP')

In [53]:
final.shape

(98074, 16)

In [54]:
final.head()

Unnamed: 0,zipcode,population,ST_SCHID,STATENAME,SCH_NAME,LEAID,ST_LEAID,TITLEI_STATUS,VIRTUAL,LSTREET1,LCITY,LSTATE,LZIP,LEVEL,Total Students,Student:Teacher Ratio
0,1001,50307,MA-0005-00050003,MASSACHUSETTS,Agawam Early Childhood Center,2501800.0,MA-0005,NOTTITLE1ELIG,SUPPVIRTUAL,108 Perry Lane,Agawam,MA,1001,Prekindergarten,106,13.25
1,1001,50307,MA-0005-00050505,MASSACHUSETTS,Agawam High,2501800.0,MA-0005,NOTTITLE1ELIG,SUPPVIRTUAL,760 Cooper Street,Agawam,MA,1001,High,1050,11.93
2,1001,50307,MA-0005-00050020,MASSACHUSETTS,Benjamin J Phelps,2501800.0,MA-0005,SWELIGSWPROG,SUPPVIRTUAL,689 Main Street,Agawam,MA,1001,Elementary,350,12.89
3,1001,50307,MA-0005-00050030,MASSACHUSETTS,James Clark School,2501800.0,MA-0005,SWELIGSWPROG,SUPPVIRTUAL,65 Oxford Street,Agawam,MA,1001,Elementary,269,10.02
4,1001,50307,MA-0005-00050303,MASSACHUSETTS,Roberta G. Doering School,2501800.0,MA-0005,NOTTITLE1ELIG,SUPPVIRTUAL,68 Main Street,Agawam,MA,1001,Middle,527,11.21


In [55]:
final['density'] = np.nan

In [56]:
# Enumerate to replace values in ['density'] with population category
for i, element in enumerate(final['population']): 
    if element >= 100000:
        final['density'][i] = 'urban'
    elif element >= 25000 & element <= 99999: 
        final['density'][i] = 'midsize'
    elif element <= 24999:
        final['density'][i] = 'rural'  

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final['density'][i] = 'midsize'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final['density'][i] = 'urban'


In [57]:
final.head()

Unnamed: 0,zipcode,population,ST_SCHID,STATENAME,SCH_NAME,LEAID,ST_LEAID,TITLEI_STATUS,VIRTUAL,LSTREET1,LCITY,LSTATE,LZIP,LEVEL,Total Students,Student:Teacher Ratio,density
0,1001,50307,MA-0005-00050003,MASSACHUSETTS,Agawam Early Childhood Center,2501800.0,MA-0005,NOTTITLE1ELIG,SUPPVIRTUAL,108 Perry Lane,Agawam,MA,1001,Prekindergarten,106,13.25,midsize
1,1001,50307,MA-0005-00050505,MASSACHUSETTS,Agawam High,2501800.0,MA-0005,NOTTITLE1ELIG,SUPPVIRTUAL,760 Cooper Street,Agawam,MA,1001,High,1050,11.93,midsize
2,1001,50307,MA-0005-00050020,MASSACHUSETTS,Benjamin J Phelps,2501800.0,MA-0005,SWELIGSWPROG,SUPPVIRTUAL,689 Main Street,Agawam,MA,1001,Elementary,350,12.89,midsize
3,1001,50307,MA-0005-00050030,MASSACHUSETTS,James Clark School,2501800.0,MA-0005,SWELIGSWPROG,SUPPVIRTUAL,65 Oxford Street,Agawam,MA,1001,Elementary,269,10.02,midsize
4,1001,50307,MA-0005-00050303,MASSACHUSETTS,Roberta G. Doering School,2501800.0,MA-0005,NOTTITLE1ELIG,SUPPVIRTUAL,68 Main Street,Agawam,MA,1001,Middle,527,11.21,midsize


In [58]:
districts = pd.read_excel('data/elsec19t.xls')

In [59]:
district_info = districts[['NAME','NCESID','LOCRPROP','PCTLTAXP','PPCSTOT','PPSPUPIL','PPSSTAFF','PPISALWG']]

In [60]:
district_info.head()

Unnamed: 0,NAME,NCESID,LOCRPROP,PCTLTAXP,PPCSTOT,PPSPUPIL,PPSSTAFF,PPISALWG
0,AUTAUGA COUNTY SCHOOL DISTRICT,100240,7637,8.7,8600,512,241,3357
1,BALDWIN COUNTY SCHOOL DISTRICT,100270,57153,14.4,10046,620,396,3665
2,BARBOUR COUNTY SCHOOL DISTRICT,100300,1429,14.3,12328,518,474,3816
3,EUFAULA CITY SCHOOL DISTRICT,101410,3208,7.6,5298,321,151,1863
4,BIBB COUNTY SCHOOL DISTRICT,100360,1571,4.5,9909,656,504,3361


In [61]:
district_info.dtypes

NAME         object
NCESID       object
LOCRPROP      int64
PCTLTAXP    float64
PPCSTOT       int64
PPSPUPIL      int64
PPSSTAFF      int64
PPISALWG      int64
dtype: object

In [62]:
final.head()

Unnamed: 0,zipcode,population,ST_SCHID,STATENAME,SCH_NAME,LEAID,ST_LEAID,TITLEI_STATUS,VIRTUAL,LSTREET1,LCITY,LSTATE,LZIP,LEVEL,Total Students,Student:Teacher Ratio,density
0,1001,50307,MA-0005-00050003,MASSACHUSETTS,Agawam Early Childhood Center,2501800.0,MA-0005,NOTTITLE1ELIG,SUPPVIRTUAL,108 Perry Lane,Agawam,MA,1001,Prekindergarten,106,13.25,midsize
1,1001,50307,MA-0005-00050505,MASSACHUSETTS,Agawam High,2501800.0,MA-0005,NOTTITLE1ELIG,SUPPVIRTUAL,760 Cooper Street,Agawam,MA,1001,High,1050,11.93,midsize
2,1001,50307,MA-0005-00050020,MASSACHUSETTS,Benjamin J Phelps,2501800.0,MA-0005,SWELIGSWPROG,SUPPVIRTUAL,689 Main Street,Agawam,MA,1001,Elementary,350,12.89,midsize
3,1001,50307,MA-0005-00050030,MASSACHUSETTS,James Clark School,2501800.0,MA-0005,SWELIGSWPROG,SUPPVIRTUAL,65 Oxford Street,Agawam,MA,1001,Elementary,269,10.02,midsize
4,1001,50307,MA-0005-00050303,MASSACHUSETTS,Roberta G. Doering School,2501800.0,MA-0005,NOTTITLE1ELIG,SUPPVIRTUAL,68 Main Street,Agawam,MA,1001,Middle,527,11.21,midsize


In [63]:
final['LEAID'] = maybe_final['LEAID'].astype(int)

In [64]:
district_info.loc[district_info['NCESID'] == '09D0001']

Unnamed: 0,NAME,NCESID,LOCRPROP,PCTLTAXP,PPCSTOT,PPSPUPIL,PPSSTAFF,PPISALWG
2074,COMMITTEE FOR SHARED SERVICES,09D0001,0,0.0,0,0,0,0


In [65]:
district_info['ncesid_alpha'] = list(map(lambda x: x.isnumeric(), district_info['NCESID']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  district_info['ncesid_alpha'] = list(map(lambda x: x.isnumeric(), district_info['NCESID']))


In [66]:
district_info['ncesid_alpha'].value_counts()

True     14188
False        9
Name: ncesid_alpha, dtype: int64

In [67]:
district_info = district_info.drop(district_info[(district_info.ncesid_alpha == False)].index)
                                                 

In [68]:
district_info['NCESID'] = district_info['NCESID'].astype(int)

In [69]:
merged2 = district_info.merge(final, how='inner',left_on='NCESID', right_on='LEAID')

In [70]:
merged2.head()

Unnamed: 0,NAME,NCESID,LOCRPROP,PCTLTAXP,PPCSTOT,PPSPUPIL,PPSSTAFF,PPISALWG,ncesid_alpha,zipcode,population,ST_SCHID,STATENAME,SCH_NAME,LEAID,ST_LEAID,TITLEI_STATUS,VIRTUAL,LSTREET1,LCITY,LSTATE,LZIP,LEVEL,Total Students,Student:Teacher Ratio,density
0,AUTAUGA COUNTY SCHOOL DISTRICT,100240,7637,8.7,8600,512,241,3357,True,6704,75417,CT-1510011-1516311,CONNECTICUT,Wilby High School,100240.0,CT-1510011,SWELIGSWPROG,SUPPVIRTUAL,568 Bucks Hill Rd.,Waterbury,CT,6704,High,1091,14.47,midsize
1,AUTAUGA COUNTY SCHOOL DISTRICT,100240,7637,8.7,8600,512,241,3357,True,6704,75417,CT-1510011-1513211,CONNECTICUT,Woodrow Wilson School,100240.0,CT-1510011,SWELIGSWPROG,SUPPVIRTUAL,235 Birch St.,Waterbury,CT,6704,Elementary,414,13.89,midsize
2,AUTAUGA COUNTY SCHOOL DISTRICT,100240,7637,8.7,8600,512,241,3357,True,6705,81366,CT-1510011-1516211,CONNECTICUT,Crosby High School,100240.0,CT-1510011,SWELIGSWPROG,SUPPVIRTUAL,300 Pierpont Rd.,Waterbury,CT,6705,High,1217,13.14,midsize
3,AUTAUGA COUNTY SCHOOL DISTRICT,100240,7637,8.7,8600,512,241,3357,True,10032,171993,NY-310600861013-310600861013,NEW YORK,KIPP NYC WASHINGTON HEIGHTS ACADEMY CHARTER SC...,100240.0,NY-310600861013,SWELIGSWPROG,SUPPVIRTUAL,21 JUMEL PL-RM D109,NEW YORK,NY,10032,Elementary,844,14.47,urban
4,AUTAUGA COUNTY SCHOOL DISTRICT,100240,7637,8.7,8600,512,241,3357,True,27701,63864,NC-32M-000,NORTH CAROLINA,Global Scholars Academy,100240.0,NC-32M,SWELIGSWPROG,NOTVIRTUAL,311 Dowd St,Durham,NC,27701,Elementary,206,11.31,midsize


In [71]:
income = pd.read_csv('data/income.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [72]:
income = income[['NAME','S1903_C03_001E']]
income.head()

Unnamed: 0,NAME,S1903_C03_001E
0,Geographic Area Name,Estimate!!Median income (dollars)!!HOUSEHOLD I...
1,ZCTA5 00601,14361
2,ZCTA5 00602,16807
3,ZCTA5 00603,16049
4,ZCTA5 00606,12119


In [73]:
#Clean up column headings and rows
income = income.iloc[1: , :].rename(columns={'NAME':'zipcode', 'S1903_C03_001E':'median_income'})

In [74]:
#Remove string of first 5 characters from each entry in zip code column
income['zipcode'] = income['zipcode'].map(lambda x: x.lstrip('ZCTA5 ')) 

In [75]:
income.head()

Unnamed: 0,zipcode,median_income
1,601,14361
2,602,16807
3,603,16049
4,606,12119
5,610,19898


In [76]:
income.dtypes

zipcode          object
median_income    object
dtype: object

In [77]:
income['zipcode'] = income['zipcode'].astype(int)

In [78]:
income.shape

(33120, 2)

In [79]:
merged2.shape

(88761, 26)

In [80]:
#Merge median income data with school data
merged3 = merged2.merge(income, how='inner', left_on='zipcode',right_on='zipcode')

In [81]:
merged3.shape

(83780, 27)

In [82]:
merged3.head()

Unnamed: 0,NAME,NCESID,LOCRPROP,PCTLTAXP,PPCSTOT,PPSPUPIL,PPSSTAFF,PPISALWG,ncesid_alpha,zipcode,population,ST_SCHID,STATENAME,SCH_NAME,LEAID,ST_LEAID,TITLEI_STATUS,VIRTUAL,LSTREET1,LCITY,LSTATE,LZIP,LEVEL,Total Students,Student:Teacher Ratio,density,median_income
0,AUTAUGA COUNTY SCHOOL DISTRICT,100240,7637,8.7,8600,512,241,3357,True,6704,75417,CT-1510011-1516311,CONNECTICUT,Wilby High School,100240.0,CT-1510011,SWELIGSWPROG,SUPPVIRTUAL,568 Bucks Hill Rd.,Waterbury,CT,6704,High,1091,14.47,midsize,40625
1,AUTAUGA COUNTY SCHOOL DISTRICT,100240,7637,8.7,8600,512,241,3357,True,6704,75417,CT-1510011-1513211,CONNECTICUT,Woodrow Wilson School,100240.0,CT-1510011,SWELIGSWPROG,SUPPVIRTUAL,235 Birch St.,Waterbury,CT,6704,Elementary,414,13.89,midsize,40625
2,DECATUR CITY SCHOOL DISTRICT,101170,19527,17.5,11406,590,436,4418,True,6704,75417,CT-1510011-1512011,CONNECTICUT,Sprague School,101170.0,CT-1510011,SWELIGSWPROG,SUPPVIRTUAL,1443 Thomaston Ave.,Waterbury,CT,6704,Elementary,388,11.9,midsize,40625
3,GENEVA AREA CITY SCH DIST,3904405,8456,33.4,9876,614,80,4060,True,6704,75417,CT-1510011-1510511,CONNECTICUT,Bucks Hill School,3904405.0,CT-1510011,SWELIGSWPROG,SUPPVIRTUAL,330 Bucks Hill Rd.,Waterbury,CT,6704,Elementary,312,8.13,midsize,40625
4,AUSTINTOWN SCH DIST,3904829,19943,35.7,10253,694,290,3987,True,6704,75417,CT-1510011-1515311,CONNECTICUT,North End Middle School,3904829.0,CT-1510011,SWELIGSWPROG,SUPPVIRTUAL,534 Bucks Hill Rd.,Waterbury,CT,6704,Middle,896,11.55,midsize,40625


In [83]:
#Read in target data
target = pd.read_csv('data/scores.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [84]:
target.head()

Unnamed: 0,STNAM,FIPST,LEANM,LEAID,ST_LEAID,NCESSCH,ST_SCHID,SCHNAM,DATE_CUR,ALL_RLA00numvalid_1819,ALL_RLA00pctprof_1819,MAM_RLA00numvalid_1819,MAM_RLA00pctprof_1819,MAS_RLA00numvalid_1819,MAS_RLA00pctprof_1819,MBL_RLA00numvalid_1819,MBL_RLA00pctprof_1819,MHI_RLA00numvalid_1819,MHI_RLA00pctprof_1819,MTR_RLA00numvalid_1819,MTR_RLA00pctprof_1819,MWH_RLA00numvalid_1819,MWH_RLA00pctprof_1819,F_RLA00numvalid_1819,F_RLA00pctprof_1819,M_RLA00numvalid_1819,M_RLA00pctprof_1819,CWD_RLA00numvalid_1819,CWD_RLA00pctprof_1819,ECD_RLA00numvalid_1819,ECD_RLA00pctprof_1819,FCS_RLA00numvalid_1819,FCS_RLA00pctprof_1819,LEP_RLA00numvalid_1819,LEP_RLA00pctprof_1819,HOM_RLA00numvalid_1819,HOM_RLA00pctprof_1819,MIG_RLA00numvalid_1819,MIG_RLA00pctprof_1819,MIL_RLA00numvalid_1819,MIL_RLA00pctprof_1819,ALL_RLA03numvalid_1819,ALL_RLA03pctprof_1819,ALL_RLA04numvalid_1819,ALL_RLA04pctprof_1819,ALL_RLA05numvalid_1819,ALL_RLA05pctprof_1819,ALL_RLA06numvalid_1819,ALL_RLA06pctprof_1819,ALL_RLA07numvalid_1819,ALL_RLA07pctprof_1819,ALL_RLA08numvalid_1819,ALL_RLA08pctprof_1819,ALL_RLAHSnumvalid_1819,ALL_RLAHSpctprof_1819,MAM_RLA03numvalid_1819,MAM_RLA03pctprof_1819,MAM_RLA04numvalid_1819,MAM_RLA04pctprof_1819,MAM_RLA05numvalid_1819,MAM_RLA05pctprof_1819,MAM_RLA06numvalid_1819,MAM_RLA06pctprof_1819,MAM_RLA07numvalid_1819,MAM_RLA07pctprof_1819,MAM_RLA08numvalid_1819,MAM_RLA08pctprof_1819,MAM_RLAHSnumvalid_1819,MAM_RLAHSpctprof_1819,MAS_RLA03numvalid_1819,MAS_RLA03pctprof_1819,MAS_RLA04numvalid_1819,MAS_RLA04pctprof_1819,MAS_RLA05numvalid_1819,MAS_RLA05pctprof_1819,MAS_RLA06numvalid_1819,MAS_RLA06pctprof_1819,MAS_RLA07numvalid_1819,MAS_RLA07pctprof_1819,MAS_RLA08numvalid_1819,MAS_RLA08pctprof_1819,MAS_RLAHSnumvalid_1819,MAS_RLAHSpctprof_1819,MBL_RLA03numvalid_1819,MBL_RLA03pctprof_1819,MBL_RLA04numvalid_1819,MBL_RLA04pctprof_1819,MBL_RLA05numvalid_1819,MBL_RLA05pctprof_1819,MBL_RLA06numvalid_1819,MBL_RLA06pctprof_1819,MBL_RLA07numvalid_1819,MBL_RLA07pctprof_1819,MBL_RLA08numvalid_1819,MBL_RLA08pctprof_1819,MBL_RLAHSnumvalid_1819,MBL_RLAHSpctprof_1819,MHI_RLA03numvalid_1819,MHI_RLA03pctprof_1819,MHI_RLA04numvalid_1819,MHI_RLA04pctprof_1819,MHI_RLA05numvalid_1819,MHI_RLA05pctprof_1819,MHI_RLA06numvalid_1819,MHI_RLA06pctprof_1819,MHI_RLA07numvalid_1819,MHI_RLA07pctprof_1819,MHI_RLA08numvalid_1819,MHI_RLA08pctprof_1819,MHI_RLAHSnumvalid_1819,MHI_RLAHSpctprof_1819,MTR_RLA03numvalid_1819,MTR_RLA03pctprof_1819,MTR_RLA04numvalid_1819,MTR_RLA04pctprof_1819,MTR_RLA05numvalid_1819,MTR_RLA05pctprof_1819,MTR_RLA06numvalid_1819,MTR_RLA06pctprof_1819,MTR_RLA07numvalid_1819,MTR_RLA07pctprof_1819,MTR_RLA08numvalid_1819,MTR_RLA08pctprof_1819,MTR_RLAHSnumvalid_1819,MTR_RLAHSpctprof_1819,MWH_RLA03numvalid_1819,MWH_RLA03pctprof_1819,MWH_RLA04numvalid_1819,MWH_RLA04pctprof_1819,MWH_RLA05numvalid_1819,MWH_RLA05pctprof_1819,MWH_RLA06numvalid_1819,MWH_RLA06pctprof_1819,MWH_RLA07numvalid_1819,MWH_RLA07pctprof_1819,MWH_RLA08numvalid_1819,MWH_RLA08pctprof_1819,MWH_RLAHSnumvalid_1819,MWH_RLAHSpctprof_1819,F_RLA03numvalid_1819,F_RLA03pctprof_1819,F_RLA04numvalid_1819,F_RLA04pctprof_1819,F_RLA05numvalid_1819,F_RLA05pctprof_1819,F_RLA06numvalid_1819,F_RLA06pctprof_1819,F_RLA07numvalid_1819,F_RLA07pctprof_1819,F_RLA08numvalid_1819,F_RLA08pctprof_1819,F_RLAHSnumvalid_1819,F_RLAHSpctprof_1819,M_RLA03numvalid_1819,M_RLA03pctprof_1819,M_RLA04numvalid_1819,M_RLA04pctprof_1819,M_RLA05numvalid_1819,M_RLA05pctprof_1819,M_RLA06numvalid_1819,M_RLA06pctprof_1819,M_RLA07numvalid_1819,M_RLA07pctprof_1819,M_RLA08numvalid_1819,M_RLA08pctprof_1819,M_RLAHSnumvalid_1819,M_RLAHSpctprof_1819,CWD_RLA03numvalid_1819,CWD_RLA03pctprof_1819,CWD_RLA04numvalid_1819,CWD_RLA04pctprof_1819,CWD_RLA05numvalid_1819,CWD_RLA05pctprof_1819,CWD_RLA06numvalid_1819,CWD_RLA06pctprof_1819,CWD_RLA07numvalid_1819,CWD_RLA07pctprof_1819,CWD_RLA08numvalid_1819,CWD_RLA08pctprof_1819,CWD_RLAHSnumvalid_1819,CWD_RLAHSpctprof_1819,ECD_RLA03numvalid_1819,ECD_RLA03pctprof_1819,ECD_RLA04numvalid_1819,ECD_RLA04pctprof_1819,ECD_RLA05numvalid_1819,ECD_RLA05pctprof_1819,ECD_RLA06numvalid_1819,ECD_RLA06pctprof_1819,ECD_RLA07numvalid_1819,ECD_RLA07pctprof_1819,ECD_RLA08numvalid_1819,ECD_RLA08pctprof_1819,ECD_RLAHSnumvalid_1819,ECD_RLAHSpctprof_1819,FCS_RLA03numvalid_1819,FCS_RLA03pctprof_1819,FCS_RLA04numvalid_1819,FCS_RLA04pctprof_1819,FCS_RLA05numvalid_1819,FCS_RLA05pctprof_1819,FCS_RLA06numvalid_1819,FCS_RLA06pctprof_1819,FCS_RLA07numvalid_1819,FCS_RLA07pctprof_1819,FCS_RLA08numvalid_1819,FCS_RLA08pctprof_1819,FCS_RLAHSnumvalid_1819,FCS_RLAHSpctprof_1819,LEP_RLA03numvalid_1819,LEP_RLA03pctprof_1819,LEP_RLA04numvalid_1819,LEP_RLA04pctprof_1819,LEP_RLA05numvalid_1819,LEP_RLA05pctprof_1819,LEP_RLA06numvalid_1819,LEP_RLA06pctprof_1819,LEP_RLA07numvalid_1819,LEP_RLA07pctprof_1819,LEP_RLA08numvalid_1819,LEP_RLA08pctprof_1819,LEP_RLAHSnumvalid_1819,LEP_RLAHSpctprof_1819,HOM_RLA03numvalid_1819,HOM_RLA03pctprof_1819,HOM_RLA04numvalid_1819,HOM_RLA04pctprof_1819,HOM_RLA05numvalid_1819,HOM_RLA05pctprof_1819,HOM_RLA06numvalid_1819,HOM_RLA06pctprof_1819,HOM_RLA07numvalid_1819,HOM_RLA07pctprof_1819,HOM_RLA08numvalid_1819,HOM_RLA08pctprof_1819,HOM_RLAHSnumvalid_1819,HOM_RLAHSpctprof_1819,MIG_RLA03numvalid_1819,MIG_RLA03pctprof_1819,MIG_RLA04numvalid_1819,MIG_RLA04pctprof_1819,MIG_RLA05numvalid_1819,MIG_RLA05pctprof_1819,MIG_RLA06numvalid_1819,MIG_RLA06pctprof_1819,MIG_RLA07numvalid_1819,MIG_RLA07pctprof_1819,MIG_RLA08numvalid_1819,MIG_RLA08pctprof_1819,MIG_RLAHSnumvalid_1819,MIG_RLAHSpctprof_1819,MIL_RLA03numvalid_1819,MIL_RLA03pctprof_1819,MIL_RLA04numvalid_1819,MIL_RLA04pctprof_1819,MIL_RLA05numvalid_1819,MIL_RLA05pctprof_1819,MIL_RLA06numvalid_1819,MIL_RLA06pctprof_1819,MIL_RLA07numvalid_1819,MIL_RLA07pctprof_1819,MIL_RLA08numvalid_1819,MIL_RLA08pctprof_1819,MIL_RLAHSnumvalid_1819,MIL_RLAHSpctprof_1819
0,ALABAMA,1,Albertville City,100005,AL-101,10000500870,AL-101-0010,Albertville Middle School,13AUG20,796,37,,,3.0,PS,31,20-29,392,23,17,40-59,353,54,382,39,414,36,41,20-29,308,26,2,PS,89,LE5,10,LT50,34.0,11-19,20.0,40-59,,,,,,,,,396.0,36,400.0,39,,,,,,,,,,,,,,,,,,,,,,,,,1.0,PS,2.0,PS,,,,,,,,,,,15.0,LT50,16.0,21-39,,,,,,,,,,,195.0,20-24,197.0,25-29,,,,,,,,,,,9.0,GE50,8.0,LT50,,,,,,,,,,,176.0,50-54,177.0,55-59,,,,,,,,,,,183.0,35-39,199.0,40-44,,,,,,,,,,,213.0,30-34,201.0,35-39,,,,,,,,,,,17.0,21-39,24.0,21-39,,,,,,,,,,,155.0,25-29,153.0,25-29,,,,,,,,,,,1.0,PS,1.0,PS,,,,,,,,,,,45.0,LE10,44.0,LE10,,,,,,,,,,,6.0,LT50,4.0,PS,,,,,,,,,,,18.0,LE20,16.0,LE20,,,,,,,,,,,6.0,GE50,14.0,GE50,,
1,ALABAMA,1,Albertville City,100005,AL-101,10000500871,AL-101-0020,Albertville High School,13AUG20,317,31,2.0,PS,,,10,LT50,139,15-19,8,LT50,158,40-44,145,35-39,172,20-24,22,LE20,96,25-29,1,PS,35,LE10,3,PS,9.0,LT50,7.0,LT50,,,,,,,,,,,,,317.0,31.0,,,,,,,,,,,,,2.0,PS,,,,,,,,,,,,,,,,,,,,,,,,,,,10.0,LT50,,,,,,,,,,,,,139.0,15-19,,,,,,,,,,,,,8.0,LT50,,,,,,,,,,,,,158.0,40-44,,,,,,,,,,,,,145.0,35-39,,,,,,,,,,,,,172.0,20-24,,,,,,,,,,,,,22.0,LE20,,,,,,,,,,,,,96.0,25-29,,,,,,,,,,,,,1.0,PS,,,,,,,,,,,,,35.0,LE10,,,,,,,,,,,,,3.0,PS,,,,,,,,,,,,,9.0,LT50,,,,,,,,,,,,,7.0,LT50
2,ALABAMA,1,Albertville City,100005,AL-101,10000500879,AL-101-0110,Evans Elementary School,13AUG20,872,39,2.0,PS,3.0,PS,25,21-39,453,24,27,21-39,362,60,433,43,439,36,70,15-19,380,30,3,PS,148,LE5,9,GE50,30.0,LE20,20.0,60-79,,,,,434.0,39,438.0,39,,,,,,,,,,,1.0,PS,1.0,PS,,,,,,,,,,,,,3.0,PS,,,,,,,,,,,14.0,LT50,11.0,GE50,,,,,,,,,,,218.0,20-24,235.0,20-24,,,,,,,,,,,15.0,LT50,12.0,LT50,,,,,,,,,,,186.0,55-59,176.0,60-64,,,,,,,,,,,211.0,40-44,222.0,40-44,,,,,,,,,,,223.0,35-39,216.0,35-39,,,,,,,,,,,41.0,11-19,29.0,LE20,,,,,,,,,,,189.0,30-34,191.0,30-34,,,,,,,,,,,3.0,PS,,,,,,,,,,,,,84.0,LE5,64.0,LE5,,,,,,,,,,,4.0,PS,5.0,PS,,,,,,,,,,,17.0,LE20,13.0,LT50,,,,,,,,,,,9.0,GE50,11.0,GE50,,,,,,
3,ALABAMA,1,Albertville City,100005,AL-101,10000500889,AL-101-0200,Albertville Elementary School,13AUG20,884,41,4.0,PS,5.0,PS,25,21-39,463,25,34,50-59,353,59,445,42,439,39,86,20-24,378,28,1,PS,200,10-14,3,PS,28.0,21-39,23.0,21-39,442.0,40.0,442.0,41.0,,,,,,,,,,,1.0,PS,3.0,PS,,,,,,,,,,,2.0,PS,3.0,PS,,,,,,,,,,,12.0,LT50,13.0,LT50,,,,,,,,,,,246.0,25-29,217.0,20-24,,,,,,,,,,,15.0,GE50,19.0,40-59,,,,,,,,,,,166.0,55-59,187.0,60-64,,,,,,,,,,,223.0,35-39,222.0,45-49,,,,,,,,,,,219.0,40-44,220.0,35-39,,,,,,,,,,,46.0,30-39,40.0,11-19,,,,,,,,,,,196.0,25-29,182.0,30-34,,,,,,,,,,,,,1.0,PS,,,,,,,,,,,109.0,10-14,91.0,LE5,,,,,,,,,,,1.0,PS,2.0,PS,,,,,,,,,,,17.0,21-39,11.0,LT50,,,,,,,,,,,13.0,LT50,10.0,LT50,,,,,,,,,,
4,ALABAMA,1,Marshall County,100006,AL-048,10000600193,AL-048-0143,Kate Duncan Smith DAR Middle,13AUG20,424,47,,,5.0,PS,5,PS,14,LT50,20,40-59,380,48,215,55-59,209,35-39,57,LE10,246,35-39,3,PS,4,PS,16,40-59,,,,,,,,,103.0,50-54,103.0,45-49,109.0,50-54,109.0,40-44,,,,,,,,,,,,,,,,,,,,,1.0,PS,1.0,PS,,,3.0,PS,,,,,,,1.0,PS,3.0,PS,,,1.0,PS,,,,,,,5.0,PS,3.0,PS,4.0,PS,2.0,PS,,,,,,,5.0,PS,5.0,PS,5.0,PS,5.0,PS,,,,,,,91.0,45-49,91.0,45-49,100.0,50-54,98.0,40-44,,,,,,,52.0,60-69,60.0,60-69,52.0,60-69,51.0,40-49,,,,,,,51.0,30-39,43.0,20-29,57.0,40-49,58.0,40-49,,,,,,,14.0,LT50,12.0,LT50,19.0,LE20,12.0,LT50,,,,,,,57.0,40-49,67.0,30-34,61.0,40-44,61.0,30-34,,,,,,,1.0,PS,,,1.0,PS,1.0,PS,,,,,,,1.0,PS,,,2.0,PS,1.0,PS,,,,,,,2.0,PS,5.0,PS,6.0,LT50,3.0,PS,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [85]:
target.shape

(89613, 265)

In [86]:
scores = target[['SCHNAM','LEAID','NCESSCH','ST_SCHID','ALL_RLA00pctprof_1819']]

In [87]:
scores.head()

Unnamed: 0,SCHNAM,LEAID,NCESSCH,ST_SCHID,ALL_RLA00pctprof_1819
0,Albertville Middle School,100005,10000500870,AL-101-0010,37
1,Albertville High School,100005,10000500871,AL-101-0020,31
2,Evans Elementary School,100005,10000500879,AL-101-0110,39
3,Albertville Elementary School,100005,10000500889,AL-101-0200,41
4,Kate Duncan Smith DAR Middle,100006,10000600193,AL-048-0143,47


In [88]:
merged4 = merged3.merge(scores,how='inner',on='ST_SCHID')

In [89]:
merged4.shape

(74192, 31)

In [90]:
#'PS' code used to suppress data for student privacy when testing groups are small.
# This is not useful for our analysis so we want to drop all rows with this character. 
merged4.value_counts(merged4['ALL_RLA00pctprof_1819'] == 'PS')

ALL_RLA00pctprof_1819
False    73328
True       864
dtype: int64

In [91]:
merged4 = merged4.loc[merged4['ALL_RLA00pctprof_1819'] != 'PS']

In [92]:
merged4['ALL_RLA00pctprof_1819'].unique()

array(['17', '20-24', '19', '25-29', '27', '33', '35-39', '18', '60-64',
       '67', '29', '39', '32', '30-34', '80-84', '60-79', '15-19', '26',
       '40', '36', '11', '55-59', '60', '68', '40-44', '20', '14', '31',
       '15', '50-54', '45-49', '60-69', 'GE95', '11-19', '35', '37', '22',
       '49', '44', '46', '47', '42', '41', '85', '82', '77', '83', '87',
       '90', '86', '80', '66', '70-74', '23', '48', '43', '54', '12',
       '10-14', '13', '25', '16', '51', '56', 'GE80', '34', '38', '84',
       'LT50', '75', '76', '70', '55', '45', '52', '21-39', '81', '74',
       '61', 'GE99', '20-29', '53', 'LE10', '75-79', '40-49', '28', '63',
       '62', '57', '65-69', '24', '5', '50', '40-59', '69', 'LE20', '10',
       '58', '59', '65', '92', '90-94', '97', 'GE90', '80-89', '21', '30',
       '71', '50-59', '85-89', '72', '94', '64', '30-39', 'LE5', '79',
       '88', '78', '73', 'GE50', '6-9', '9', '8', '6', '91', '70-79',
       '96', '89', '95', '7', '93', '98', '4', '2', 'LE

Now that the supressed data is removed from our dataframe, we can proceed with transforming our target column. Many of the proficiency results are ranges (e.g., 70-74). Because the consequences of falsely labeling a child as "Proficient" when they are, in fact, not, are largely thought of as more serious than falsely labeling a student as "Not Proficient" when they, in fact, are proficient, I have decided to take the lower end of these ranges as the assumed level of proficiency for that district.

We'll create a function that iterates through our target column and extracts the appropriate number from that value to represent the achivement level.

In [93]:
test = []
for i, element in enumerate(merged4['ALL_RLA00pctprof_1819']): 
    if ("E" in element) or  ("T" in element): 
        test.append((element[2:])) #Drop first two characters if there is an "E" in the value
    elif "6-9" == element: 
        test.append((element[:1]))
    elif "-" in element: 
        test.append((element[:2]))   
    else: 
        test.append(element)

In [94]:
set1 = set(test)

In [95]:
merged4['target'] = test

In [96]:
merged4[['ALL_RLA00pctprof_1819','target']].head(10)

Unnamed: 0,ALL_RLA00pctprof_1819,target
0,17,17
1,20-24,20
2,19,19
3,25-29,25
4,27,27
5,33,33
6,35-39,35
7,18,18
8,60-64,60
9,19,19


In [97]:
merged4.dtypes

NAME                      object
NCESID                     int64
LOCRPROP                   int64
PCTLTAXP                 float64
PPCSTOT                    int64
PPSPUPIL                   int64
PPSSTAFF                   int64
PPISALWG                   int64
ncesid_alpha                bool
zipcode                    int64
population                 int64
ST_SCHID                  object
STATENAME                 object
SCH_NAME                  object
LEAID_x                  float64
ST_LEAID                  object
TITLEI_STATUS             object
VIRTUAL                   object
LSTREET1                  object
LCITY                     object
LSTATE                    object
LZIP                       int64
LEVEL                     object
Total Students            object
Student:Teacher Ratio     object
density                   object
median_income             object
SCHNAM                    object
LEAID_y                    int64
NCESSCH                    int64
ALL_RLA00p

In [98]:
test2 = merged4.copy().reset_index()

test2['target_conv'] = 0

for i, number in enumerate(test2['target']):
    if int(number) >= 50: 
        test2.at[i,'target_conv'] = 1 
    else: 
        test2.at[i,'target_conv'] = 0

In [99]:
set2 = set(test2['target_conv'])

In [100]:
test2.head()

Unnamed: 0,index,NAME,NCESID,LOCRPROP,PCTLTAXP,PPCSTOT,PPSPUPIL,PPSSTAFF,PPISALWG,ncesid_alpha,zipcode,population,ST_SCHID,STATENAME,SCH_NAME,LEAID_x,ST_LEAID,TITLEI_STATUS,VIRTUAL,LSTREET1,LCITY,LSTATE,LZIP,LEVEL,Total Students,Student:Teacher Ratio,density,median_income,SCHNAM,LEAID_y,NCESSCH,ALL_RLA00pctprof_1819,target,target_conv
0,0,AUTAUGA COUNTY SCHOOL DISTRICT,100240,7637,8.7,8600,512,241,3357,True,6704,75417,CT-1510011-1516311,CONNECTICUT,Wilby High School,100240.0,CT-1510011,SWELIGSWPROG,SUPPVIRTUAL,568 Bucks Hill Rd.,Waterbury,CT,6704,High,1091,14.47,midsize,40625,Wilby High School,904830,90483000992,17,17,0
1,1,AUTAUGA COUNTY SCHOOL DISTRICT,100240,7637,8.7,8600,512,241,3357,True,6704,75417,CT-1510011-1513211,CONNECTICUT,Woodrow Wilson School,100240.0,CT-1510011,SWELIGSWPROG,SUPPVIRTUAL,235 Birch St.,Waterbury,CT,6704,Elementary,414,13.89,midsize,40625,Woodrow Wilson School,904830,90483000685,20-24,20,0
2,2,DECATUR CITY SCHOOL DISTRICT,101170,19527,17.5,11406,590,436,4418,True,6704,75417,CT-1510011-1512011,CONNECTICUT,Sprague School,101170.0,CT-1510011,SWELIGSWPROG,SUPPVIRTUAL,1443 Thomaston Ave.,Waterbury,CT,6704,Elementary,388,11.9,midsize,40625,Sprague School,904830,90483000986,19,19,0
3,3,GENEVA AREA CITY SCH DIST,3904405,8456,33.4,9876,614,80,4060,True,6704,75417,CT-1510011-1510511,CONNECTICUT,Bucks Hill School,3904405.0,CT-1510011,SWELIGSWPROG,SUPPVIRTUAL,330 Bucks Hill Rd.,Waterbury,CT,6704,Elementary,312,8.13,midsize,40625,Bucks Hill School,904830,90483000968,25-29,25,0
4,4,AUSTINTOWN SCH DIST,3904829,19943,35.7,10253,694,290,3987,True,6704,75417,CT-1510011-1515311,CONNECTICUT,North End Middle School,3904829.0,CT-1510011,SWELIGSWPROG,SUPPVIRTUAL,534 Bucks Hill Rd.,Waterbury,CT,6704,Middle,896,11.55,midsize,40625,North End Middle School,904830,90483001115,27,27,0


In [101]:
test2.shape

(73328, 34)

In [102]:
#Final Cleanup 
final = test2[['ST_SCHID','SCH_NAME','LEVEL','NAME','LEAID_y','LCITY','LSTATE','zipcode','population','density','median_income','LOCRPROP','PCTLTAXP','PPCSTOT','PPSPUPIL','PPSSTAFF','PPISALWG','Total Students','Student:Teacher Ratio','TITLEI_STATUS','VIRTUAL','target','target_conv']]


In [103]:
final = final.rename(columns = {'ST_SCHID': 'School_ID','SCH_NAME':'School_Name','LEVEL':'Level','NAME':'District_Name','LEAID_y':'District_ID','LCITY':'City','LSTATE':'State','zipcode':'Zip','population':'Zip_Population','density':'Density','median_income':'Median_Income','LOCRPROP':'Property_Taxes','PCTLTAXP':'Percent_Taxes','PPCSTOT':'Per_Pupil_Spending','PPSPUPIL':'Per_Pupil_Support','PPSSTAFF':'Per_Pupil_Support_Staff',"PPISALWG":'Per_Pupil_Salaries','Total Students':'Total_Students','Student:Teacher Ratio':'Student_Teacher_Ratio','TITLEI_STATUS':'Title_I','VIRTUAL':'Virtual','target':'%_Proficient_Reading','target_conv':'Target'})



In [104]:
states = final['State'].value_counts().to_dict()

In [105]:
len(states)

43

In [106]:
final['Title_I'].value_counts()

SWELIGSWPROG     31372
NOTTITLE1ELIG    15282
Not reported     12717
SWELIGNOPROG      5260
TGELGBTGPROG      4078
TGELGBNOPROG      2488
SWELIGTGPROG      2112
MISSING             19
Name: Title_I, dtype: int64

In [107]:
final['title_1'] = np.nan

In [108]:
# Enumerate to replace values in ['Title_I'] with eligibility category
for i, element in enumerate(final['Title_I']): 
    if element == 'NOTTITLE1ELIG':
        final['title_1'][i] = 'no'
    elif element == 'Not reported': 
        final['title_1'][i] = 'no'
    elif element == 'MISSING': 
        final['title_1'][i] = 'no'
    else:
        final['title_1'][i] = 'yes' 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final['title_1'][i] = 'yes'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final['title_1'][i] = 'no'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final['title_1'][i] = 'no'
A value is trying to be set on a copy of a 

In [109]:
final.head()

Unnamed: 0,School_ID,School_Name,Level,District_Name,District_ID,City,State,Zip,Zip_Population,Density,Median_Income,Property_Taxes,Percent_Taxes,Per_Pupil_Spending,Per_Pupil_Support,Per_Pupil_Support_Staff,Per_Pupil_Salaries,Total_Students,Student_Teacher_Ratio,Title_I,Virtual,%_Proficient_Reading,Target,title_1
0,CT-1510011-1516311,Wilby High School,High,AUTAUGA COUNTY SCHOOL DISTRICT,904830,Waterbury,CT,6704,75417,midsize,40625,7637,8.7,8600,512,241,3357,1091,14.47,SWELIGSWPROG,SUPPVIRTUAL,17,0,yes
1,CT-1510011-1513211,Woodrow Wilson School,Elementary,AUTAUGA COUNTY SCHOOL DISTRICT,904830,Waterbury,CT,6704,75417,midsize,40625,7637,8.7,8600,512,241,3357,414,13.89,SWELIGSWPROG,SUPPVIRTUAL,20,0,yes
2,CT-1510011-1512011,Sprague School,Elementary,DECATUR CITY SCHOOL DISTRICT,904830,Waterbury,CT,6704,75417,midsize,40625,19527,17.5,11406,590,436,4418,388,11.9,SWELIGSWPROG,SUPPVIRTUAL,19,0,yes
3,CT-1510011-1510511,Bucks Hill School,Elementary,GENEVA AREA CITY SCH DIST,904830,Waterbury,CT,6704,75417,midsize,40625,8456,33.4,9876,614,80,4060,312,8.13,SWELIGSWPROG,SUPPVIRTUAL,25,0,yes
4,CT-1510011-1515311,North End Middle School,Middle,AUSTINTOWN SCH DIST,904830,Waterbury,CT,6704,75417,midsize,40625,19943,35.7,10253,694,290,3987,896,11.55,SWELIGSWPROG,SUPPVIRTUAL,27,0,yes


In [110]:
final['Virtual'].value_counts()

NOTVIRTUAL      36628
SUPPVIRTUAL     16604
Not reported    12717
MISSING          6667
FACEVIRTUAL       388
FULLVIRTUAL       324
Name: Virtual, dtype: int64

In [111]:
final['virtual'] = np.nan

In [112]:
# Enumerate to replace values in ['Virtual'] with virtual instruction status
for i, element in enumerate(final['Virtual']): 
    if element == 'NOTVIRTUAL':
        final['virtual'][i] = 'no'
    elif element == 'Not reported': 
        final['virtual'][i] = 'no'
    elif element == 'MISSING': 
        final['virtual'][i] = 'no'
    else:
        final['virtual'][i] = 'yes' 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final['virtual'][i] = 'yes'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final['virtual'][i] = 'no'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final['virtual'][i] = 'no'
A value is trying to be set on a copy of a 

In [113]:
final = final.drop(['Title_I','Virtual'], axis=1)

In [114]:
final = final.rename(columns={'title_1':'Title_I','virtual':'Virtual'})

In [115]:
final.head()

Unnamed: 0,School_ID,School_Name,Level,District_Name,District_ID,City,State,Zip,Zip_Population,Density,Median_Income,Property_Taxes,Percent_Taxes,Per_Pupil_Spending,Per_Pupil_Support,Per_Pupil_Support_Staff,Per_Pupil_Salaries,Total_Students,Student_Teacher_Ratio,%_Proficient_Reading,Target,Title_I,Virtual
0,CT-1510011-1516311,Wilby High School,High,AUTAUGA COUNTY SCHOOL DISTRICT,904830,Waterbury,CT,6704,75417,midsize,40625,7637,8.7,8600,512,241,3357,1091,14.47,17,0,yes,yes
1,CT-1510011-1513211,Woodrow Wilson School,Elementary,AUTAUGA COUNTY SCHOOL DISTRICT,904830,Waterbury,CT,6704,75417,midsize,40625,7637,8.7,8600,512,241,3357,414,13.89,20,0,yes,yes
2,CT-1510011-1512011,Sprague School,Elementary,DECATUR CITY SCHOOL DISTRICT,904830,Waterbury,CT,6704,75417,midsize,40625,19527,17.5,11406,590,436,4418,388,11.9,19,0,yes,yes
3,CT-1510011-1510511,Bucks Hill School,Elementary,GENEVA AREA CITY SCH DIST,904830,Waterbury,CT,6704,75417,midsize,40625,8456,33.4,9876,614,80,4060,312,8.13,25,0,yes,yes
4,CT-1510011-1515311,North End Middle School,Middle,AUSTINTOWN SCH DIST,904830,Waterbury,CT,6704,75417,midsize,40625,19943,35.7,10253,694,290,3987,896,11.55,27,0,yes,yes


In [116]:
final = final[['School_ID','School_Name','Level','District_Name','District_ID','City','State','Zip','Zip_Population','Density','Median_Income','Property_Taxes','Percent_Taxes','Per_Pupil_Spending','Per_Pupil_Support','Per_Pupil_Support_Staff','Per_Pupil_Salaries','Total_Students','Student_Teacher_Ratio','Title_I','Virtual','%_Proficient_Reading','Target']]


In [117]:
final.head()

Unnamed: 0,School_ID,School_Name,Level,District_Name,District_ID,City,State,Zip,Zip_Population,Density,Median_Income,Property_Taxes,Percent_Taxes,Per_Pupil_Spending,Per_Pupil_Support,Per_Pupil_Support_Staff,Per_Pupil_Salaries,Total_Students,Student_Teacher_Ratio,Title_I,Virtual,%_Proficient_Reading,Target
0,CT-1510011-1516311,Wilby High School,High,AUTAUGA COUNTY SCHOOL DISTRICT,904830,Waterbury,CT,6704,75417,midsize,40625,7637,8.7,8600,512,241,3357,1091,14.47,yes,yes,17,0
1,CT-1510011-1513211,Woodrow Wilson School,Elementary,AUTAUGA COUNTY SCHOOL DISTRICT,904830,Waterbury,CT,6704,75417,midsize,40625,7637,8.7,8600,512,241,3357,414,13.89,yes,yes,20,0
2,CT-1510011-1512011,Sprague School,Elementary,DECATUR CITY SCHOOL DISTRICT,904830,Waterbury,CT,6704,75417,midsize,40625,19527,17.5,11406,590,436,4418,388,11.9,yes,yes,19,0
3,CT-1510011-1510511,Bucks Hill School,Elementary,GENEVA AREA CITY SCH DIST,904830,Waterbury,CT,6704,75417,midsize,40625,8456,33.4,9876,614,80,4060,312,8.13,yes,yes,25,0
4,CT-1510011-1515311,North End Middle School,Middle,AUSTINTOWN SCH DIST,904830,Waterbury,CT,6704,75417,midsize,40625,19943,35.7,10253,694,290,3987,896,11.55,yes,yes,27,0


In [118]:
final.dtypes

School_ID                   object
School_Name                 object
Level                       object
District_Name               object
District_ID                  int64
City                        object
State                       object
Zip                          int64
Zip_Population               int64
Density                     object
Median_Income               object
Property_Taxes               int64
Percent_Taxes              float64
Per_Pupil_Spending           int64
Per_Pupil_Support            int64
Per_Pupil_Support_Staff      int64
Per_Pupil_Salaries           int64
Total_Students              object
Student_Teacher_Ratio       object
Title_I                     object
Virtual                     object
%_Proficient_Reading        object
Target                       int64
dtype: object

In [119]:
final['Zip'] = final['Zip'].astype(str)

In [120]:
dashes = final.loc[final['Median_Income'] == '-']

In [121]:
dashes.shape

(285, 23)

In [122]:
#Discovered an issue with median income - some zipcodes were missing income.
#As these zipcodes are from all over the place it'll take too much time to 
#Look up each median income manually. I'm going to drop the rows instead 

final = final[final.Median_Income != '-']

In [123]:
rich = final.loc[final['Median_Income'] == '250,000+']

In [124]:
rich.shape

(23, 23)

In [125]:
final['Median_Income'] = final.Median_Income.str.rstrip('+').str.replace(",","")

In [126]:
final = final.dropna()

In [127]:
final['Median_Income'] = final['Median_Income'].astype(int)

In [128]:
final.dtypes

School_ID                   object
School_Name                 object
Level                       object
District_Name               object
District_ID                  int64
City                        object
State                       object
Zip                         object
Zip_Population               int64
Density                     object
Median_Income                int64
Property_Taxes               int64
Percent_Taxes              float64
Per_Pupil_Spending           int64
Per_Pupil_Support            int64
Per_Pupil_Support_Staff      int64
Per_Pupil_Salaries           int64
Total_Students              object
Student_Teacher_Ratio       object
Title_I                     object
Virtual                     object
%_Proficient_Reading        object
Target                       int64
dtype: object

In [129]:
final.head()

Unnamed: 0,School_ID,School_Name,Level,District_Name,District_ID,City,State,Zip,Zip_Population,Density,Median_Income,Property_Taxes,Percent_Taxes,Per_Pupil_Spending,Per_Pupil_Support,Per_Pupil_Support_Staff,Per_Pupil_Salaries,Total_Students,Student_Teacher_Ratio,Title_I,Virtual,%_Proficient_Reading,Target
0,CT-1510011-1516311,Wilby High School,High,AUTAUGA COUNTY SCHOOL DISTRICT,904830,Waterbury,CT,6704,75417,midsize,40625,7637,8.7,8600,512,241,3357,1091,14.47,yes,yes,17,0
1,CT-1510011-1513211,Woodrow Wilson School,Elementary,AUTAUGA COUNTY SCHOOL DISTRICT,904830,Waterbury,CT,6704,75417,midsize,40625,7637,8.7,8600,512,241,3357,414,13.89,yes,yes,20,0
2,CT-1510011-1512011,Sprague School,Elementary,DECATUR CITY SCHOOL DISTRICT,904830,Waterbury,CT,6704,75417,midsize,40625,19527,17.5,11406,590,436,4418,388,11.9,yes,yes,19,0
3,CT-1510011-1510511,Bucks Hill School,Elementary,GENEVA AREA CITY SCH DIST,904830,Waterbury,CT,6704,75417,midsize,40625,8456,33.4,9876,614,80,4060,312,8.13,yes,yes,25,0
4,CT-1510011-1515311,North End Middle School,Middle,AUSTINTOWN SCH DIST,904830,Waterbury,CT,6704,75417,midsize,40625,19943,35.7,10253,694,290,3987,896,11.55,yes,yes,27,0


In [130]:
final.shape

(73043, 23)

The final, cleaned dataset has features with appropriate datatypes, is free of nulls and contains data for over 73,000 schools in 43 states. 

In [131]:
final.to_csv('data/final.csv')