# Data cleaning

Pick out the relevant variables from the demographic and geographic data and merge them into one clean dataset. 

One dataset has demographic information on Houston ISD (from the TEA), and another has the geographic information from the TEA. 

We will also merge in the school end-time information here.

## Setting things up

First, let's get our working directories in place and import the necessary libraries:

In [46]:
import numpy as np
import pandas as pd
import datetime

# Lift every pandas display limit so that wide / long frames are shown in full.
for display_option in ('display.max_rows', 'display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(display_option, None)

# Input and output folders. raw_data keeps its trailing separator because file names
# are appended to it by plain string concatenation further down.
# NOTE(review): these are absolute paths on one machine -- adjust before running elsewhere.
raw_data = "//Users//afan//Desktop//Misc//HMW_Transit//prep//raw_data//"
cleaned_data  = "//Users//afan//Desktop//Misc//HMW_Transit//cleaned_data//"

## Relevant Variables

I want to look at these three datasets in particular: 

1. student_demographics.csv
2. school_geography.csv
3. start_end_times.xlsx

One at a time:

## student_demographics.csv 

taken from the TEA

In [2]:
# Read the demographics table from the raw-data folder.
sd_str = f"{raw_data}student_demographics.csv"
sd = pd.read_csv(sd_str)

# Report (rows, columns), then preview the first few campuses.
print(sd.shape)
sd.head()

(274, 52)


Unnamed: 0,CAMPUS,CAMPNAME,CPET504C,CPET504P,CPETALLC,CPETASIC,CPETASIP,CPETATTC,CPETATTD,CPETATTP,CPETBLAC,CPETBLAP,CPETDISC,CPETDISP,CPETDSLC,CPETDSLP,CPETECOC,CPETECOP,CPETFEMC,CPETFEMP,CPETFOSC,CPETFOSP,CPETHISC,CPETHISP,CPETHOMC,CPETHOMP,CPETIMMC,CPETIMMP,CPETINDC,CPETINDP,CPETLEPC,CPETLEPP,CPETMALC,CPETMALP,CPETMIGC,CPETMIGP,CPETMLCC,CPETMLCP,CPETNEDC,CPETNEDP,CPETPCIC,CPETPCIP,CPETRSKC,CPETRSKP,CPETTT1C,CPETTT1P,CPETTWOC,CPETTWOP,CPETWHIC,CPETWHIP,DISTNAME,DISTRICT
0,'101912001,AUSTIN H S,45,3.0,1510,1,0.1,212,1299,16.3,123,8.1,41,2.4,32,2.1,1447,95.8,678,44.9,0,0.0,1374,91.0,26,1.7,49,3.2,2,0.1,494,32.7,832,55.1,1,0.1,0,0.0,63,4.2,1,0.1,1175,77.8,1510,100.0,2,0.1,7,0.5,HOUSTON ISD,'101912
1,'101912002,BELLAIRE H S,226,7.0,3213,402,12.5,351,2716,12.9,672,20.9,71,2.0,122,3.8,1436,44.7,1641,51.1,0,0.0,1320,41.1,6,0.2,89,2.8,9,0.3,436,13.6,1572,48.9,0,0.0,0,0.0,1777,55.3,5,0.2,1246,38.8,3213,100.0,74,2.3,731,22.8,HOUSTON ISD,'101912
2,'101912003,NORTHSIDE H S,45,3.1,1431,0,0.0,162,1165,13.9,212,14.8,16,1.0,42,2.9,1337,93.4,691,48.3,0,0.0,1204,84.1,5,0.3,10,0.7,3,0.2,358,25.0,740,51.7,0,0.0,0,0.0,94,6.6,0,0.0,896,62.6,1431,100.0,2,0.1,10,0.7,HOUSTON ISD,'101912
3,'101912004,FURR H S,25,2.3,1110,5,0.5,174,922,18.9,200,18.0,37,3.0,34,3.1,1068,96.2,541,48.7,5,0.5,883,79.5,27,2.4,90,8.1,3,0.3,310,27.9,569,51.3,2,0.2,0,0.0,42,3.8,0,0.0,691,62.3,1110,100.0,4,0.4,15,1.4,HOUSTON ISD,'101912
4,'101912006,JONES FUTURES ACADEMY,9,2.6,341,0,0.0,34,258,13.2,131,38.4,1,0.3,4,1.2,317,93.0,216,63.3,0,0.0,203,59.5,4,1.2,9,2.6,1,0.3,61,17.9,125,36.7,0,0.0,0,0.0,24,7.0,0,0.0,190,55.7,341,100.0,2,0.6,4,1.2,HOUSTON ISD,'101912


In [3]:
# Keep every column whose name does not end in 'P'.
keep_column = [not name.endswith('P') for name in sd.columns]
sd_c = sd.loc[:, keep_column]

In [4]:
# List the columns that remain after dropping the ones whose names end in 'P'.
sd_c.columns

Index(['CAMPUS', 'CAMPNAME', 'CPET504C', 'CPETALLC', 'CPETASIC', 'CPETATTC',
       'CPETATTD', 'CPETBLAC', 'CPETDISC', 'CPETDSLC', 'CPETECOC', 'CPETFEMC',
       'CPETFOSC', 'CPETHISC', 'CPETHOMC', 'CPETIMMC', 'CPETINDC', 'CPETLEPC',
       'CPETMALC', 'CPETMIGC', 'CPETMLCC', 'CPETNEDC', 'CPETPCIC', 'CPETRSKC',
       'CPETTT1C', 'CPETTWOC', 'CPETWHIC', 'DISTNAME', 'DISTRICT'],
      dtype='object')

In [5]:
# Readable labels for the TEA column codes; columns that are not listed keep their original names.
readable_names = {
    'CPET504C': 'D504',
    'CPETALLC': 'All',
    'CPETASIC': 'Asian',
    'CPETATTC': 'Attrition',
    'CPETATTD': 'Attrition_Denom',
    'CPETBLAC': 'Black',
    'CPETDISC': 'DAEP',
    'CPETDSLC': 'Dyslexia',
    'CPETECOC': 'Econ_Disadv',
    'CPETFEMC': 'Female',
    'CPETFOSC': 'Foster_Care',
    'CPETHISC': 'Hispanic',
    'CPETHOMC': 'Homeless',
    'CPETIMMC': 'Immigrant',
    'CPETINDC': 'Am_Ind',
    'CPETLEPC': 'Eng_Learner',
    'CPETMALC': 'Male',
    'CPETMIGC': 'Migrant',
    'CPETMLCC': 'Military_Conn',
    'CPETNEDC': 'Non_Ed_Disadv',
    'CPETPCIC': 'Pacific_Is',
    'CPETRSKC': 'At_Risk',
    'CPETTT1C': 'Title_I',
    'CPETTWOC': 'Two_Or_More',
    'CPETWHIC': 'White',
}

sd_final = sd_c.rename(columns = readable_names)

In [6]:
# Preview the demographics table with the renamed columns.
sd_final.head()

Unnamed: 0,CAMPUS,CAMPNAME,D504,All,Asian,Attrition,Attrition_Denom,Black,DAEP,Dyslexia,Econ_Disadv,Female,Foster_Care,Hispanic,Homeless,Immigrant,Am_Ind,Eng_Learner,Male,Migrant,Military_Conn,Non_Ed_Disadv,Pacific_Is,At_Risk,Title_I,Two_Or_More,White,DISTNAME,DISTRICT
0,'101912001,AUSTIN H S,45,1510,1,212,1299,123,41,32,1447,678,0,1374,26,49,2,494,832,1,0,63,1,1175,1510,2,7,HOUSTON ISD,'101912
1,'101912002,BELLAIRE H S,226,3213,402,351,2716,672,71,122,1436,1641,0,1320,6,89,9,436,1572,0,0,1777,5,1246,3213,74,731,HOUSTON ISD,'101912
2,'101912003,NORTHSIDE H S,45,1431,0,162,1165,212,16,42,1337,691,0,1204,5,10,3,358,740,0,0,94,0,896,1431,2,10,HOUSTON ISD,'101912
3,'101912004,FURR H S,25,1110,5,174,922,200,37,34,1068,541,5,883,27,90,3,310,569,2,0,42,0,691,1110,4,15,HOUSTON ISD,'101912
4,'101912006,JONES FUTURES ACADEMY,9,341,0,34,258,131,1,4,317,216,0,203,4,9,1,61,125,0,0,24,0,190,341,2,4,HOUSTON ISD,'101912


## school_geography.csv

In [7]:
# Read the geography table from the raw-data folder.
sg_str = f"{raw_data}school_geography.csv"
sg = pd.read_csv(sg_str)

# Report (rows, columns), then preview the first few schools.
print(sg.shape)
sg.head()

(275, 116)


Unnamed: 0,OBJECTID,Loc_name,Status,Score,Match_type,Match_addr,LongLabel,ShortLabel,Addr_type,Type,PlaceName,Place_addr,Phone,URL,Rank,AddBldg,AddNum,AddNumFrom,AddNumTo,AddRange,Side,StPreDir,StPreType,StName,StType,StDir,BldgType,BldgName,LevelType,LevelName,UnitType,UnitName,SubAddr,StAddr,Block,Sector,Nbrhd,District,City,MetroArea,Subregion,Region,RegionAbbr,Territory,Zone,Postal,PostalExt,Country,LangCode,Distance,X,Y,DisplayX,DisplayY,Xmin,Xmax,Ymin,Ymax,ExInfo,ARC_Addres,ARC_Addr_1,ARC_Addr_2,ARC_Neighb,ARC_City,ARC_Subreg,ARC_Region,ARC_Postal,ARC_Post_1,ARC_Countr,County_Num,County_Nam,ESC_Region,ESC_Regi_1,ESC_Regi_2,District_N,District_1,District_T,District_S,District_C,District_2,District_Z,District_3,District_4,District_5,District_6,District_P,District_F,District_E,District_W,District_7,District_8,School_Num,School_Nam,Instructio,Charter_Ty,School_Str,School_Cit,School_Sta,School_Zip,School_Sit,School_S_1,School_S_2,School_S_3,Zip_5,School_Pho,School_Fax,School_Ema,School_Web,School_Pri,Grade_Rang,School_Enr,School_S_4,School_S_5,Performanc,GlobalID,School_Type
0,4543,World,M,98.82,A,"1700 Dumble St, Houston, Texas, 77023","1700 Dumble St, Houston, TX, 77023, USA",1700 Dumble St,PointAddress,,,"1700 Dumble St, Houston, Texas, 77023",,,20,,1700,,,,,,,Dumble,St,,,,,,,,,1700 Dumble St,,,,,Houston,,Harris County,Texas,TX,,,77023,3139,USA,ENG,0,-95.331465,29.730194,-95.333206,29.730909,-95.334206,-95.332206,29.729909,29.731909,,1700 DUMBLE,,,,HOUSTON,,TX,77023,,,'101,HARRIS COUNTY,'04,'04,'04,'101912,HOUSTON ISD,INDEPENDENT,4400 W 18TH ST,HOUSTON,TX,77092-8501,4400 W 18TH ST,HOUSTON,TX,77092-8501,(713) 556-6005,(713) 556-6006,HISDSuperintendent@houstonisd.org,www.houstonisd.org,MR MILLARD L HOUSE II,196943,'101912001,AUSTIN H S,REGULAR INSTRUCTIONAL,,1700 DUMBLE,HOUSTON,TX,77023-3139,1700 DUMBLE,HOUSTON,TX,77023-3139,77023,(713) 924-1600,(713) 923-3157,OREYNA2@houstonisd.org,www.houstonisd.org,MR ORLANDO REYNA,'09-12,1520,Active,,https://txschools.gov/schools/101912001/overview,6044703f-c35c-4dce-a611-f0ac667eab65,
1,4544,World,M,98.82,A,"5100 Maple St, Bellaire, Texas, 77401","5100 Maple St, Bellaire, TX, 77401, USA",5100 Maple St,PointAddress,,,"5100 Maple St, Bellaire, Texas, 77401",,,20,,5100,,,,,,,Maple,St,,,,,,,,,5100 Maple St,,,,,Bellaire,,Harris County,Texas,TX,,,77401,4936,USA,ENG,0,-95.469016,29.691198,-95.469016,29.691969,-95.470016,-95.468016,29.690969,29.692969,,5100 MAPLE,,,,BELLAIRE,,TX,77401,,,'101,HARRIS COUNTY,'04,'04,'04,'101912,HOUSTON ISD,INDEPENDENT,4400 W 18TH ST,HOUSTON,TX,77092-8501,4400 W 18TH ST,HOUSTON,TX,77092-8501,(713) 556-6005,(713) 556-6006,HISDSuperintendent@houstonisd.org,www.houstonisd.org,MR MILLARD L HOUSE II,196943,'101912002,BELLAIRE H S,REGULAR INSTRUCTIONAL,,5100 MAPLE,BELLAIRE,TX,77401-4936,5100 MAPLE,BELLAIRE,TX,77401-4936,77401,(713) 295-3704,(713) 295-3763,mmcdonou@houstonisd.org,www.houstonisd.org,MR MICHAEL MCDONOUGH,'09-12,3218,Active,,https://txschools.gov/schools/101912002/overview,26e69755-c853-44dd-886e-26cfec353a6d,
2,4545,World,M,98.82,A,"1101 Quitman St, Houston, Texas, 77009","1101 Quitman St, Houston, TX, 77009, USA",1101 Quitman St,PointAddress,,,"1101 Quitman St, Houston, Texas, 77009",,,20,,1101,,,,,,,Quitman,St,,,,,,,,,1101 Quitman St,,,,,Houston,,Harris County,Texas,TX,,,77009,7815,USA,ENG,0,-95.357984,29.782934,-95.358229,29.783637,-95.359229,-95.357229,29.782637,29.784637,,1101 QUITMAN,,,,HOUSTON,,TX,77009,,,'101,HARRIS COUNTY,'04,'04,'04,'101912,HOUSTON ISD,INDEPENDENT,4400 W 18TH ST,HOUSTON,TX,77092-8501,4400 W 18TH ST,HOUSTON,TX,77092-8501,(713) 556-6005,(713) 556-6006,HISDSuperintendent@houstonisd.org,www.houstonisd.org,MR MILLARD L HOUSE II,196943,'101912003,NORTHSIDE H S,REGULAR INSTRUCTIONAL,,1101 QUITMAN,HOUSTON,TX,77009-7815,1101 QUITMAN,HOUSTON,TX,77009-7815,77009,(713) 226-4900,(713) 226-4999,CGONZAL1@HOUSTONISD.ORG,www.houstonisd.org,MS CECILIA A GONZALES,'09-12,1431,Active,,https://txschools.gov/schools/101912003/overview,e53ea622-d54c-4c1d-82a5-d27abe9bcd31,
3,4546,World,M,98.82,A,"520 Mercury Dr, Houston, Texas, 77013","520 Mercury Dr, Houston, TX, 77013, USA",520 Mercury Dr,PointAddress,,,"520 Mercury Dr, Houston, Texas, 77013",,,20,,520,,,,,,,Mercury,Dr,,,,,,,,,520 Mercury Dr,,,,,Houston,,Harris County,Texas,TX,,,77013,5217,USA,ENG,0,-95.246987,29.778996,-95.248809,29.779435,-95.249809,-95.247809,29.778435,29.780435,,520 MERCURY,,,,HOUSTON,,TX,77013,,,'101,HARRIS COUNTY,'04,'04,'04,'101912,HOUSTON ISD,INDEPENDENT,4400 W 18TH ST,HOUSTON,TX,77092-8501,4400 W 18TH ST,HOUSTON,TX,77092-8501,(713) 556-6005,(713) 556-6006,HISDSuperintendent@houstonisd.org,www.houstonisd.org,MR MILLARD L HOUSE II,196943,'101912004,FURR H S,REGULAR INSTRUCTIONAL,,520 MERCURY,HOUSTON,TX,77013-5217,520 MERCURY,HOUSTON,TX,77013-5217,77013,(713) 675-1118,(713) 671-3612,tmoran@houstonisd.org,www.houstonisd.org,MS TAMMIE MORAN,'09-12,1113,Active,,https://txschools.gov/schools/101912004/overview,c8186eb7-32af-4eab-b116-de7a5063e576,
4,4547,World,M,100.0,A,"7414 St Lo Rd, Houston, Texas, 77033","7414 St Lo Rd, Houston, TX, 77033, USA",7414 St Lo Rd,PointAddress,,,"7414 St Lo Rd, Houston, Texas, 77033",,,20,,7414,,,,,,,St Lo,Rd,,,,,,,,,7414 St Lo Rd,,,,,Houston,,Harris County,Texas,TX,,,77033,2732,USA,ENG,0,-95.339505,29.674387,-95.340599,29.674679,-95.341599,-95.339599,29.673679,29.675679,,7414 ST LO RD,,,,HOUSTON,,TX,77033,,,'101,HARRIS COUNTY,'04,'04,'04,'101912,HOUSTON ISD,INDEPENDENT,4400 W 18TH ST,HOUSTON,TX,77092-8501,4400 W 18TH ST,HOUSTON,TX,77092-8501,(713) 556-6005,(713) 556-6006,HISDSuperintendent@houstonisd.org,www.houstonisd.org,MR MILLARD L HOUSE II,196943,'101912006,JONES FUTURES ACADEMY,REGULAR INSTRUCTIONAL,,7414 ST LO RD,HOUSTON,TX,77033-2797,7414 ST LO RD,HOUSTON,TX,77033-2797,77033,(713) 733-1111,(713) 732-3450,nlim@houstonisd.org,www.houstonisd.org,MS NIRMOL LIM,'09-12,342,Active,,https://txschools.gov/schools/101912006/overview,dcd24ec7-6020-4966-8b22-f787594b92d5,


In [8]:
# The only geography columns carried forward into the merged dataset.
sg_cols = [
    'School_Num',   # campus number; later matched against CAMPUS in the demographics table
    'School_Nam',   # campus name as spelled in the geography table
    'Place_addr',   # geocoded street address
    'Grade_Rang',   # grade range served
    'X',            # presumably longitude -- TODO confirm
    'Y',            # presumably latitude -- TODO confirm
]
sg_final = sg[sg_cols]
sg_final.head()

Unnamed: 0,School_Num,School_Nam,Place_addr,Grade_Rang,X,Y
0,'101912001,AUSTIN H S,"1700 Dumble St, Houston, Texas, 77023",'09-12,-95.331465,29.730194
1,'101912002,BELLAIRE H S,"5100 Maple St, Bellaire, Texas, 77401",'09-12,-95.469016,29.691198
2,'101912003,NORTHSIDE H S,"1101 Quitman St, Houston, Texas, 77009",'09-12,-95.357984,29.782934
3,'101912004,FURR H S,"520 Mercury Dr, Houston, Texas, 77013",'09-12,-95.246987,29.778996
4,'101912006,JONES FUTURES ACADEMY,"7414 St Lo Rd, Houston, Texas, 77033",'09-12,-95.339505,29.674387


## start_end_times.xlsx

In [15]:
# Excel workbook with school end times -- the information on all the schools is on the second sheet.
et_str = raw_data + "start_end_times.xlsx"
et_cols = ['Campus Short Name', 'End Time']

# sheet_name = 1 selects the second sheet (sheet positions are zero-indexed).
# NOTE(review): in the recorded previews every parsed 'End Time' carries the same calendar date,
# so presumably only the time of day is meaningful -- confirm before relying on the date part.
et = pd.read_excel(et_str, sheet_name = 1, usecols = et_cols, parse_dates = ['End Time'])

# Print (rows, columns) like the other loading cells, so the recorded shape output is reproducible.
print(et.shape)
et.head()

(274, 2)


Unnamed: 0,Campus Short Name,End Time
0,Alcott ES,2022-01-11 15:00:00
1,Almeda ES,2022-01-11 15:00:00
2,Anderson ES,2022-01-11 15:00:00
3,Arabic Immersion,2022-01-11 15:00:00
4,Ashford ES,2022-01-11 15:00:00


## Merging everything together

Now, we should merge all three of these datasets together. 

To match school names to the TEA names though, we will have to read in the key. 

In [17]:
# Read the name-matching key from the raw-data folder.
key_str = f"{raw_data}name_match_final.csv"
key = pd.read_csv(key_str)

In [39]:
# Attach the key to the geography columns. No join columns are given, so pandas joins on the
# columns the two frames share. The outer join plus the `_merge` indicator keeps unmatched
# rows and records which side each row came from.
stud_1 = pd.merge(
    sg_final,
    key,
    how = 'outer',
    indicator = True,
)
stud_1.head()

Unnamed: 0,School_Num,School_Nam,Place_addr,Grade_Rang,X,Y,Campus Short Name,_merge
0,'101912001,AUSTIN H S,"1700 Dumble St, Houston, Texas, 77023",'09-12,-95.331465,29.730194,Austin HS,both
1,'101912002,BELLAIRE H S,"5100 Maple St, Bellaire, Texas, 77401",'09-12,-95.469016,29.691198,Bellaire HS,both
2,'101912003,NORTHSIDE H S,"1101 Quitman St, Houston, Texas, 77009",'09-12,-95.357984,29.782934,Northside HS,both
3,'101912004,FURR H S,"520 Mercury Dr, Houston, Texas, 77013",'09-12,-95.246987,29.778996,Furr HS,both
4,'101912006,JONES FUTURES ACADEMY,"7414 St Lo Rd, Houston, Texas, 77033",'09-12,-95.339505,29.674387,Jones HS,both


In [40]:
# Drop the previous indicator column first so a fresh `_merge` can be created, then bring in
# the end times (again an outer join on the shared columns).
stud_2 = pd.merge(
    stud_1.drop(columns = "_merge"),
    et,
    how = 'outer',
    indicator = True,
)
stud_2.head()

Unnamed: 0,School_Num,School_Nam,Place_addr,Grade_Rang,X,Y,Campus Short Name,End Time,_merge
0,'101912001,AUSTIN H S,"1700 Dumble St, Houston, Texas, 77023",'09-12,-95.331465,29.730194,Austin HS,2022-01-11 15:30:00,both
1,'101912002,BELLAIRE H S,"5100 Maple St, Bellaire, Texas, 77401",'09-12,-95.469016,29.691198,Bellaire HS,2022-01-11 16:10:00,both
2,'101912003,NORTHSIDE H S,"1101 Quitman St, Houston, Texas, 77009",'09-12,-95.357984,29.782934,Northside HS,2022-01-11 16:10:00,both
3,'101912004,FURR H S,"520 Mercury Dr, Houston, Texas, 77013",'09-12,-95.246987,29.778996,Furr HS,2022-01-11 16:10:00,both
4,'101912006,JONES FUTURES ACADEMY,"7414 St Lo Rd, Houston, Texas, 77033",'09-12,-95.339505,29.674387,Jones HS,2022-01-11 15:30:00,both


In [41]:
# Bring in the demographics. The campus identifier is called School_Num on the left and
# CAMPUS on the right, so the join columns are named explicitly for each side.
stud_final = pd.merge(
    stud_2.drop(columns = "_merge"),
    sd_final,
    how = 'outer',
    left_on = ['School_Num'],
    right_on = ['CAMPUS'],
    indicator = True,
)
stud_final.head()

Unnamed: 0,School_Num,School_Nam,Place_addr,Grade_Rang,X,Y,Campus Short Name,End Time,CAMPUS,CAMPNAME,D504,All,Asian,Attrition,Attrition_Denom,Black,DAEP,Dyslexia,Econ_Disadv,Female,Foster_Care,Hispanic,Homeless,Immigrant,Am_Ind,Eng_Learner,Male,Migrant,Military_Conn,Non_Ed_Disadv,Pacific_Is,At_Risk,Title_I,Two_Or_More,White,DISTNAME,DISTRICT,_merge
0,'101912001,AUSTIN H S,"1700 Dumble St, Houston, Texas, 77023",'09-12,-95.331465,29.730194,Austin HS,2022-01-11 15:30:00,'101912001,AUSTIN H S,45.0,1510.0,1.0,212,1299,123.0,41.0,32.0,1447.0,678.0,0.0,1374.0,26.0,49.0,2.0,494.0,832.0,1.0,0.0,63.0,1.0,1175.0,1510.0,2.0,7.0,HOUSTON ISD,'101912,both
1,'101912002,BELLAIRE H S,"5100 Maple St, Bellaire, Texas, 77401",'09-12,-95.469016,29.691198,Bellaire HS,2022-01-11 16:10:00,'101912002,BELLAIRE H S,226.0,3213.0,402.0,351,2716,672.0,71.0,122.0,1436.0,1641.0,0.0,1320.0,6.0,89.0,9.0,436.0,1572.0,0.0,0.0,1777.0,5.0,1246.0,3213.0,74.0,731.0,HOUSTON ISD,'101912,both
2,'101912003,NORTHSIDE H S,"1101 Quitman St, Houston, Texas, 77009",'09-12,-95.357984,29.782934,Northside HS,2022-01-11 16:10:00,'101912003,NORTHSIDE H S,45.0,1431.0,0.0,162,1165,212.0,16.0,42.0,1337.0,691.0,0.0,1204.0,5.0,10.0,3.0,358.0,740.0,0.0,0.0,94.0,0.0,896.0,1431.0,2.0,10.0,HOUSTON ISD,'101912,both
3,'101912004,FURR H S,"520 Mercury Dr, Houston, Texas, 77013",'09-12,-95.246987,29.778996,Furr HS,2022-01-11 16:10:00,'101912004,FURR H S,25.0,1110.0,5.0,174,922,200.0,37.0,34.0,1068.0,541.0,5.0,883.0,27.0,90.0,3.0,310.0,569.0,2.0,0.0,42.0,0.0,691.0,1110.0,4.0,15.0,HOUSTON ISD,'101912,both
4,'101912006,JONES FUTURES ACADEMY,"7414 St Lo Rd, Houston, Texas, 77033",'09-12,-95.339505,29.674387,Jones HS,2022-01-11 15:30:00,'101912006,JONES FUTURES ACADEMY,9.0,341.0,0.0,34,258,131.0,1.0,4.0,317.0,216.0,0.0,203.0,4.0,9.0,1.0,61.0,125.0,0.0,0.0,24.0,0.0,190.0,341.0,2.0,4.0,HOUSTON ISD,'101912,both


## Deleting schools that should be excluded / edits to the data 

The geography information is from 2021-2022, and the demographic information is from 2020-2021, which is why there might be some discrepancies between the two when they are merged.

In [42]:
# Geography + key merge: show the rows that did not match on both sides
# (`_merge` is the indicator column, with values 'both', 'left_only' or 'right_only').
stud_1[stud_1._merge != "both"]

Unnamed: 0,School_Num,School_Nam,Place_addr,Grade_Rang,X,Y,Campus Short Name,_merge
258,'101912459,ENERGIZED FOR STEM ACADEMY SOUTHEAST MIDDLE,"7055 Beechnut St, Houston, Texas, 77074",'06-08,-95.507499,29.691872,,left_only


In [43]:
# End-time merge: show the rows that did not match on both sides.
stud_2[stud_2._merge != "both"]

Unnamed: 0,School_Num,School_Nam,Place_addr,Grade_Rang,X,Y,Campus Short Name,End Time,_merge
51,'101912069,SOAR CTR,"4400 W 18th St, Houston, Texas, 77092",'EE-12,-95.455931,29.803891,SOAR Center,NaT,left_only
258,'101912459,ENERGIZED FOR STEM ACADEMY SOUTHEAST MIDDLE,"7055 Beechnut St, Houston, Texas, 77074",'06-08,-95.507499,29.691872,,NaT,left_only


In [44]:
# Demographics merge: show the rows that did not match on both sides.
stud_final[stud_final._merge != "both"]

Unnamed: 0,School_Num,School_Nam,Place_addr,Grade_Rang,X,Y,Campus Short Name,End Time,CAMPUS,CAMPNAME,D504,All,Asian,Attrition,Attrition_Denom,Black,DAEP,Dyslexia,Econ_Disadv,Female,Foster_Care,Hispanic,Homeless,Immigrant,Am_Ind,Eng_Learner,Male,Migrant,Military_Conn,Non_Ed_Disadv,Pacific_Is,At_Risk,Title_I,Two_Or_More,White,DISTNAME,DISTRICT,_merge
219,'101912320,HARRIS CO J J A E P,"2525 Murworth Dr, Houston, Texas, 77054",'04-12,-95.417389,29.685171,JJAEP,2022-01-11 16:10:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only
258,'101912459,ENERGIZED FOR STEM ACADEMY SOUTHEAST MIDDLE,"7055 Beechnut St, Houston, Texas, 77074",'06-08,-95.507499,29.691872,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only
261,'101912466,EL DAEP,"6411 Laredo St, Houston, Texas, 77020",'KG-05,-95.306528,29.780862,DAEP EL,2022-01-11 14:50:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only
276,,,,,,,,NaT,'101912371,YOUNG SCHOLARS ACADEMY FOR EXCELLE,1.0,107.0,0.0,49.0,123.0,102.0,0.0,3.0,98.0,58.0,0.0,4.0,0.0,0.0,0.0,0.0,49.0,0.0,0.0,9.0,0.0,40.0,107.0,0.0,1.0,HOUSTON ISD,'101912,right_only
277,,,,,,,,NaT,'101912392,YOUNG LEARNERS,1.0,481.0,2.0,0.0,258.0,129.0,0.0,0.0,477.0,237.0,0.0,334.0,0.0,12.0,1.0,209.0,244.0,2.0,0.0,4.0,1.0,332.0,481.0,5.0,9.0,HOUSTON ISD,'101912,right_only


In [53]:
# Schools to exclude, grouped by the column they are matched on, with the reason for each.

# Matched on School_Nam:
#   - ENERGIZED FOR STEM ACADEMY SOUTHEAST MIDDLE: not in the HISD list and no demographic information available
#   - HARRIS CO J J A E P and EL DAEP: no demographic information (also alternative campuses)
drop_school_nam = [
    "ENERGIZED FOR STEM ACADEMY SOUTHEAST MIDDLE",
    "HARRIS CO J J A E P",
    "EL DAEP",
]

# Matched on Campus Short Name:
#   - SOAR Center: there is no school end time
drop_short_name = "SOAR Center"

# Matched on CAMPNAME:
#   - Young Scholars Academy and Young Learners have been shut down/are a pre-K chain respectively
#     (https://www.houstonisd.org/Page/32496)
drop_campname = ["YOUNG SCHOLARS ACADEMY FOR EXCELLE", "YOUNG LEARNERS"]

# A row is kept only if it matches none of the exclusions. Rows with a missing name in a
# column are not matched by that column's exclusion.
keep_row = (
    ~stud_final.School_Nam.isin(drop_school_nam)
    & (stud_final['Campus Short Name'] != drop_short_name)
    & ~stud_final.CAMPNAME.isin(drop_campname)
)
stud_final = stud_final[keep_row]

In [54]:
# Preview the merged dataset after the exclusions above.
stud_final.head()


Unnamed: 0,School_Num,School_Nam,Place_addr,Grade_Rang,X,Y,Campus Short Name,End Time,CAMPUS,CAMPNAME,D504,All,Asian,Attrition,Attrition_Denom,Black,DAEP,Dyslexia,Econ_Disadv,Female,Foster_Care,Hispanic,Homeless,Immigrant,Am_Ind,Eng_Learner,Male,Migrant,Military_Conn,Non_Ed_Disadv,Pacific_Is,At_Risk,Title_I,Two_Or_More,White,DISTNAME,DISTRICT,_merge
0,'101912001,AUSTIN H S,"1700 Dumble St, Houston, Texas, 77023",'09-12,-95.331465,29.730194,Austin HS,2022-01-11 15:30:00,'101912001,AUSTIN H S,45.0,1510.0,1.0,212,1299,123.0,41.0,32.0,1447.0,678.0,0.0,1374.0,26.0,49.0,2.0,494.0,832.0,1.0,0.0,63.0,1.0,1175.0,1510.0,2.0,7.0,HOUSTON ISD,'101912,both
1,'101912002,BELLAIRE H S,"5100 Maple St, Bellaire, Texas, 77401",'09-12,-95.469016,29.691198,Bellaire HS,2022-01-11 16:10:00,'101912002,BELLAIRE H S,226.0,3213.0,402.0,351,2716,672.0,71.0,122.0,1436.0,1641.0,0.0,1320.0,6.0,89.0,9.0,436.0,1572.0,0.0,0.0,1777.0,5.0,1246.0,3213.0,74.0,731.0,HOUSTON ISD,'101912,both
2,'101912003,NORTHSIDE H S,"1101 Quitman St, Houston, Texas, 77009",'09-12,-95.357984,29.782934,Northside HS,2022-01-11 16:10:00,'101912003,NORTHSIDE H S,45.0,1431.0,0.0,162,1165,212.0,16.0,42.0,1337.0,691.0,0.0,1204.0,5.0,10.0,3.0,358.0,740.0,0.0,0.0,94.0,0.0,896.0,1431.0,2.0,10.0,HOUSTON ISD,'101912,both
3,'101912004,FURR H S,"520 Mercury Dr, Houston, Texas, 77013",'09-12,-95.246987,29.778996,Furr HS,2022-01-11 16:10:00,'101912004,FURR H S,25.0,1110.0,5.0,174,922,200.0,37.0,34.0,1068.0,541.0,5.0,883.0,27.0,90.0,3.0,310.0,569.0,2.0,0.0,42.0,0.0,691.0,1110.0,4.0,15.0,HOUSTON ISD,'101912,both
4,'101912006,JONES FUTURES ACADEMY,"7414 St Lo Rd, Houston, Texas, 77033",'09-12,-95.339505,29.674387,Jones HS,2022-01-11 15:30:00,'101912006,JONES FUTURES ACADEMY,9.0,341.0,0.0,34,258,131.0,1.0,4.0,317.0,216.0,0.0,203.0,4.0,9.0,1.0,61.0,125.0,0.0,0.0,24.0,0.0,190.0,341.0,2.0,4.0,HOUSTON ISD,'101912,both


Now, look at the script `1_Cleaning_Sch.py` (in the code folder) to see this file being saved.