In [1]:
from plotly import __version__
import cufflinks as cf
import plotly.offline as py
import plotly.graph_objs as go
# Initialise plotly's offline notebook mode before any figure is drawn.
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead
# of embedding it in the notebook — confirm against the plotly docs.
py.init_notebook_mode(connected=True)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sb

In [3]:
import pandas as pd
import glob, os 
import numpy as np

# Directory (relative to the notebook) that holds the raw "Stud*" CSV exports.
DATA_DIR = 'csv_files'

# Later cells glob and write with paths relative to this directory, so it is
# made the working directory. The guard keeps the cell idempotent: re-running
# it must not try to descend into csv_files/csv_files.
if os.path.basename(os.getcwd()) != DATA_DIR:
    os.chdir(DATA_DIR)

# Empty placeholder; the merge cell replaces it with the combined data.
results = pd.DataFrame([])

In [4]:
# Merge every "Stud*" CSV in the working directory into a single frame.
# The file list is sorted so the row order is reproducible between runs.
csv_paths = sorted(glob.glob('Stud*'))
if not csv_paths:
    raise FileNotFoundError(f"No 'Stud*' CSV files found in {os.getcwd()}")

# The first 6 lines of each file are skipped.
# NOTE(review): presumably a report header above the column row — confirm.
#
# All files are read first and concatenated once. Growing the frame with
# DataFrame.append inside the loop copied it on every pass, duplicated rows
# whenever the cell was re-run, and no longer exists in pandas 2.x.
frames = [pd.read_csv(path, skiprows=6) for path in csv_paths]

# sort=True makes the column sorting explicit, which is what the append-based
# version did implicitly (with a FutureWarning) when file columns differed.
results = pd.concat(frames, sort=True)

# The combined file is written before the index is reset, so it keeps each
# source file's original row labels, exactly as before.
results.to_csv('../csv_files/combined.csv')
results = results.reset_index(drop=True)


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





In [5]:
# Keep only the rows that carry a district name. The rows dropped here are
# mainly the empty ones left over from the footer of each source file.
has_district = results['DISTRICT NAME'].notna()
df = results.loc[has_district].reset_index(drop=True)
df.head(3)

Unnamed: 0,AGG_LEVEL,CHARTER STATUS,DISTRICT NAME,DISTRICT NUMBER,REGION,SCOPE,TOTAL AT RISK STUDENTS,TOTAL BILINGUAL STUDENTS,TOTAL CTE STUDENTS,TOTAL DYSLEXIC STUDENTS,...,TOTAL FOSTER CARE STUDENTS,TOTAL G & T STUDENTS,TOTAL HOMELESS STUDENTS,TOTAL LEP STUDENTS,TOTAL MILITARY CONNECTED STUDENTS,TOTAL PRE-K FOSTER STUDENTS,TOTAL PRE-K MILITARY STUDENTS,TOTAL STUDENTS RECEIVING SPECIAL EDUCATION SERVICES,TOTAL TITLE I STUDENTS,YEAR
0,DISTRICT,,CAYUGA ISD,1902.0,7.0,STATE,224.0,0.0,153.0,37.0,...,-9999999.0,63.0,,-9999999.0,5.0,,,80.0,570.0,2015-2016
1,DISTRICT,,ELKHART ISD,1903.0,7.0,STATE,380.0,0.0,304.0,40.0,...,14.0,45.0,,19.0,0.0,,,144.0,1244.0,2015-2016
2,DISTRICT,,FRANKSTON ISD,1904.0,7.0,STATE,338.0,0.0,273.0,39.0,...,11.0,60.0,,19.0,16.0,,,70.0,841.0,2015-2016


In [6]:
# Sentinel the source data uses for masked (suppressed) counts.
MASKED_VALUE = -9999999

# Stand-in for a masked count: 1 percent of the mean total enrollment of the
# districts whose Title I count is masked (negative).
# If no Title I value is masked this is NaN.
t1_is_masked = df['TOTAL TITLE I STUDENTS'] < 0
t1_missing_val = df.loc[t1_is_masked, 'TOTAL ENROLLMENT'].mean() * 0.01

# Working subset of the original df with the sentinel replaced by the stand-in.
# replace() returns a new frame, so its result has to be kept; previously it
# was discarded and the sentinel stayed in wk_df. Building wk_df from that
# new frame also means the PCT_T1 assignment below no longer writes to a
# slice of df (SettingWithCopyWarning).
wk_cols = ['YEAR', 'DISTRICT NAME', 'DISTRICT NUMBER', 'REGION',
           'TOTAL ENROLLMENT', 'TOTAL TITLE I STUDENTS']
wk_df = df[wk_cols].replace(MASKED_VALUE, t1_missing_val)

# Share of enrolled students that are Title I, in percent.
wk_df['PCT_T1'] = (wk_df['TOTAL TITLE I STUDENTS'] / wk_df['TOTAL ENROLLMENT']) * 100
wk_df = wk_df.round()

In [7]:
def yr_enrollment(limit, year='2018-2019', frame=None):
    '''Rank districts by total enrollment for one academic year.

    limit: integer or float; only districts whose TOTAL ENROLLMENT is
        greater than or equal to this value are kept.
    year: string naming the academic year to keep, e.g. '2018-2019'.
    frame: optional DataFrame to query instead of the notebook-level wk_df.
        It must contain the columns YEAR, DISTRICT NUMBER, DISTRICT NAME,
        TOTAL ENROLLMENT and TOTAL TITLE I STUDENTS.

    Returns a DataFrame sorted by TOTAL ENROLLMENT (largest first) with a
    PCT_T1 column added and numeric values rounded to whole numbers.
    If limit is not a number, a hint is printed and None is returned.
    '''
    # Validate the threshold explicitly instead of wrapping everything in a
    # blanket try/except: real problems (e.g. a missing column) now surface
    # as their own exception rather than as a misleading hint.
    if not isinstance(limit, (int, float, np.integer, np.floating)):
        print('Please read the docstring notes')
        return None

    source = wk_df if frame is None else frame
    columns = ['DISTRICT NUMBER', 'DISTRICT NAME',
               'TOTAL ENROLLMENT', 'TOTAL TITLE I STUDENTS']

    selected = (source['TOTAL ENROLLMENT'] >= limit) & (source['YEAR'] == year)
    data = (
        source.loc[selected, columns]
        .sort_values('TOTAL ENROLLMENT', ascending=False)
        .reset_index(drop=True)
    )
    # Percentage of enrolled students that are Title I.
    data['PCT_T1'] = (data['TOTAL TITLE I STUDENTS'] / data['TOTAL ENROLLMENT']) * 100
    return data.round()

In [37]:
# Statewide totals for the 2018-2019 academic year.
# NOTE(review): these sums read df rather than wk_df, so any masked sentinel
# values in these columns would be included as-is — confirm that is intended.
current_year = df[df['YEAR'] == '2018-2019']
tx_enroll = current_year['TOTAL ENROLLMENT'].sum()
tx_t1enroll = current_year['TOTAL TITLE I STUDENTS'].sum()
# Title I share of total enrollment, as a rounded percentage.
pct_t1 = ((tx_t1enroll / tx_enroll) * 100).round()

In [39]:
# Report the statewide figures computed in the previous cell, one per line.
summary_lines = (
    f'Texas total current enrollment: {tx_enroll}.',
    f'Texas total current title 1 enrollment: {tx_t1enroll}.',
    f'Texas total title 1 percent enrollment: {pct_t1} percent.',
)
print('\n'.join(summary_lines))

Texas total current enrollment: 5431910.0.
Texas total current title 1 enrollment: 3524974.0.
Texas total title 1 percent enrollment: 65.0 percent.


In [41]:
# Districts with at least 65,000 students enrolled in the default year
# (2018-2019), largest first.
yr_enrollment(65000)

Unnamed: 0,DISTRICT NUMBER,DISTRICT NAME,TOTAL ENROLLMENT,TOTAL TITLE I STUDENTS,PCT_T1
0,101912.0,HOUSTON ISD,209772.0,191412.0,91.0
1,57905.0,DALLAS ISD,155119.0,150628.0,97.0
2,101907.0,CYPRESS-FAIRBANKS ISD,116512.0,51658.0,44.0
3,15915.0,NORTHSIDE ISD,106501.0,37151.0,35.0
4,220905.0,FORT WORTH ISD,84510.0,81088.0,96.0
5,227901.0,AUSTIN ISD,80032.0,40203.0,50.0
6,101914.0,KATY ISD,79913.0,14254.0,18.0
7,79907.0,FORT BEND ISD,76122.0,15198.0,20.0
8,101902.0,ALDINE ISD,66854.0,66714.0,100.0
9,15910.0,NORTH EAST ISD,65186.0,15908.0,24.0


In [17]:
# District names of the ISDs in and around the Austin area.
aus_area_schools = [
    'AUSTIN ISD',
    'DEL VALLE ISD',
    'DRIPPING SPRINGS ISD',
    'EANES ISD',
    'GEORGETOWN ISD',
    'HAYS CISD',
    'HUTTO ISD',
    'LAKE TRAVIS ISD',
    'LEANDER ISD',
    'PFLUGERVILLE ISD',
    'ROUND ROCK ISD',
    'MANOR ISD',
    'LAGO VISTA ISD',
]

In [18]:
# Restrict the working frame to the districts listed in aus_area_schools.
in_austin_area = wk_df['DISTRICT NAME'].isin(aus_area_schools)
aus_area_df = wk_df.loc[in_austin_area].reset_index(drop=True)
aus_area_df.head(3)

Unnamed: 0,YEAR,DISTRICT NAME,DISTRICT NUMBER,REGION,TOTAL ENROLLMENT,TOTAL TITLE I STUDENTS,PCT_T1
0,2015-2016,DRIPPING SPRINGS ISD,105904.0,13.0,5619.0,1822.0,32.0
1,2015-2016,HAYS CISD,105906.0,13.0,18654.0,10392.0,56.0
2,2015-2016,AUSTIN ISD,227901.0,13.0,83648.0,45120.0,54.0


In [19]:
# Mean total enrollment per (district, academic year), flattened back into
# ordinary columns so it can be pivoted for plotting.
wk_plot = (
    aus_area_df
    .groupby(["DISTRICT NAME", "YEAR"])['TOTAL ENROLLMENT']
    .mean()
    .reset_index()
)

In [20]:
# Plot enrollment for all Austin-area ISDs as a bar chart, with districts as
# the pivot index and academic years as the pivot columns.
wk_pivot = wk_plot.pivot(index='DISTRICT NAME', columns='YEAR')
cf.set_config_file(offline=True, world_readable=False, theme='ggplot')
# Title spelling corrected ("Autin" -> "Austin").
wk_pivot.iplot(kind='bar', title='Austin Area ISDs')


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [21]:
# Mean Title I student count per (district, academic year), flattened back
# into ordinary columns so it can be pivoted for plotting.
t1_plot = (
    aus_area_df
    .groupby(["DISTRICT NAME", "YEAR"])['TOTAL TITLE I STUDENTS']
    .mean()
    .reset_index()
)

In [22]:
# Plot Title I student counts for the Austin-area ISDs as a bar chart, with
# districts as the pivot index and academic years as the pivot columns.
t1_pivot = t1_plot.pivot(index='DISTRICT NAME', columns='YEAR')
cf.set_config_file(offline=True, world_readable=False, theme='ggplot')
# Title spelling corrected ("Autin" -> "Austin").
t1_pivot.iplot(kind='bar', title='Austin Area ISDs Title 1 Status')

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [23]:
# Mean Title I percentage per (district, academic year), flattened back into
# ordinary columns so it can be pivoted for plotting.
t1_pct_plot = (
    aus_area_df
    .groupby(["DISTRICT NAME", "YEAR"])['PCT_T1']
    .mean()
    .reset_index()
)

In [24]:
# Plot the Title I percentage for the Austin-area ISDs as a bar chart, with
# districts as the pivot index and academic years as the pivot columns.
t1_pct_pivot = t1_pct_plot.pivot(index='DISTRICT NAME', columns='YEAR')
cf.set_config_file(offline=True, world_readable=False, theme='ggplot')
# Title spelling corrected ("Autin" -> "Austin", "Pecent" -> "Percent").
t1_pct_pivot.iplot(kind='bar', title='Austin Area ISDs Title 1 Percent Status')

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.
