In [1]:
# Set up imports
import pandas as pd
import numpy as np
import scipy.stats as stats
import sqlalchemy as sql
import matplotlib.pyplot as plt
import sqlalchemy.dialects.sqlite as sqlite

In [2]:
# Maps each breakdown file suffix to [var_category, var_type].
# Insertion order matters: it is the order in which the files are loaded.
categories_dict = {
    file_key: [var_category, var_type]
    for file_key, var_category, var_type in (
        ('GENDER', 'gender', 'demographic'),
        ('ELL', 'language', 'demographic'),
        ('D_CODE', 'd-code', 'discipline'),
        ('IEP', 'disability', 'demographic'),
        ('LENGTH_IN_DAYS', 'removal time', 'discipline'),
        ('RACE', 'race', 'demographic'),
        ('Students_In_Temporary_Housing', 'housing', 'demographic'),
    )
}

# Maps the trailing token of a column header to a discipline code.
# Both the long name and the one-letter code resolve to the code itself.
discipline_dict = {
    token: code
    for long_name, code in (
        ('removals', 'r'),
        ('principal', 'p'),
        ('superintendent', 's'),
        ('expulsions', 'e'),
    )
    for token in (long_name, code)
}
discipline_dict['removals/suspensions'] = 'all'



def general_clean(file, category, df, data_dir='res'):
    """Load one breakdown CSV, reshape it to long format and add it to ``df``.

    The CSV ``<data_dir>/2016-2017_Student_Discipline_Annual_Report_-_<file>.csv``
    is read, its headers are lower-cased, the location/district columns (and the
    yearly total column, when present) are dropped, and the remaining columns
    are melted into one row per (dbn, original column).

    Each original column header is split on spaces. The last token is looked
    up in the module-level ``discipline_dict`` to give ``discipline_type``
    (tokens missing from that dict become NaN). The remaining tokens form
    ``var_name``; for the ``'D_CODE'`` file only the first token is kept.

    Parameters
    ----------
    file : str
        File-name suffix identifying which breakdown CSV to load.
    category : sequence of two str
        ``category[0]`` fills ``var_category``; ``category[1]`` fills ``var_type``.
    df : pandas.DataFrame
        Frame the new rows are added to. It is not modified.
    data_dir : str, default 'res'
        Directory that holds the CSV files.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows of ``df`` followed by the reshaped rows,
        with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If any of the location/district columns is missing from the CSV.
    """
    csv_path = data_dir + '/2016-2017_Student_Discipline_Annual_Report_-_' + file + '.csv'
    rawdf = pd.read_csv(csv_path)
    rawdf.columns = rawdf.columns.str.lower()

    rawdf = rawdf.drop(columns=['location name', 'location category', 'administrative district'])
    # The yearly total only appears in some files, so its absence is not an error.
    rawdf = rawdf.drop(columns=['sy1617 total removals/suspensions'], errors='ignore')

    normdf = pd.melt(rawdf, id_vars=['dbn']).rename(columns={'value': 'number'})

    # Split each original header into tokens; the last token names the discipline.
    tokens = normdf['variable'].str.split(' ')
    normdf['discipline_type'] = tokens.str[-1].map(discipline_dict)

    if file == 'D_CODE':
        normdf['var_name'] = tokens.str[0]
    else:
        normdf['var_name'] = tokens.str[:-1].str.join(' ')

    normdf = normdf.drop(columns='variable')
    normdf['var_category'] = category[0]
    normdf['var_type'] = category[1]

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    return pd.concat([df, normdf], ignore_index=True)

In [None]:
#### Start with RPS Total so yearly total columns can be dropped from other tables

# Load the per-school totals, lower-case the headers and drop the descriptive columns.
total_raw = pd.read_csv('res/2016-2017_Student_Discipline_Annual_Report_-_RPS_TOTALS.csv')
total_raw.columns = total_raw.columns.str.lower()
total_raw = total_raw.drop(columns=['location name', 'location category', 'administrative district'])

# Maps the (renamed) column headers of the totals file to discipline codes.
discipline_dict_2 = {'removals':'r', 'principal':'p', 'superintendent': 's', 'expulsions':'e', 'total discipline':'all discipline types'}

# Long format: one row per (dbn, original column).
suspensionsdf = pd.melt(total_raw, id_vars=['dbn']).rename(
    columns={'variable': 'var_name', 'value': 'number'}
)

# Give the yearly total column a friendlier name; every other name is kept as is.
suspensionsdf['var_name'] = suspensionsdf['var_name'].where(
    suspensionsdf['var_name'] != 'sy1617 total removals/suspensions',
    'total discipline',
)

suspensionsdf = suspensionsdf.assign(
    discipline_type=lambda frame: frame['var_name'].map(discipline_dict_2),
    var_category='discipline type',
    var_type='discipline',
)[['dbn', 'number', 'discipline_type', 'var_name', 'var_category', 'var_type']]

suspensionsdf.head()

In [None]:
# Fold every breakdown file into the combined table, in categories_dict order.
# Each pass rebinds suspensionsdf to the frame returned by general_clean.
for file in categories_dict:
    category = categories_dict[file]
    suspensionsdf = general_clean(file, category, suspensionsdf)
    print(file)

print(len(suspensionsdf))

In [None]:
# Final shaping of the combined table before it is written to the database.
#
# 'R' entries in the count column are turned into NaN and everything else is
# parsed as a number. Using mask + pd.to_numeric (rather than np.int64 per
# value) keeps this cell re-runnable: values that are already NaN are passed
# through instead of raising "cannot convert float NaN to integer".
suspensionsdf = suspensionsdf.assign(
    year=2016,
    number=lambda frame: pd.to_numeric(frame['number'].mask(frame['number'] == 'R')),
    # 'gen ed' is relabelled 'non-swd'; every other name is kept as is.
    var_name=lambda frame: frame['var_name'].where(frame['var_name'] != 'gen ed', 'non-swd'),
    # The row index doubles as the primary key of the Removals table.
    removal_id=lambda frame: frame.index,
)[['removal_id', 'dbn', 'year', 'number', 'discipline_type', 'var_name', 'var_category', 'var_type']]

suspensionsdf.head()

In [None]:
# NOTE: despite its name, `conn` is an Engine; later cells reuse it under this name.
conn = sql.create_engine('sqlite:///db/nycedudata.db')
meta = sql.MetaData()

# Reflect the existing Schools table so the foreign key below can be resolved.
# `autoload_with` alone triggers reflection; the separate `autoload=True` flag
# was removed in SQLAlchemy 2.0.
schools_table = sql.Table('Schools', meta, autoload_with=conn)

suspensions_table = sql.Table(
    'Removals', meta,
    sql.Column('removal_id', sqlite.INTEGER, primary_key=True),
    sql.Column('dbn', sqlite.TEXT, sql.ForeignKey('Schools.dbn', onupdate='CASCADE', ondelete='SET NULL')),
    sql.Column('year', sqlite.INTEGER),
    sql.Column('number', sqlite.INTEGER),
    sql.Column('discipline_type', sqlite.TEXT),
    sql.Column('var_name', sqlite.TEXT),
    sql.Column('var_category', sqlite.TEXT),
    sql.Column('var_type', sqlite.TEXT),
    sqlite_autoincrement=True,
)

# Rebuild the table from scratch on every run. checkfirst=True makes the drop
# a no-op when the table does not exist yet (the equivalent of DROP TABLE IF
# EXISTS), without relying on raw-string Engine.execute, which SQLAlchemy 2.0
# removed.
suspensions_table.drop(conn, checkfirst=True)
suspensions_table.create(conn)


In [None]:
# One dict per row, keyed by column name, ready to be passed to a bulk insert.
values = suspensionsdf.to_dict('records')

# Show the first record as a quick sanity check.
print(values[0])

In [None]:
# Bulk-insert every record inside a single transaction, which is committed
# when the block exits without error. Engine.execute was removed in
# SQLAlchemy 2.0, so the statement runs on a connection obtained from the
# engine instead.
with conn.begin() as connection:
    connection.execute(suspensions_table.insert(), values)

In [None]:
# Read back a few rows to confirm the insert worked. Raw SQL strings must be
# wrapped in sql.text() and run on a connection; executing a plain string on
# the Engine was removed in SQLAlchemy 2.0.
with conn.connect() as connection:
    print(connection.execute(sql.text('SELECT * FROM Removals LIMIT 5;')).fetchall())