In [1]:
# Set up imports
import pandas as pd
import numpy as np
import scipy.stats as stats
import sqlalchemy as sql
import matplotlib.pyplot as plt

In [2]:
categories_dict = {'GENDER': ['gender', 'demographic'], \
                   'ELL': ['language', 'demographic'], \
                   'D_CODE': ['d-code', 'discipline'], \
                   'IEP': ['disability', 'demographic'], \
                   'LENGTH_IN_DAYS': ['removal time', 'discipline'], \
                   'RACE': ['race', 'demographic'], \
                   'Students_In_Temporary_Housing': ['housing', 'demographic'] \
                  }

discipline_dict = {'removals':'r', \
                   'r': 'r', \
                   'principal':'p', \
                   'p': 'p', \
                   'superintendent': 's', \
                   's': 's', \
                   'expulsions':'e', \
                   'e': 'e', \
                   'removals/suspensions':'all'}



def general_clean(file, category, df):
    rawdf = pd.read_csv('res/2016-2017_Student_Discipline_Annual_Report_-_' + file + '.csv')
    rawdf.columns = rawdf.columns.map(lambda x: x.lower())
    
    rawdf.drop(['location name', 'location category', 'administrative district'], axis=1, inplace=True)
    if 'sy1617 total removals/suspensions' in rawdf.columns:
        rawdf.drop(['sy1617 total removals/suspensions'], axis=1, inplace=True)
 

    normdf = pd.melt(rawdf, id_vars=['dbn'])
    
    normdf.rename(columns={'value':'number'}, inplace=True)
    
    normdf['variable'] = normdf['variable'].map(lambda x: x.split(' '))
    
    normdf['discipline_type'] = [v[-1] for v in normdf['variable']] 
    normdf['discipline_type'] = normdf['discipline_type'].map(discipline_dict)
    
    if file == 'D_CODE':
        normdf['var_name'] = [v[0] for v in normdf['variable']]
    else:
        normdf['var_name'] = [' '.join(v[:-1]) for v in normdf['variable']]
    
    
    
    normdf.drop('variable', axis=1, inplace=True)
    
    normdf['var_cat'] = [category[0] for i in normdf.index]
    normdf['var_type'] = [category[1] for i in normdf.index]
    
    return df.append(normdf, ignore_index=True)

In [3]:
#### Start with RPS Total so yearly total columns can be dropped from other tables

total_raw = pd.read_csv('res/2016-2017_Student_Discipline_Annual_Report_-_RPS_TOTALS.csv')

total_raw.columns = total_raw.columns.map(lambda x: x.lower())
total_raw.drop(columns=['location name', 'location category', 'administrative district'], axis=1, inplace=True)

suspensionsdf = pd.melt(total_raw, id_vars=['dbn'])

suspensionsdf.rename(columns={'variable':'var_name','value':'number'}, inplace=True)

suspensionsdf['var_name'] = suspensionsdf['var_name'].map(lambda x: 'total discipline' if x == 'sy1617 total removals/suspensions' else x)

discipline_dict_2 = {'removals':'r', 'principal':'p', 'superintendent': 's', 'expulsions':'e', 'total discipline':'all discipline types'}
suspensionsdf['discipline_type'] = suspensionsdf['var_name'].map(discipline_dict_2)

suspensionsdf['var_cat'] = ['discipline type' for v in suspensionsdf['var_name']]
suspensionsdf['var_type'] = ['discipline' for v in suspensionsdf['var_name']]

suspensionsdf = suspensionsdf[['dbn', 'number', 'discipline_type', 'var_name', 'var_cat', 'var_type']]

suspensionsdf.head()

Unnamed: 0,dbn,number,discipline_type,var_name,var_cat,var_type
0,01M015,0,r,removals,discipline type,discipline
1,01M019,0,r,removals,discipline type,discipline
2,01M020,R,r,removals,discipline type,discipline
3,01M034,R,r,removals,discipline type,discipline
4,01M063,23,r,removals,discipline type,discipline


In [4]:
for file, category in categories_dict.items():
    suspensionsdf = general_clean(file, category, suspensionsdf)
    print(file)

print(len(suspensionsdf))

GENDER
ELL
D_CODE
IEP
LENGTH_IN_DAYS
RACE
Students_In_Temporary_Housing
526140


In [5]:
suspensionsdf['year'] = [2016 for r in suspensionsdf.index]

# suspensionsdf['number'] = suspensionsdf['number'].map(lambda x: np.nan if x == 'R' else np.int64(x))
suspensionsdf['var_name'] = suspensionsdf['var_name'].map(lambda x: 'non-swd' if x=='gen ed' else x)


In [None]:
# suspensionsdf.loc[suspensionsdf['dbn'] == '05M285'].loc[suspensionsdf['var_cat']=='disability']
# print(set(zip(suspensionsdf['var_name'], suspensionsdf['var_cat'], suspensionsdf['var_type'])))
tup = (20,) + tuple(suspensionsdf.loc[20])
print(tup)

In [6]:
conn = sql.create_engine('sqlite:///db/nycedudata.db')

conn.execute('DROP TABLE IF EXISTS Removals;')

demographics_table = sa.Table('Demographics', meta,\
                sa.Column('demo_id', sqlite.INTEGER, primary_key=True),\
                sa.Column('dbn', sqlite.TEXT,sa.ForeignKey('Schools.dbn',onupdate='CASCADE',ondelete='SET NULL')),\
                sa.Column('year', sqlite.INTEGER,sa.ForeignKey('Years.year',onupdate='CASCADE',ondelete='SET NULL')),\
                sa.Column('total_enrollment',sqlite.TEXT),\
                sa.Column('demo_cat',sqlite.TEXT),\
                sa.Column('demo_var',sqlite.TEXT),\
                sa.Column('demo_num',sqlite.INTEGER),\
                sa.Column('demo_pct',sqlite.REAL),\
                sa.ForeignKeyConstraint(['demo_cat','demo_var'],['Demo_Categories.demo_cat','Demo_Categories.demo_var'],onupdate='CASCADE',ondelete='SET NULL'),\
                sqlite_autoincrement=True)

for i in suspensionsdf.index:
    values = (i,) + tuple(suspensionsdf.loc[i])
    conn.execute('INSERT INTO Removals VALUES (?, ?,?,?,?,?,?,?);',values)


test = conn.execute('SELECT * FROM Removal LIMIT 5;').fetchall()
print(test)

KeyboardInterrupt: 

In [7]:
i

14659