In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier

# Box Plots for Education

Problem description [here](https://www.drivendata.org/competitions/46/box-plots-for-education-reboot/page/86/).

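**Note:** This is a [multiclass-multioutput](https://scikit-learn.org/stable/modules/multiclass.html) problem: every row must be assigned one class for each of several label columns, and each label column has more than two possible classes.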

In [2]:
# Download the data archive into ./education/ (-P sets the target directory;
# -nc skips the download when the file is already there).
# NOTE(review): no visible cell extracts this zip before TrainingData.csv is
# read below — confirm the extraction step happens somewhere.
!wget https://s3.amazonaws.com/drivendata/data/4/public/da1dd36a-a497-42c7-b3f3-4a225944bdba.zip -nc -P ./education/

File ‘./education/da1dd36a-a497-42c7-b3f3-4a225944bdba.zip’ already there; not retrieving.



In [3]:
# Load the training data; the first CSV column becomes the row index.
df = pd.read_csv(
    'education/TrainingData.csv',
    index_col=0,
)

# Preview the first few rows.
df.head()

Unnamed: 0,Function,Use,Sharing,Reporting,Student_Type,Position_Type,Object_Type,Pre_K,Operating_Status,Object_Description,...,Sub_Object_Description,Location_Description,FTE,Function_Description,Facility_or_Department,Position_Extra,Total,Program_Description,Fund_Description,Text_1
134338,Teacher Compensation,Instruction,School Reported,School,NO_LABEL,Teacher,NO_LABEL,NO_LABEL,PreK-12 Operating,,...,,,1.0,,,KINDERGARTEN,50471.81,KINDERGARTEN,General Fund,
206341,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,NO_LABEL,Non-Operating,CONTRACTOR SERVICES,...,,,,RGN GOB,,UNDESIGNATED,3477.86,BUILDING IMPROVEMENT SERVICES,,BUILDING IMPROVEMENT SERVICES
326408,Teacher Compensation,Instruction,School Reported,School,Unspecified,Teacher,Base Salary/Compensation,Non PreK,PreK-12 Operating,Personal Services - Teachers,...,,,1.0,,,TEACHER,62237.13,Instruction - Regular,General Purpose School,
364634,Substitute Compensation,Instruction,School Reported,School,Unspecified,Substitute,Benefits,NO_LABEL,PreK-12 Operating,EMPLOYEE BENEFITS,...,,,,UNALLOC BUDGETS/SCHOOLS,,PROFESSIONAL-INSTRUCTIONAL,22.3,GENERAL MIDDLE/JUNIOR HIGH SCH,,REGULAR INSTRUCTION
47683,Substitute Compensation,Instruction,School Reported,School,Unspecified,Teacher,Substitute Compensation,NO_LABEL,PreK-12 Operating,TEACHER COVERAGE FOR TEACHER,...,,,,NON-PROJECT,,PROFESSIONAL-INSTRUCTIONAL,54.166,GENERAL HIGH SCHOOL EDUCATION,,REGULAR INSTRUCTION


In [4]:
# Show each column's dtype and non-null count, which reveals how sparsely
# many of the feature columns are populated.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400277 entries, 134338 to 415831
Data columns (total 25 columns):
Function                  400277 non-null object
Use                       400277 non-null object
Sharing                   400277 non-null object
Reporting                 400277 non-null object
Student_Type              400277 non-null object
Position_Type             400277 non-null object
Object_Type               400277 non-null object
Pre_K                     400277 non-null object
Operating_Status          400277 non-null object
Object_Description        375493 non-null object
Text_2                    88217 non-null object
SubFund_Description       306855 non-null object
Job_Title_Description     292743 non-null object
Text_3                    109152 non-null object
Text_4                    53746 non-null object
Sub_Object_Description    91603 non-null object
Location_Description      162054 non-null object
FTE                       126071 non-null float64
Func

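# Train Test Split

The cell below only separates the label columns (the targets, `y`) from the remaining feature columns (`X`); it does not yet hold out a test set.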

In [5]:
# The nine columns the model has to predict.
label_cols = [
    'Function',
    'Object_Type',
    'Operating_Status',
    'Position_Type',
    'Pre_K',
    'Reporting',
    'Sharing',
    'Student_Type',
    'Use',
]

# Targets are the label columns; features are every other column.
y = df[label_cols]
X = df.drop(columns=label_cols)

# Attempt #1: Numerical Features Only

In [6]:
# Names of the numeric feature columns.
# Taken from X rather than df so that every listed column is guaranteed to
# exist in the frame handed to the pipeline (df also holds the label columns),
# and selected by numeric dtype instead of "anything that is not object".
num_features = X.select_dtypes(include='number').columns.tolist()

In [8]:
# Keep only the numeric columns and fill their missing values with the
# per-column median; every other column is dropped.
ct = ColumnTransformer(
    transformers=[
        ('imputer', SimpleImputer(strategy='median'), num_features),
    ],
    remainder='drop',
)

# Baseline model: median imputation followed by a decision tree.
# random_state is fixed so that refitting on the same data gives the same
# tree (the estimator is otherwise seeded from the global NumPy RNG).
one_feat_model = Pipeline([
    ('transformer', ct),
    ('classifier', DecisionTreeClassifier(random_state=0)),
])

In [None]:
# Fit the imputer + decision tree on all rows, with every label column in y
# as a separate output of the same model.
one_feat_model.fit(X,y)