#### This data comes from the City of Chicago and is for all public schools in Chicago. Your task is to classify schools into probationary status (probation = 1) and non-probationary status (probation = 0). 

What are my pipeline steps:

1) import all my libraries I need

2) import train and test data

3) operationalize y (y = "probation")

4) write all my definitions

5) clean/transform data column by column for x
    (make sure each column has a deal with nan function)
    
6) create x with make_union

7) fit train to a model (try KNN and LogReg--remember LogReg needs one dummy dropped from each set of dummies)

8) score and crossvalidate model

9) if good, predict y hat with test data

10) export predictions to csv to upload to Kaggle

Note: when I am trying to figure out a good model I can use the train/test split on the training data to check it and also the cross validation techniques and other tests like GridSearchCV (to get the best_estimator_ figures). But those won't go in the final model.

In [1]:
#1) import stuff (can prob get rid of seaborn and matplotlib for final v)
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, Imputer, FunctionTransformer, LabelBinarizer
from sklearn.pipeline import make_pipeline, make_union
%matplotlib inline

In [2]:
#set view stuff (only needed for draft in Jupyter notebook)
pd.set_option('display.max_columns', 100)
pd.set_option("display.max_rows", 100)

In [3]:
#2) import train and test data
school_train = pd.read_csv('school_data_training.csv')
school_test = pd.read_csv('school_data_test.csv')

In [5]:
#3) operationalize y (y = "probation")
target_train = school_train["probation"]

In [8]:
#4) write all my definitions
#extract column
def col_extractor(df, col_name="column_name"):
    return df[col_name]
#I'm going to get back to this...I think it should work if I do some sort of nesting


In [21]:
def avg_student_col(df):
    return df["Average Student Attendance"]

In [31]:
def healthy_schools_col(df):
    return df["Healthy Schools Certified?"]

In [32]:
def misconduct_col(df):
    return df["Rate of Misconducts (per 100 students) "]

In [33]:
def avg_teacher_col(df):
    return df["Average Teacher Attendance"]

In [34]:
def iep_col(df):
    return df["Individualized Education Program Compliance Rate "]

In [35]:
def ward_col(df):
    return df["Ward"]

In [36]:
def police_d_col(df):
    return df["Police District"]

In [22]:
#strip percentage signs
def strip_percent(column):
    column = column.str.replace("%", "")
    column = column.apply(float)
    return column

In [26]:
def return_array(column):
    return column.values.reshape(-1, 1)

In [None]:
#5) make pipelines to clean/transform data column by column for x 
#(in the real world make sure each column has a deal with nan function
#but not necessary here)

In [28]:
avg_stud_pipe = make_pipeline(FunctionTransformer(avg_student_col, validate=False),
                         FunctionTransformer(strip_percent, validate=False),
                             FunctionTransformer(return_array, validate=False))

In [29]:
avg_stud_pipe

Pipeline(steps=[('functiontransformer-1', FunctionTransformer(accept_sparse=False,
          func=<function avg_student_col at 0x111283048>, inv_kw_args=None,
          inverse_func=None, kw_args=None, pass_y=False, validate=False)), ('functiontransformer-2', FunctionTransformer(accept_sparse=False,
        ...54950>, inv_kw_args=None,
          inverse_func=None, kw_args=None, pass_y=False, validate=False))])

In [37]:
avg_stud_pipe.fit_transform(school_train)[0:5]

array([[ 96.2],
       [ 96.6],
       [ 91.3],
       [ 84.8],
       [ 91. ]])

In [38]:
avg_teach_pipe = make_pipeline(FunctionTransformer(avg_teacher_col, validate=False),
                         FunctionTransformer(strip_percent, validate=False),
                         FunctionTransformer(return_array, validate=False))

In [41]:
miscon_pipe = make_pipeline(FunctionTransformer(misconduct_col, validate=False),
                         FunctionTransformer(return_array, validate=False))

In [48]:
iep_pipe = make_pipeline(FunctionTransformer(iep_col, validate=False),
                         FunctionTransformer(strip_percent, validate=False),
                         FunctionTransformer(return_array, validate=False))

In [43]:
police_pipe = make_pipeline(FunctionTransformer(police_d_col, validate=False),
                         FunctionTransformer(return_array, validate=False),
                         LabelBinarizer())

In [45]:
ward_pipe = make_pipeline(FunctionTransformer(ward_col, validate=False),
                         FunctionTransformer(return_array, validate=False),
                         LabelBinarizer())

In [46]:
ward_pipe.fit_transform(school_train)[0:5]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0]])

In [49]:
#6) create x with make_union
union = make_union(avg_stud_pipe, avg_teach_pipe, iep_pipe, miscon_pipe, police_pipe, ward_pipe)

In [50]:
union.fit_transform(school_train)

array([[  96.2,   97.4,   99. , ...,    0. ,    0. ,    0. ],
       [  96.6,   96.3,  100. , ...,    0. ,    0. ,    0. ],
       [  91.3,   95. ,  100. , ...,    0. ,    0. ,    0. ],
       ..., 
       [  93.3,   94.1,  100. , ...,    0. ,    0. ,    0. ],
       [  96.5,   96.3,  100. , ...,    0. ,    0. ,    0. ],
       [  92. ,   94.4,  100. , ...,    0. ,    0. ,    0. ]])

In [65]:
#then add standard scaler for knn
final_pipe = make_pipeline(union, StandardScaler())
final_pipe.fit_transform(school_train)

array([[ 0.65150232,  0.28657462,  0.03859543, ..., -0.09877296,
        -0.09877296, -0.11056645],
       [ 0.71986424,  0.15215699,  0.46928295, ..., -0.09877296,
        -0.09877296, -0.11056645],
       [-0.18593122, -0.00670022,  0.46928295, ..., -0.09877296,
        -0.09877296, -0.11056645],
       ..., 
       [ 0.15587839, -0.11667829,  0.46928295, ..., -0.09877296,
        -0.09877296, -0.11056645],
       [ 0.70277376,  0.15215699,  0.46928295, ..., -0.09877296,
        -0.09877296, -0.11056645],
       [-0.06629785, -0.08001893,  0.46928295, ..., -0.09877296,
        -0.09877296, -0.11056645]])

In [66]:
#7) fit train to a model 
#(try KNN and LogReg--remember LogReg needs one dummy 
#dropped from each set of dummies)
knn = KNeighborsClassifier()
knn.fit(final_pipe.fit_transform(school_train), target_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [None]:
#8) score and crossvalidate model
#Note: this is not part of the final pipeline--just for finding best model

In [67]:
knn.score(union.fit_transform(school_train), target_train)

0.72463768115942029

In [None]:
#9) if good, predict y hat with test data
#Note: this will be part of the final pipeline

In [71]:
#predict the model with the test dataset
predictions = knn.predict(final_pipe.transform(school_test))

In [None]:
#10) export predictions to csv to upload to Kaggle

In [72]:
#for now use R's solution:
def evaluation_transformation(dataset, predictions):
    dataset = dataset.join(pd.DataFrame(predictions, columns=['Prediction']))
    dataset[['Id', 'Prediction']].to_csv('submission2.csv', index=False)

In [73]:
evaluation_transformation(school_test, predictions)