In [1]:
# Set Working Directory
import os
os.chdir('..')

In [2]:
# Load Requirements
import warnings
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Imputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from tpot import TPOTClassifier
from sklearn.metrics import recall_score, precision_score
from sklearn.externals import joblib

# Prepare data for classificaiton

In [3]:
# Load Data
wrangled_data = pd.read_csv('output/wrangled_data_ii.csv', dtype='str', na_values='NaN')

In [4]:
def prep_columns_classification(wrangled_data_ii, feature_set=None):
    """
    Return X_train, X_dev, X_test, y_train, y_dev, y_test: predictors (X) and label (y) of train, dev, and test sets (0.8, 0.1, 0.1 split)

    param dataframe wrangled_data_ii: dataframe of wrangled dataframe (after rename/reduce columns)
    param list feature_set: list of features to include
    """

    prepped_classification_data = wrangled_data_ii.copy()

    # Specify features to include in model
    if feature_set:
        pass
    else:
        feature_set = wrangled_data_ii.drop(columns=['NAME', 'LEAID', 'exist_five_years']).columns
    X = wrangled_data_ii[feature_set].copy()

    # Identify column types
    identifying_columns = ['NAME', 'LEAID']
    prediction_columns = ['exist_five_years']
    categorical_columns = ['lowest_grade', 'highest_grade', 'charter_status']
    boolean_columns = ['bureau_indian_education']
    numerical_columns = []

    # identify numerical columns
    for column in X.columns:
        if column in identifying_columns or column in categorical_columns or column in boolean_columns or column in prediction_columns:
            pass
        elif len(X[column].unique()) > 100:
            numerical_columns.append(column)
        else:
            categorical_columns.append(column)

    X[numerical_columns] = X[numerical_columns].astype(float)

    # one hot encode categorical variables
    X = pd.get_dummies(X, prefix_sep='_', columns=categorical_columns, drop_first=True)

    # Split into train and test sets
    y = wrangled_data_ii[prediction_columns].apply(lambda x: x=='False')
    y = y.values.ravel()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)
    X_dev, X_test, y_dev, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=21)

    return X_train, y_train, X_dev, y_dev, X_test, y_test

In [5]:
X_train, y_train, X_dev, y_dev, X_test, y_test = prep_columns_classification(wrangled_data)

In [6]:
y = np.concatenate([y_train, y_dev, y_test])
sum(y)/len(y)

0.06011372867587327

# Test models

In [7]:
def test_model(clf_pipeline, X_train, y_train, X_dev, y_dev):
    """
    Return sklearn pipeline object clf_pipeline: trained sklearn pipeline
    Return recall_train, recall_dev, precision_train, precision_dev: recall and precision of training and development sets
    
    param sklearn pipeline object clf_pipeline: untrained sklearn pipeline
    param np.array X_train, y_train, X_dev, y_dev: feature (X) and labels (y) of training and development sets
    """
    
    warnings.filterwarnings('ignore', category=DeprecationWarning)
    
    # Fit pipeline using training data
    clf_pipeline.fit(X_train, y_train) 

    # Get predictions for training and development sets
    train_predictions = clf_pipeline.predict(X_train)
    dev_predictions = clf_pipeline.predict(X_dev)
    
    # Calculate recall and precision of training and development sets
    recall_train = recall_score(y_train, train_predictions)
    recall_dev = recall_score(y_dev, dev_predictions)
    precision_train = precision_score(y_train, train_predictions)
    precision_dev = precision_score(y_dev, dev_predictions)

    return clf_pipeline, recall_train, recall_dev, precision_train, precision_dev

In [8]:
wrangled_data_woNA = wrangled_data.dropna().copy()
X_train_woNA, y_train_woNA, X_dev_woNA, y_dev_woNA, X_test_woNA, y_test_woNA = prep_columns_classification(wrangled_data_woNA)

baseline_pipeline = Pipeline([ 
    ('clf', LogisticRegression())
])

baseline_pipeline, recall_train, recall_dev, precision_train, precision_dev = test_model(baseline_pipeline, X_train_woNA, y_train_woNA, X_dev_woNA, y_dev_woNA)

print("Recall on training data:       {}".format(recall_train))
print("Recall on development data:    {}".format(recall_dev))
print("Precision on training data:    {}".format(precision_train))
print("Precision on development data: {}".format(precision_dev))

Recall on training data:       0.018
Recall on development data:    0.043478260869565216
Precision on training data:    0.2571428571428571
Precision on development data: 0.42857142857142855


In [9]:
minmax_pipeline = Pipeline([ 
    ('scaling', MinMaxScaler()),
    ('clf', LogisticRegression())
])
minmax_pipeline, recall_train, recall_dev, precision_train, precision_dev = test_model(minmax_pipeline, X_train_woNA, y_train_woNA, X_dev_woNA, y_dev_woNA)

print("Recall on training data:       {}".format(recall_train))
print("Recall on development data:    {}".format(recall_dev))
print("Precision on training data:    {}".format(precision_train))
print("Precision on development data: {}".format(precision_dev))

Recall on training data:       0.134
Recall on development data:    0.11594202898550725
Precision on training data:    0.9710144927536232
Precision on development data: 1.0


In [10]:
weighting_pipeline = Pipeline([ 
    ('scaling', MinMaxScaler()),
    ('clf', LogisticRegression(class_weight='balanced'))
])
weighting_pipeline, recall_train, recall_dev, precision_train, precision_dev = test_model(weighting_pipeline, X_train_woNA, y_train_woNA, X_dev_woNA, y_dev_woNA)

print("Recall on training data:       {}".format(recall_train))
print("Recall on development data:    {}".format(recall_dev))
print("Precision on training data:    {}".format(precision_train))
print("Precision on development data: {}".format(precision_dev))

Recall on training data:       0.808
Recall on development data:    0.8405797101449275
Precision on training data:    0.1805183199285076
Precision on development data: 0.22137404580152673


In [11]:
XGB_pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('clf', XGBClassifier(scale_pos_weight=(len(y_train)-sum(y_train))/sum(y_train)))
])
XGB_pipeline, recall_train, recall_dev, precision_train, precision_dev = test_model(XGB_pipeline, X_train_woNA, y_train_woNA, X_dev_woNA, y_dev_woNA)

print("Recall on training data:       {}".format(recall_train))
print("Recall on development data:    {}".format(recall_dev))
print("Precision on training data:    {}".format(precision_train))
print("Precision on development data: {}".format(precision_dev))

Recall on training data:       0.936
Recall on development data:    0.7681159420289855
Precision on training data:    0.2881773399014778
Precision on development data: 0.2849462365591398


In [12]:
PCA_pipeline = Pipeline([
    ('pca', PCA(n_components=20)),
    ('scaler', MinMaxScaler()),
    ('clf', XGBClassifier(scale_pos_weight=(len(y_train)-sum(y_train))/sum(y_train)))
])
PCA_pipeline, recall_train, recall_dev, precision_train, precision_dev = test_model(PCA_pipeline, X_train_woNA, y_train_woNA, X_dev_woNA, y_dev_woNA)

print("Recall on training data:       {}".format(recall_train))
print("Recall on development data:    {}".format(recall_dev))
print("Precision on training data:    {}".format(precision_train))
print("Precision on development data: {}".format(precision_dev))

Recall on training data:       0.872
Recall on development data:    0.6811594202898551
Precision on training data:    0.21920563097033685
Precision on development data: 0.1821705426356589


In [13]:
impute_pipeline = Pipeline([
    ('imp', Imputer(missing_values='NaN', strategy='most_frequent', axis=0)),
    ('scaler', MinMaxScaler()),
    ('clf', XGBClassifier(scale_pos_weight=(len(y_train)-sum(y_train))/sum(y_train)))
])
impute_pipeline, recall_train, recall_dev, precision_train, precision_dev = test_model(impute_pipeline, X_train, y_train, X_dev, y_dev)

print("Recall on training data:       {}".format(recall_train))
print("Recall on development data:    {}".format(recall_dev))
print("Precision on training data:    {}".format(precision_train))
print("Precision on development data: {}".format(precision_dev))

Recall on training data:       0.9395667046750285
Recall on development data:    0.8
Precision on training data:    0.28084526244035446
Precision on development data: 0.2413793103448276


In [14]:
impute_pipeline = Pipeline([
    ('imp', Imputer(missing_values='NaN', strategy='mean', axis=0)),
    ('scaler', MinMaxScaler()),
    ('clf', XGBClassifier(scale_pos_weight=(len(y_train)-sum(y_train))/sum(y_train)))
])
impute_pipeline, recall_train, recall_dev, precision_train, precision_dev = test_model(impute_pipeline, X_train, y_train, X_dev, y_dev)

print("Recall on training data:       {}".format(recall_train))
print("Recall on development data:    {}".format(recall_dev))
print("Precision on training data:    {}".format(precision_train))
print("Precision on development data: {}".format(precision_dev))

Recall on training data:       0.9179019384264538
Recall on development data:    0.8095238095238095
Precision on training data:    0.28505665722379603
Precision on development data: 0.23943661971830985


In [15]:
impute_pipeline = Pipeline([
    ('imp', Imputer(missing_values='NaN', strategy='median', axis=0)),
    ('scaler', MinMaxScaler()),
    ('clf', XGBClassifier(scale_pos_weight=(len(y_train)-sum(y_train))/sum(y_train)))
])
impute_pipeline, recall_train, recall_dev, precision_train, precision_dev = test_model(impute_pipeline, X_train, y_train, X_dev, y_dev)

print("Recall on training data:       {}".format(recall_train))
print("Recall on development data:    {}".format(recall_dev))
print("Precision on training data:    {}".format(precision_train))
print("Precision on development data: {}".format(precision_dev))

Recall on training data:       0.9372862029646523
Recall on development data:    0.8095238095238095
Precision on training data:    0.276674520363514
Precision on development data: 0.23480662983425415


In [16]:
features_10 = ['total_local_revenue', 
               'total_state_revenue', 
               'total_federal_revenue',
               'teachers_total', 
               'charter_status', 
               'white_students', 
               'total_schools',
               'total_students',
               'lowest_grade', 
               'highest_grade']

X_train, y_train, X_dev, y_dev, X_test, y_test = prep_columns_classification(wrangled_data, features_10)

In [17]:
tenfeatures_pipeline = Pipeline([
    ('imp', Imputer(missing_values='NaN', strategy='median', axis=0)),
    ('scaler', MinMaxScaler()),
    ('clf', XGBClassifier(scale_pos_weight=(len(y_train)-sum(y_train))/sum(y_train)))
])
tenfeatures_pipeline, recall_train, recall_dev, precision_train, precision_dev = test_model(tenfeatures_pipeline, X_train, y_train, X_dev, y_dev)

print("Recall on training data:       {}".format(recall_train))
print("Recall on development data:    {}".format(recall_dev))
print("Precision on training data:    {}".format(precision_train))
print("Precision on development data: {}".format(precision_dev))

Recall on training data:       0.8620296465222349
Recall on development data:    0.7904761904761904
Precision on training data:    0.2644281217208814
Precision on development data: 0.2292817679558011


In [18]:
features_15 = ['total_local_revenue', 
               'total_state_revenue', 
               'total_federal_revenue',
               'teachers_total', 
               'charter_status', 
               'white_students', 
               'total_schools',
               'total_students',
               'lowest_grade', 
               'highest_grade', 
               'state_name', 
               'total_expenditure', 
               'administrators_school', 
               'metro_micro', 
               'white_male_students']

X_train, y_train, X_dev, y_dev, X_test, y_test = prep_columns_classification(wrangled_data, features_15)

In [19]:
fifteenfeatures_pipeline = Pipeline([
    ('imp', Imputer(missing_values='NaN', strategy='median', axis=0)),
    ('scaler', MinMaxScaler()),
    ('clf', XGBClassifier(scale_pos_weight=(len(y_train)-sum(y_train))/sum(y_train)))
])
fifteenfeatures_pipeline, recall_train, recall_dev, precision_train, precision_dev = test_model(fifteenfeatures_pipeline, X_train, y_train, X_dev, y_dev)

print("Recall on training data:       {}".format(recall_train))
print("Recall on development data:    {}".format(recall_dev))
print("Precision on training data:    {}".format(precision_train))
print("Precision on development data: {}".format(precision_dev))

Recall on training data:       0.8962371721778791
Recall on development data:    0.8476190476190476
Precision on training data:    0.2661699966136133
Precision on development data: 0.23177083333333334


In [20]:
filename = 'output/pipelines/classify_pipeline.sav'
joblib.dump(fifteenfeatures_pipeline, filename)

['output/pipelines/classify_pipeline.sav']

In [21]:
X = pd.concat([X_train, X_dev, X_test])
y = np.concatenate([y_train, y_dev, y_test])
predictions = fifteenfeatures_pipeline.predict(X)
classification_frame = pd.DataFrame({'LEAID': wrangled_data.loc[X.index, 'LEAID'], 
                                     'NAME': wrangled_data.loc[X.index, 'NAME'], 
                                     'Close_Five_Years_Actual': y, 
                                     'Close_Five_Years_Prediction': predictions})
classification_frame.to_csv('output/classification_results.csv', index=False)

# Use Features from EDA

In [22]:
# Load Data
wrangled_data = pd.read_csv('output/wrangled_data_ii.csv', dtype='str', na_values='NaN')

keep_features = ['total_students',
                 'total_schools',
                 'teachers_total',
                 'total_revenue',
                 'total_federal_revenue',
                 'total_state_revenue',
                 'total_local_revenue',
                 'total_expenditure',
                 'total_salaries',
                 'white_students',
                 'lowest_grade',
                 'highest_grade',
                 'metro_micro',
                 'charter_status',
                 'state_name']


X_train, y_train, X_dev, y_dev, X_test, y_test = prep_columns_classification(wrangled_data, keep_features)

In [23]:
clf_pipeline = Pipeline([
    ('imp', Imputer(missing_values='NaN', strategy='median', axis=0)),
    ('scaler', MinMaxScaler()),
    ('clf', XGBClassifier(scale_pos_weight=(len(y_train)-sum(y_train))/sum(y_train)))
])
clf_pipeline, recall_train, recall_dev, precision_train, precision_dev = test_model(clf_pipeline, X_train, y_train, X_dev, y_dev)

print("Recall on training data:       {}".format(recall_train))
print("Recall on development data:    {}".format(recall_dev))
print("Precision on training data:    {}".format(precision_train))
print("Precision on development data: {}".format(precision_dev))

Recall on training data:       0.8962371721778791
Recall on development data:    0.8571428571428571
Precision on training data:    0.26061007957559684
Precision on development data: 0.2349869451697128


In [24]:
filename = 'output/pipelines/classify_pipeline.sav'
joblib.dump(clf_pipeline, filename)

['output/pipelines/classify_pipeline.sav']

In [25]:
X = pd.concat([X_train, X_dev, X_test])
y = np.concatenate([y_train, y_dev, y_test])
predictions = clf_pipeline.predict(X)
classification_frame = pd.DataFrame({'LEAID': wrangled_data.loc[X.index, 'LEAID'], 
                                     'NAME': wrangled_data.loc[X.index, 'NAME'], 
                                     'Close_Five_Years_Actual': y, 
                                     'Close_Five_Years_Prediction': predictions})
classification_frame.to_csv('output/classification_results.csv', index=False)