# Predicting School Budgetting Labels from Descriptions

In [1]:
from __future__ import division
from __future__ import print_function
%matplotlib inline

# ignore deprecation warnings in sklearn
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import os
import sys

# add the 'src' directory as one where we can import modules
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)

from multilabel import multilabel_sample_dataframe, multilabel_train_test_split
from SparseInteractions import SparseInteractions
from metrics import multi_multi_log_loss

## Load Data


In [2]:
path_to_training_data = os.path.join(os.pardir,
                                     'data',
                                     'TrainingSet.csv')

df = pd.read_csv(path_to_training_data, index_col=0)

print(df.shape)

(400277, 25)


## Resample Data

Sampling down the 400,000 rows to 50,000 for ease of locally handling the huge dataset.

In [3]:
LABELS = ['Function',
          'Use',
          'Sharing',
          'Reporting',
          'Student_Type',
          'Position_Type',
          'Object_Type', 
          'Pre_K',
          'Operating_Status']

NON_LABELS = [c for c in df.columns if c not in LABELS]

SAMPLE_SIZE = 100000

sampling = multilabel_sample_dataframe(df,
                                       pd.get_dummies(df[LABELS]),
                                       size=SAMPLE_SIZE,
                                       min_count=25,
                                       seed=20)

dummy_labels = pd.get_dummies(sampling[LABELS])
print(sampling.shape)

X_train, X_test, y_train, y_test = multilabel_train_test_split(sampling[NON_LABELS],
                                                               dummy_labels,
                                                               0.2,
                                                               min_count=3,
                                                               seed=20)

(100000, 25)


## Preprocessing

We need tools to preprocess our text and numeric data. We'll create those tools here. The `combine_text_columns` function will take a DataFrame of text columns and return a single series where all of the text in the columns has been joined together.

We'll then create `FunctionTransformer` objects that select our text and numeric data from the dataframe.

Finally, we create a custom scoring method that uses the `multi_multi_log_loss` function that is the evaluation metric. This scoring method is based off the metric provided on DrivenData to test the model.

In [4]:
NUMERIC_COLUMNS = ['FTE', "Total"]

def combine_text_columns(data_frame, to_drop=NUMERIC_COLUMNS + LABELS):
    """ Takes the dataset as read in, drops the non-feature, non-text columns and
        then combines all of the text columns into a single vector that has all of
        the text for a row.
        
        :param data_frame: The data as read in with read_csv (no preprocessing necessary)
        :param to_drop (optional): Removes the numeric and label columns by default.
    """
    # drop non-text columns that are in the df
    to_drop = set(to_drop) & set(data_frame.columns.tolist())
    text_data = data_frame.drop(to_drop, axis=1)
    
    # replace nans with blanks
    text_data.fillna("", inplace=True)
    
    # joins all of the text items in a row (axis=1)
    # with a space in between
    return text_data.apply(lambda x: " ".join(x), axis=1)


In [5]:
from sklearn.preprocessing import FunctionTransformer

get_text_data = FunctionTransformer(combine_text_columns, validate=False)
get_numeric_data = FunctionTransformer(lambda x: x[NUMERIC_COLUMNS], validate=False)

In [6]:
get_text_data.fit_transform(sampling.head(5))

455     BONUSES                          LIBRARY/MEDIA...
483        Food Service Worker - Regular       FOOD SE...
978     WATER, SEWER AND CLEANING SERVICES  MISCELLANE...
980     NON-CAPITAL EQUIPMENT TEACHER EFFECTIVENESS FE...
1200    EMPLOYEE BENEFITS ITEMGG-TECHNOLOGY APPLICATIO...
dtype: object

In [7]:
get_numeric_data.fit_transform(sampling.head(5))

Unnamed: 0,FTE,Total
455,,1130.54
483,0.73,20705.91
978,,31116.93
980,,9581.54
1200,,1170.78


In [8]:
from sklearn.metrics.scorer import make_scorer

log_loss_scorer = make_scorer(multi_multi_log_loss)

# Train model pipeline

Now we'll train the final pipeline from the course that takes text and numeric data, does the necessary preprocessing, and trains the classifier.

In [9]:
from sklearn.feature_selection import chi2, SelectKBest

from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.preprocessing import Imputer
from sklearn.feature_extraction.text import HashingVectorizer

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import MaxAbsScaler
from sklearn.metrics import accuracy_score
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

In [10]:
%%time

# set a reasonable number of features before adding interactions
chi_k = 300

# create the pipeline object

# feature union to apply different types of transformers for text and numerical data
pl = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    ('imputer', Imputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    ('vectorizer', HashingVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                                                     alternate_sign=False, norm=None, binary=False,
                                                     ngram_range=(1, 2))),
                    ('dim_red', SelectKBest(chi2, chi_k))
                ]))
             ]
        )),
        ('int', SparseInteractions(degree=2)),
        ('scale', MaxAbsScaler()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

# fit the pipeline to our training data
pl.fit(X_train, y_train.values)
y_pred=pl.predict(X_test)

# print the score of our trained pipeline on our test set
print("Logloss score of trained pipeline: ", log_loss_scorer(pl, X_test, y_test.values))



Logloss score of trained pipeline:  1.9990740673226997
CPU times: user 33min 25s, sys: 2min 21s, total: 35min 47s
Wall time: 28min 32s
