# Setup Notebook

In [1]:
import warnings
warnings.simplefilter('ignore')

# To load data
import zipfile
import pandas as pd

# To build model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer, Imputer, MaxAbsScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

# To evaluate model
from sklearn.metrics import log_loss

# To track time elapsed
import time

# To save results
import dill
import json

In [2]:
# Taken from https://github.com/drivendataorg/box-plots-sklearn/blob/master/src/features/SparseInteractions.py
# Use SparseInteractions with sparse matrices

from itertools import combinations

import numpy as np
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin

class SparseInteractions(BaseEstimator, TransformerMixin):
    """Append column-interaction (product) features to a sparse matrix.

    For every combination of 2..``degree`` distinct columns of the input, the
    element-wise product of those columns is computed and stacked to the right
    of the original columns.

    param int degree: highest number of columns multiplied together
    param str feature_name_separator: string placed between the original
        column names when naming an interaction column
    """

    def __init__(self, degree=2, feature_name_separator="_"):
        self.degree = degree
        self.feature_name_separator = feature_name_separator

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return X (as a sparse matrix) with the interaction columns appended.

        Also records the input column names in ``self.orig_col_names`` and,
        through ``_create_sparse_interactions``, the output column names in
        ``self.feature_names``.
        """
        # Read the column labels BEFORE converting: once X is a scipy CSC
        # matrix it no longer has a ``columns`` attribute, so checking
        # afterwards (as the code previously did) could never find the names.
        column_labels = getattr(X, "columns", None)

        if not sparse.isspmatrix_csc(X):
            X = sparse.csc_matrix(X)

        if column_labels is not None:
            # Cast to str so the names can be joined with the separator.
            self.orig_col_names = np.array([str(label) for label in column_labels])
        else:
            # No labels available: fall back to positional names "0", "1", ...
            self.orig_col_names = np.array([str(i) for i in range(X.shape[1])])

        return self._create_sparse_interactions(X)

    def get_feature_names(self):
        """Return the output column names set by the last ``transform`` call."""
        return self.feature_names

    def _create_sparse_interactions(self, X):
        """Build the interaction columns and stack them to the right of X."""
        out_mat = []
        self.feature_names = self.orig_col_names.tolist()

        for sub_degree in range(2, self.degree + 1):
            for col_ixs in combinations(range(X.shape[1]), sub_degree):
                # Name of the new column: the joined names of its factors
                name = self.feature_name_separator.join(self.orig_col_names[list(col_ixs)])
                self.feature_names.append(name)

                # Element-wise product of the selected columns
                out = X[:, col_ixs[0]]
                for j in col_ixs[1:]:
                    out = out.multiply(X[:, j])

                out_mat.append(out)

        return sparse.hstack([X] + out_mat)


# Load Data

In [3]:
def load_data():
    """
    Read the training and test tables from 'resources.zip' in the working directory.

    Return pandas dataframe data_train: training data (features + labels)
    Return pandas dataframe data_test: test data (only features)

    The first CSV column is used as the index of each dataframe.

    Required Libraries: zipfile, pandas
    """

    # Column types shared by both files: two numeric columns, the rest text
    feature_dtypes = {
        'Object_Description': str,
        'Program_Description': str,
        'SubFund_Description': str,
        'Job_Title_Description': str,
        'Facility_or_Department': str,
        'Sub_Object_Description': str,
        'Location_Description': str,
        'FTE': float,
        'Function_Description': str,
        'Position_Extra': str,
        'Text_4': str,
        'Total': float,
        'Text_2': str,
        'Text_3': str,
        'Fund_Description': str,
        'Text_1': str,
    }

    # Label columns are read as categoricals for the training file
    label_dtypes = {
        'Function': 'category',
        'Object_Type': 'category',
        'Operating_Status': 'category',
        'Position_Type': 'category',
        'Pre_K': 'category',
        'Reporting': 'category',
        'Sharing': 'category',
        'Student_Type': 'category',
        'Use': 'category',
    }

    # Context managers close the archive and its member streams even if
    # parsing fails (the handles were previously left open).
    with zipfile.ZipFile('resources.zip', 'r') as resource_archive:
        # Load testing data
        with resource_archive.open('TestData.csv') as test_file:
            data_test = pd.read_csv(test_file, dtype=feature_dtypes, index_col=0)

        # Load training data
        with resource_archive.open('TrainingData.csv') as train_file:
            data_train = pd.read_csv(train_file,
                                     dtype={**feature_dtypes, **label_dtypes},
                                     index_col=0)

    return data_train, data_test

In [4]:
# Read both tables from the archive and report their (rows, columns) shapes
data_train, data_test = load_data()
print('data_train shape:', data_train.shape)
print('data_test shape:', data_test.shape)

data_train shape: (400277, 25)
data_test shape: (50064, 16)


# Prepare Data for Classification

In [5]:
def prep_for_classification(data_train, data_test, validation_size=50064, random_state=93):
    """
    Split training data into training and validation sets
    Split data into features and labels (labels are one-hot encoded)

    Return pandas dataframe X_train: training data (features)
    Return pandas dataframe X_val: validation data (features)
    Return pandas dataframe X_test: test data (only features)
    Return pandas dataframe y_train: training data (one-hot labels)
    Return pandas dataframe y_val: validation data (one-hot labels)

    param pandas dataframe data_train: training data (features + labels)
    param pandas dataframe data_test: test data (features)
    param numerical validation_size: size of validation set, forwarded to
        train_test_split as test_size
    param random_state: seed forwarded to train_test_split so the split is
        reproducible; the default keeps the split used so far

    Required Libraries: pandas, sklearn.model_selection
    """

    # List features and labels
    features = [
        'FTE',
        'Facility_or_Department',
        'Function_Description',
        'Fund_Description',
        'Job_Title_Description',
        'Location_Description',
        'Object_Description',
        'Position_Extra',
        'Program_Description',
        'SubFund_Description',
        'Sub_Object_Description',
        'Text_1',
        'Text_2',
        'Text_3',
        'Text_4',
        'Total']

    labels = [
        'Function',
        'Object_Type',
        'Operating_Status',
        'Position_Type',
        'Pre_K',
        'Reporting',
        'Sharing',
        'Student_Type',
        'Use']

    # Separate features (X) and labels (y)
    X = data_train[features]
    X_test = data_test[features]
    # One indicator column per (label, class), named "<label>__<class>"
    y = pd.get_dummies(data_train[labels], prefix_sep='__')

    # Split into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=validation_size, random_state=random_state)

    return X_train, X_val, X_test, y_train, y_val

In [6]:
# Build the train/validation/test feature frames and the one-hot label frames
X_train, X_val, X_test, y_train, y_val = prep_for_classification(data_train, data_test)

# Build Model

In [87]:
def build_model():
    """
    Return sklearn pipeline clf_pipeline: an unfitted pipeline that
        1. selects the numeric columns ('FTE', 'Total') and fills their
           missing values with Imputer's default settings,
        2. joins all remaining columns into one string per row, hashes its
           1- and 2-gram tokens and keeps the 300 features ranked best by chi2,
        3. appends pairwise interaction terms (SparseInteractions, degree 2),
        4. scales the features with MaxAbsScaler, and
        5. fits a one-vs-rest LogisticRegression classifier.

    Required Libraries:
    pandas
    sklearn.preprocessing
    sklearn.pipeline
    sklearn.feature_extraction.text
    sklearn.feature_selection
    sklearn.multiclass
    sklearn.linear_model
    """
    numeric_columns = ['FTE', 'Total']

    # Numeric feature preprocessing
    select_numeric_features = FunctionTransformer(lambda x: x[numeric_columns], validate=False)
    # NOTE(review): `Imputer` is the legacy scikit-learn imputer; newer
    # releases provide `sklearn.impute.SimpleImputer` instead. Switching
    # requires changing the notebook's import cell, so it is left as-is here.
    numeric_preprocess_pipeline = Pipeline([
        ('selector', select_numeric_features),
        ('handle_missing_values', Imputer())
    ])

    # Text feature preprocessing
    def combine_text_columns(df_train):
        """Join every non-numeric column of a row into one space-separated string."""
        text_only = df_train.drop(columns=numeric_columns).fillna("")
        return text_only.apply(" ".join, axis=1)

    prepare_text_features = FunctionTransformer(combine_text_columns, validate=False)
    text_preprocess_pipeline = Pipeline([
        ('combine_text', prepare_text_features),
        # The token pattern only matches alphanumeric runs that are followed
        # by whitespace (look-ahead), so a token at the very end of the
        # combined string is not matched.
        # NOTE(review): `non_negative` is a legacy HashingVectorizer argument;
        # newer scikit-learn uses `alternate_sign=False`. Not changed here
        # because it may alter the hashed values - confirm before migrating.
        ('vectorize', HashingVectorizer(token_pattern='[A-Za-z0-9]+(?=\\s+)',
                                        non_negative=True,
                                        norm=None,
                                        binary=False,
                                        ngram_range=(1, 2))),
        # `k` is passed by keyword: newer scikit-learn releases no longer
        # accept it positionally, and the keyword form works on old ones too.
        ('dim_red', SelectKBest(chi2, k=300))
    ])

    # Combine numeric and text feature preprocessing
    preprocess_pipeline = FeatureUnion(transformer_list=[
        ('numeric_preprocess', numeric_preprocess_pipeline),
        ('text_preprocess', text_preprocess_pipeline)
    ])

    # Build model
    clf_pipeline = Pipeline([
        ('preprocess', preprocess_pipeline),
        ('feature_interactions', SparseInteractions(degree=2)),
        ('scale', MaxAbsScaler()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

    return clf_pipeline

In [88]:
# Unfitted pipeline; the training cell below refits this same object once per label
datacamp_model = build_model()

# Train model on each label and generate predictions on test set

In [89]:
# Half-open column ranges [start, end) of each label's one-hot block in
# y_train / y_val (and in the prediction matrix built below).
label_class_counts = {
    'Function': [0, 37],
    'Object_Type': [37, 48],
    'Operating_Status': [48, 51],
    'Position_Type': [51, 76],
    'Pre_K': [76, 79],
    'Reporting': [79, 82],
    'Sharing': [82, 87],
    'Student_Type': [87, 96],
    'Use': [96, 104]
}

# Fail fast: training takes a long time, so verify up front that the
# hard-coded ranges really line up with the one-hot label columns.
n_label_columns = y_train.shape[1]
n_covered_columns = 0
for label, (start_idx, end_idx) in label_class_counts.items():
    block_columns = y_train.columns[start_idx:end_idx]
    if len(block_columns) != end_idx - start_idx or not all(
            str(column).startswith(label + '__') for column in block_columns):
        raise ValueError(
            "Column range %s does not match the one-hot columns of label %r"
            % ([start_idx, end_idx], label))
    n_covered_columns += end_idx - start_idx
if n_covered_columns != n_label_columns:
    raise ValueError(
        "Label ranges cover %d columns but y_train has %d"
        % (n_covered_columns, n_label_columns))

scores = {}
# One probability column per one-hot label column
predictions = np.zeros(shape=(X_test.shape[0], n_label_columns))

for label, (start_idx, end_idx) in label_class_counts.items():
    start_time = time.time()
    print("Completing model for", label, "...")
    # Get values for specific label
    y_train_label = y_train.values[:, start_idx:end_idx]
    y_val_label = y_val.values[:, start_idx:end_idx]

    # Train model (the same pipeline object is refit for every label)
    datacamp_model.fit(X_train, y_train_label)
    print("Model for", label, "trained...", "time elapsed", time.time() - start_time)

    # Get logloss score of model on the validation set
    validation_predictions = datacamp_model.predict_proba(X_val)
    scores[label] = log_loss(y_val_label, validation_predictions)

    # Generate predictions on test set
    predictions[:, start_idx:end_idx] = datacamp_model.predict_proba(X_test)
    # Elapsed time is measured from the start of this label, so it includes training
    print("Predictions for", label, "generated...", "time elapsed", time.time() - start_time)

    # Save model fitted for this label before the next iteration refits it
    with open('datacamp_' + label + '.pkl', 'wb') as fid:
        dill.dump(datacamp_model, fid)

Completing model for Function ...
Model for Function trained... time elapsed 690.8775169849396
Predictions for Function generated... time elapsed 755.75226521492
Completing model for Object_Type ...
Model for Object_Type trained... time elapsed 928.5800788402557
Predictions for Object_Type generated... time elapsed 999.149539232254
Completing model for Operating_Status ...
Model for Operating_Status trained... time elapsed 222.13770008087158
Predictions for Operating_Status generated... time elapsed 294.9519290924072
Completing model for Position_Type ...
Model for Position_Type trained... time elapsed 606.0556590557098
Predictions for Position_Type generated... time elapsed 687.3139142990112
Completing model for Pre_K ...
Model for Pre_K trained... time elapsed 561.3521637916565
Predictions for Pre_K generated... time elapsed 625.9415218830109
Completing model for Reporting ...
Model for Reporting trained... time elapsed 366.9106650352478
Predictions for Reporting generated... time el

# Save scores and predictions

In [90]:
# Save predictions for test set: rows follow X_test's index, columns follow
# the one-hot label columns of y_train
submission = pd.DataFrame(predictions, index=X_test.index, columns=y_train.columns)
submission.to_csv('datacamp_model.csv')

In [91]:
# Validation log loss per label, as filled in by the training loop
scores

{'Function': 0.5444172744127479,
 'Object_Type': 0.12562586497405154,
 'Operating_Status': 0.06815703676624803,
 'Position_Type': 0.32486287814541626,
 'Pre_K': 0.0456704909623151,
 'Reporting': 0.11312726754702154,
 'Sharing': 0.16476345311088206,
 'Student_Type': 0.1604987108904824,
 'Use': 0.2176736817876597}

In [92]:
# Unweighted average of the per-label validation log losses
np.mean(list(scores.values()))

0.19608851762186935

In [93]:
# Persist the per-label validation scores as a JSON object
with open('datacamp_score.json', 'w') as score_file:
    json.dump(scores, score_file)

In [None]:
# Same label -> column-range mapping as in the training cell; only the keys
# are used below, to rebuild the pickle file names.
label_class_counts = {
    'Function': [0, 37], 
    'Object_Type': [37, 48], 
    'Operating_Status': [48, 51], 
    'Position_Type': [51, 76], 
    'Pre_K': [76, 79], 
    'Reporting': [79, 82], 
    'Sharing': [82, 87], 
    'Student_Type': [87, 96], 
    'Use': [96, 104]
}

# NOTE(review): `files` is never imported or defined anywhere in this
# notebook, so this cell raises NameError on a fresh kernel. Presumably it is
# meant to be `google.colab.files` - confirm and add the import where needed.
for label, indices in label_class_counts.items():
    files.download('datacamp_' + label + '.pkl')