In [1]:
# Set Working Directory
import os
os.chdir('..')

In [2]:
# Load Requirements
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, Imputer, MaxAbsScaler, MinMaxScaler
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from tpot import TPOTClassifier
import dill

from helper import *

In [3]:
# Load testing data
data_test = pd.read_csv('data/TestData.csv', index_col=0)

# Load training data
data = pd.read_csv('data/TrainingData.csv', index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# List features and labels

features = ['FTE', 
            'Facility_or_Department', 
            'Function_Description', 
            'Fund_Description', 
            'Job_Title_Description', 
            'Location_Description', 
            'Object_Description', 
            'Position_Extra', 
            'Program_Description', 
            'SubFund_Description', 
            'Sub_Object_Description', 
            'Text_1', 
            'Text_2', 
            'Text_3', 
            'Text_4', 
            'Total']

labels = ['Function', 
          'Object_Type', 
          'Operating_Status', 
          'Position_Type', 
          'Pre_K', 
          'Reporting', 
          'Sharing', 
          'Student_Type', 
          'Use']

In [5]:
# Create the token pattern: TOKENS_ALPHANUMERIC
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)' 

# Setup data for classification

In [6]:
# Convert labels into category data type
data[labels] = data[labels].astype('category')

In [7]:
# Extract and one-hot encode labels
encoded_labels = pd.get_dummies(data[labels], prefix_sep='__')

In [8]:
# Extract features
data_features = data[features]

I chose 50064 records as the size of the development set to match the size of the test set

In [9]:
# Numeric feature preprocessing
select_numeric_features = FunctionTransformer(lambda x: x[['FTE', 'Total']], validate=False)
numeric_preprocess_pipeline = Pipeline([
    ('selector', select_numeric_features),
    ('handle_missing_values', Imputer())
])

# Text feature preprocessing
def combine_text_columns(df_train):
    return df_train.drop(columns=['FTE', 'Total']).fillna("").apply(lambda x: " ".join(x), axis=1)
    
prepare_text_features = FunctionTransformer(lambda x: combine_text_columns(x), validate=False)
text_preprocess_pipeline = Pipeline([
    ('combine_text', prepare_text_features),
    ('vectorize', HashingVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                                    non_negative=True,
                                    norm=None,
                                    binary=False, 
                                    ngram_range=(1, 2))),
    ('dim_red', SelectKBest(chi2, 100))
])

# Combine numeric and text feature preprocessing
preprocess_pipeline = FeatureUnion(transformer_list = [
    ('numeric_preprocess', numeric_preprocess_pipeline),
    ('text_preprocess', text_preprocess_pipeline)
])

In [10]:
# Train Preprocessing Model
preprocess_pipeline.fit(data_features, encoded_labels)



FeatureUnion(n_jobs=1,
       transformer_list=[('numeric_preprocess', Pipeline(memory=None,
     steps=[('selector', FunctionTransformer(accept_sparse=False,
          func=<function <lambda> at 0x1a2a16a510>, inv_kw_args=None,
          inverse_func=None, kw_args=None, pass_y='deprecated',
          validate=False)), ('handle_...', tokenizer=None)), ('dim_red', SelectKBest(k=100, score_func=<function chi2 at 0x1a1fbc9950>))]))],
       transformer_weights=None)

In [11]:
encoded_data_features = preprocess_pipeline.transform(data_features)



In [12]:
# Split into training and development sets
X_train, X_dev, y_train, y_dev = train_test_split(encoded_data_features, encoded_labels, test_size=50064, random_state=93)

# Train Classifier

In [13]:
y_dev.columns

Index(['Function__Aides Compensation',
       'Function__Career & Academic Counseling', 'Function__Communications',
       'Function__Curriculum Development',
       'Function__Data Processing & Information Services',
       'Function__Development & Fundraising', 'Function__Enrichment',
       'Function__Extended Time & Tutoring',
       'Function__Facilities & Maintenance', 'Function__Facilities Planning',
       ...
       'Student_Type__Special Education', 'Student_Type__Unspecified',
       'Use__Business Services', 'Use__ISPD', 'Use__Instruction',
       'Use__Leadership', 'Use__NO_LABEL', 'Use__O&M',
       'Use__Pupil Services & Enrichment', 'Use__Untracked Budget Set-Aside'],
      dtype='object', length=104)

In [14]:
for label in y_dev.columns:
    y_train_single = y_train[label]

In [16]:
label = 'Function__Aides Compensation'

y_train_single = y_train[label].values
y_dev_single = y_dev[label].values
clf = LogisticRegression()
# clf = TPOTClassifier(scoring='neg_log_loss', random_state=82, config_dict='TPOT sparse', early_stop=10, verbosity=3, periodic_checkpoint_folder='models/automl_pipelines')
clf.fit(X_train, y_train_single)
clf.score(X_dev, y_dev_single)
# clf.export('models/automl_pipelines/final_'+label+'.py')

0.9600311601150527

In [17]:
print('done')

done
