In [None]:
import pandas as pd

In [None]:
#%pwd

In [None]:
# Load the training data; the first CSV column is used as the row index.
# Forward slashes are used so the relative path resolves on Windows as well
# as on POSIX systems (the backslash form only works on Windows).
df = pd.read_csv('./Data/trainingdata_schoolbudget.csv', index_col=0)

In [None]:
# Print a concise summary of the training frame (columns and dtypes).
df.info()

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Descriptive statistics of the training frame (pandas defaults).
df.describe()

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Histogram of the FTE column; rows with a missing FTE are left out.
fte_values = df['FTE'].dropna()
plt.hist(fte_values)
plt.xlabel('% of full-time')
plt.ylabel('num employees')
plt.title('Distribution of %full-time\n employee works')
plt.show()

In [None]:
# Columns that are one-hot encoded and used as prediction targets.
LABELS = [
    'Function', 'Use', 'Sharing',
    'Reporting', 'Student_Type', 'Position_Type',
    'Object_Type', 'Pre_K', 'Operating_Status',
]

# Columns treated as numeric features.
NUMERIC_COLUMNS = ['FTE', 'Total']

In [None]:
# Inspect the dtypes of the label columns as loaded from the CSV.
df[LABELS].dtypes

In [None]:
def categorize_label(x):
    """Return `x` converted to the pandas 'category' dtype."""
    return x.astype('category')

In [None]:
# Convert every label column to the 'category' dtype, one column at a time.
df[LABELS] = df[LABELS].apply(categorize_label, axis=0)

In [None]:
# Re-check the label dtypes after the categorical conversion.
df[LABELS].dtypes

In [None]:
# Number of distinct values in each label column, shown as a bar chart.
num_unique_labels = df[LABELS].apply(pd.Series.nunique, axis=0)
ax = num_unique_labels.plot(kind='bar')
ax.set_xlabel('label')
ax.set_ylabel('num unique labels')
plt.show()

In [None]:
import numpy as np

def compute_log_loss(predicted, actual, eps=1e-14):
    """Compute the mean logarithmic loss between predictions and labels.

    Scalars, Python lists and NumPy arrays are all accepted; inputs are
    converted to float arrays first, so plain lists no longer fail on
    ``1 - actual``.

    :param predicted: the predicted probabilities as floats between 0 and 1
    :param actual: the actual binary labels (0 or 1)
    :param eps: predictions are clipped to [eps, 1 - eps] because log(0) is
                -inf
    :return: the mean log loss over all elements
    """
    predicted = np.clip(np.asarray(predicted, dtype=float), eps, 1 - eps)
    actual = np.asarray(actual, dtype=float)
    per_sample = (actual * np.log(predicted)
                  + (1 - actual) * np.log(1 - predicted))
    return -1 * np.mean(per_sample)

In [None]:
# (message template, predicted probability, actual label) for each demo case.
demo_cases = [
    ("Log Loss, wrong and confident: {}", 0.9, 0),
    ("Log Loss, correct and confident: {}", 0.9, 1),
    ("Log Loss, wrong and not confident: {}", 0.5, 1),
    ("Log Loss, correct and not confident: {}", 0.5, 0),
]
for template, probability, label in demo_cases:
    print(template.format(compute_log_loss(probability, label)))

In [None]:
# Project-local helper package; the splitter is accessed as an attribute of
# the package itself.
import myutils
# Show the splitter's documentation before using it.
help(myutils.multilabel_train_test_split)

In [None]:
# Numeric-only feature matrix; missing values are replaced with the
# sentinel -1000.
numeric_data = df[NUMERIC_COLUMNS].fillna(-1000)
# One indicator column per (label column, category) pair.
label_dummies = pd.get_dummies(df[LABELS])

# NOTE(review): presumably `size=0.2` is the held-out fraction and `seed`
# makes the split repeatable -- confirm against the myutils documentation.
X_train, X_test, y_train, y_test = myutils.multilabel_train_test_split(numeric_data, label_dummies, size=0.2, seed=123)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Baseline model: one logistic regression per indicator column, trained on
# the numeric features only.
base_estimator = LogisticRegression(solver='lbfgs')
clf = OneVsRestClassifier(base_estimator)
clf.fit(X_train, y_train)

baseline_accuracy = clf.score(X_test, y_test)
print("Accuracy: {}".format(baseline_accuracy))

In [None]:
# Load the holdout set; the first CSV column is used as the row index.
holdout = pd.read_csv('./Data/HoldoutData_schoolbudget.csv', index_col=0)

In [None]:
# Keep only the numeric columns and fill gaps with the same -1000 sentinel
# that was applied to the training features.
holdout = holdout[NUMERIC_COLUMNS].fillna(-1000)

In [None]:
# Predicted probabilities from the baseline classifier for the holdout rows.
predictions = clf.predict_proba(holdout)

In [None]:
# Name the probability columns after the indicator columns of the labels,
# using '__' between the label name and the category.
cols = pd.get_dummies(df[LABELS], prefix_sep='__').columns
predictions = pd.DataFrame(
    data=predictions,
    columns=cols,
    index=holdout.index,
)

In [None]:
# Preview the first rows of the labelled prediction frame.
predictions.head()

In [None]:
#df.columns - LABELS
#df.Text_1
#LABELS
#set(df.columns) - set(LABELS + NUMERIC_COLUMNS)

In [None]:
def combine_text_columns(data_frame, to_drop=LABELS+NUMERIC_COLUMNS):
    """Join all remaining columns of each row into one space-separated string.

    :param data_frame: frame whose non-dropped columns hold the text
    :param to_drop: column names to exclude; names that are not present in
                    `data_frame` are ignored
    :return: a Series with one combined string per row
    """
    # Only drop columns that actually exist, so frames without label columns
    # (e.g. a holdout set) can be passed as well.
    to_drop = set(to_drop) & set(data_frame.columns.tolist())
    text_data = data_frame.drop(to_drop, axis=1)
    # Missing values become empty strings; everything else is cast to str so
    # a stray non-text column cannot make str.join raise a TypeError.
    text_data = text_data.fillna('').astype(str)
    return text_data.apply(lambda x: " ".join(x), axis=1)

In [None]:
# One combined text string per training row (default columns dropped).
text_data = combine_text_columns(df)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Both patterns end in the lookahead (?=\s+), so a token only matches when
# it is followed by whitespace; a token at the very end of a string is not
# matched.
BASIC_PATTERN = '\\S+(?=\\s+)'
ALPHANUMERIC_PATTERN = '[A-Za-z0-9]+(?=\\s+)'

vec_basic = CountVectorizer(token_pattern=BASIC_PATTERN)
vec_alphanumeric = CountVectorizer(token_pattern=ALPHANUMERIC_PATTERN)

# Only the learned vocabulary is needed, so fit() is enough. Its size is
# read from `vocabulary_`, which exists in every scikit-learn release
# (get_feature_names() was removed in 1.2).
vec_basic.fit(text_data)
print("There are {} tokens in the dataset".format(len(vec_basic.vocabulary_)))

vec_alphanumeric.fit(text_data)
print("There are {} tokens in the dataset".format(len(vec_alphanumeric.vocabulary_)))

In [None]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import Imputer
from sklearn import impute

# Indicator-encoded targets and the list of every non-label column.
dummy_labels = pd.get_dummies(df[LABELS])
NON_LABELS = [column for column in df.columns if column not in LABELS]

# Same split settings as the numeric-only baseline, but with every
# non-label column as a feature.
features = df[NON_LABELS]
X_train, X_test, y_train, y_test = myutils.multilabel_train_test_split(
    features, dummy_labels, 0.2, seed=123)

In [None]:
# Column selectors wrapped as transformers so they can sit inside a Pipeline.
get_text_data = FunctionTransformer(combine_text_columns, validate=False)
get_numeric_data = FunctionTransformer(lambda x: x[NUMERIC_COLUMNS], validate=False)

# Numeric branch: select the numeric columns, then impute missing values.
numeric_branch = Pipeline([
    ('selector', get_numeric_data),
    ('imputer', impute.SimpleImputer()),
])

# Text branch: merge the text columns, then count tokens.
text_branch = Pipeline([
    ('selector', get_text_data),
    ('vectorizer', CountVectorizer()),
])

feature_union = FeatureUnion(transformer_list=[
    ('numeric_features', numeric_branch),
    ('text_features', text_branch),
])

classifier = OneVsRestClassifier(
    LogisticRegression(solver='lbfgs', multi_class='multinomial', C=0.01, max_iter=1000))

pl = Pipeline([
    ('union', feature_union),
    ('clf', classifier),
])

pl.fit(X_train, y_train)
accuracy = pl.score(X_test, y_test)

print("\nAccuracy: ", accuracy)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Same feature union as the logistic-regression pipeline, with a random
# forest of 30 trees as the per-label classifier.
# random_state is fixed so the reported accuracy is repeatable across runs
# (the train/test split is already seeded).
pl = Pipeline([
    ('union', FeatureUnion(
    transformer_list=[
        ('numeric_features', Pipeline([
            ('selector', get_numeric_data),
            ('imputer', impute.SimpleImputer())
        ])),
        ('text_features', Pipeline([
            ('selector', get_text_data),
            ('vectorizer', CountVectorizer())
        ]))
    ])),
    ('clf', OneVsRestClassifier(RandomForestClassifier(n_estimators=30, random_state=123)))
])

pl.fit(X_train, y_train)
accuracy = pl.score(X_test, y_test)

print("\nAccuracy: ", accuracy)

In [None]:
# Random-forest variant with 15 trees, to compare against larger forests.
# random_state is fixed so the reported accuracy is repeatable across runs
# (the train/test split is already seeded).
pl = Pipeline([
    ('union', FeatureUnion(
    transformer_list=[
        ('numeric_features', Pipeline([
            ('selector', get_numeric_data),
            ('imputer', impute.SimpleImputer())
        ])),
        ('text_features', Pipeline([
            ('selector', get_text_data),
            ('vectorizer', CountVectorizer())
        ]))
    ])),
    ('clf', OneVsRestClassifier(RandomForestClassifier(n_estimators=15, random_state=123)))
])

pl.fit(X_train, y_train)
accuracy = pl.score(X_test, y_test)

print("\nAccuracy: ", accuracy)

In [None]:
# Random forest (35 trees) on alphanumeric tokens with unigrams and bigrams.
# random_state is fixed so the reported accuracy is repeatable across runs
# (the train/test split is already seeded).
pl = Pipeline([
    ('union', FeatureUnion(
    transformer_list=[
        ('numeric_features', Pipeline([
            ('selector', get_numeric_data),
            ('imputer', impute.SimpleImputer())
        ])),
        ('text_features', Pipeline([
            ('selector', get_text_data),
            ('vectorizer', CountVectorizer(token_pattern=ALPHANUMERIC_PATTERN, ngram_range=(1, 2)))
        ]))
    ])),
    ('clf', OneVsRestClassifier(RandomForestClassifier(n_estimators=35, random_state=123)))
])

pl.fit(X_train, y_train)
accuracy = pl.score(X_test, y_test)

print("\nAccuracy: ", accuracy)

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
# Random forest (35 trees) on hashed unigram/bigram features instead of an
# explicit vocabulary; norm=None keeps the raw hashed values unnormalised.
# random_state is fixed so the reported accuracy is repeatable across runs
# (the train/test split is already seeded).
pl = Pipeline([
    ('union', FeatureUnion(
    transformer_list=[
        ('numeric_features', Pipeline([
            ('selector', get_numeric_data),
            ('imputer', impute.SimpleImputer())
        ])),
        ('text_features', Pipeline([
            ('selector', get_text_data),
            ('vectorizer', HashingVectorizer(token_pattern=ALPHANUMERIC_PATTERN, norm=None, ngram_range=(1, 2)))
        ]))
    ])),
    ('clf', OneVsRestClassifier(RandomForestClassifier(n_estimators=35, random_state=123)))
])

pl.fit(X_train, y_train)
accuracy = pl.score(X_test, y_test)

print("\nAccuracy: ", accuracy)

In [None]:
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.preprocessing import MaxAbsScaler

# Number of text features kept by the chi-squared selection step.
chi_k = 300

# Text branch: count unigrams/bigrams, then keep the chi_k best features by
# chi2 score. All features are scaled by their maximum absolute value before
# the per-label logistic regression.
pl = Pipeline([
    ('union', FeatureUnion(
    transformer_list=[
        ('numeric_features', Pipeline([
            ('selector', get_numeric_data),
            ('imputer', impute.SimpleImputer())
        ])),
        ('text_features', Pipeline([
            ('selector', get_text_data),
            ('vectorizer', CountVectorizer(token_pattern=ALPHANUMERIC_PATTERN, ngram_range=(1, 2))),
            # `k` is keyword-only in scikit-learn >= 1.0; the keyword form
            # works on older releases too.
            ('dim_red', SelectKBest(chi2, k=chi_k))
        ]))
    ])),
    ('scale', MaxAbsScaler()),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='lbfgs', dual=False, multi_class='multinomial')))
])

pl.fit(X_train, y_train)
accuracy = pl.score(X_test, y_test)

print("\nAccuracy: ", accuracy)

In [None]:
import os
import sys
# Add the local `myutils` directory (relative to the current working
# directory) to the module search path before importing from it.
src_dir = os.path.join(os.getcwd(), 'myutils')
sys.path.append(src_dir)
# NOTE(review): `myutils` is already importable as a package earlier in the
# notebook, so the sys.path entry may be unnecessary -- confirm.
from myutils.sparseinteractions import SparseInteractions

In [None]:
%%time
# Number of text features kept by the chi-squared selection step.
chi_k = 300

# As the chi2 + logistic-regression pipeline, with a SparseInteractions
# step (degree=2) inserted between the feature union and the scaler.
pl = Pipeline([
    ('union', FeatureUnion(
    transformer_list=[
        ('numeric_features', Pipeline([
            ('selector', get_numeric_data),
            ('imputer', impute.SimpleImputer())
        ])),
        ('text_features', Pipeline([
            ('selector', get_text_data),
            ('vectorizer', CountVectorizer(token_pattern=ALPHANUMERIC_PATTERN, ngram_range=(1, 2))),
            # `k` is keyword-only in scikit-learn >= 1.0; the keyword form
            # works on older releases too.
            ('dim_red', SelectKBest(chi2, k=chi_k))
        ]))
    ])),
    ('int', SparseInteractions(degree=2)),
    ('scale', MaxAbsScaler()),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='lbfgs', dual=False, multi_class='multinomial')))
])

pl.fit(X_train, y_train)
accuracy = pl.score(X_test, y_test)

print("\nAccuracy: ", accuracy)

In [None]:
%%time
from sklearn.feature_extraction.text import HashingVectorizer

# Number of text features kept by the chi-squared selection step.
chi_k = 300

# Hashed unigram/bigram features -> chi2 selection -> interaction terms ->
# scaling -> per-label logistic regression.
pl = Pipeline([
    ('union', FeatureUnion(
    transformer_list=[
        ('numeric_features', Pipeline([
            ('selector', get_numeric_data),
            ('imputer', impute.SimpleImputer())
        ])),
        ('text_features', Pipeline([
            ('selector', get_text_data),
            # chi2 requires non-negative features. `non_negative=True` was
            # removed from HashingVectorizer; `alternate_sign=False` is the
            # replacement that keeps the hashed counts non-negative.
            ('vectorizer', HashingVectorizer(token_pattern=ALPHANUMERIC_PATTERN, alternate_sign=False, norm=None, ngram_range=(1, 2))),
            # `k` is keyword-only in scikit-learn >= 1.0; the keyword form
            # works on older releases too.
            ('dim_red', SelectKBest(chi2, k=chi_k))
        ]))
    ])),
    ('int', SparseInteractions(degree=2)),
    ('scale', MaxAbsScaler()),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='lbfgs', max_iter=500, multi_class='multinomial')))
])

pl.fit(X_train, y_train)
accuracy = pl.score(X_test, y_test)

print("\nAccuracy: ", accuracy)