In [1]:
import pandas as pd
import os
import re
import numpy as np

# Read the raw budget workbook.
# The path is resolved against the current working directory, and header=None
# keeps every spreadsheet row (including the title rows) in the frame as data.
file_name = 'manual-Department-of-Economic-Development-Jobs-Transport-and-Resources-Output-Performance-Measures-2017-18.xlsx'
budgets_dir = os.path.join(os.getcwd(), '../raw_data/budgets/')
file_path = os.path.abspath(os.path.join(budgets_dir, file_name))

# thousands=' ' tells pandas that a space is the thousands separator.
data = pd.read_excel(file_path, header=None, thousands=' ')

data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,department,"Department of Economic Development, Jobs, Tran...",,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,irrelevant,Major Outputs/Deliverables,Unit of measure,2017-18 target,2016-17 expected outcome,2016-17 target,2015-16 actual,2015-16 target,2014-15 actual,2014-15 target,...,2011-12 actual,2011-12 target,2010-11 actual,2010-11 target,2009-10 actual,2009-10 target,2008-09 actual,2008-09 target,2007-08 actual,Notes
3,irrelevant,Performance measures,,,,,,,,,...,,,,,,,,,,
4,category,"More productive, competitive, sustainable and ...",,,,,,,,,...,,,,,,,,,,
5,description,The objective creates the conditions for and s...,,,,,,,,,...,,,,,,,,,,
6,program,Agriculture,,,,,,,,,...,,,,,,,,,,
7,description,This output delivers effective and efficient r...,,,,,,,,,...,,,,,,,,,,
8,metric_type,Quantity,,,,,,,,,...,,,,,,,,,,
9,deliverable,"Animal pest, disease and residue control progr...",number,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,


In [244]:
# Build the labelled text table used for classification: one row per
# spreadsheet row, with its label, its text and two simple size features.

# Top-left cell of the raw sheet. This used to read from `df`, which no cell in
# the notebook defines, so a fresh kernel raised NameError; `data` is the frame
# loaded above.
# NOTE(review): in the displayed sheet, column 0 of row 0 holds the row label and
# column 1 holds the department name -- confirm which of the two is wanted here.
department_label = data.iloc[0, 0]

# Remove rows 0 and 2 (by index label), then any row that is entirely empty.
df2 = data.drop([0, 2]).dropna(how='all')

labels = df2.iloc[:, 0]

# Text is column 1 followed by a space and the string form of column 2.
# NOTE(review): a missing value in column 2 is rendered as the literal "nan"
# (e.g. "Agriculture nan"). That is kept as-is because it feeds the features
# below; consider `.fillna('')` if the suffix is not intended.
texts = df2.iloc[:, 1] + ' ' + df2.iloc[:, 2].map(str)

# Character length of each text.
lengths = texts.str.len()

# Space-separated token count: number of spaces plus one.
counts = texts.str.count(' ') + 1

text_df = (
    pd.DataFrame({'labels': labels, 'texts': texts})
    .assign(lengths=lengths)
    .assign(counts=counts)
)

text_df

Unnamed: 0,labels,texts,lengths,counts
3,irrelevant,Performance measures nan,24,3
4,category,"More productive, competitive, sustainable and ...",289,32
5,description,The objective creates the conditions for and s...,6460,911
6,program,Agriculture nan,15,2
7,description,This output delivers effective and efficient r...,2647,362
8,metric_type,Quantity nan,12,2
9,deliverable,"Animal pest, disease and residue control progr...",187,25
10,deliverable,Applications for intellectual property protect...,56,6
11,deliverable,Client interactions with land health services ...,52,7
12,irrelevant,The lower 2017-18 target reflects seasonal con...,108,16


In [246]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import Imputer

# Features are the text column plus its two size columns; the target is one
# indicator column per label.
# NOTE(review): drop_first=True removes the indicator column of the first label
# level, so rows carrying that label are encoded as all zeros -- confirm this is
# intended for a classification target.
X = text_df[['texts', 'lengths', 'counts']]
y = pd.get_dummies(text_df['labels'], drop_first=True)

# 80/20 split with a fixed seed, stratified on the indicator rows.
split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split


def _select_text(frame):
    """Return the free-text column that feeds the vectorizer."""
    return frame['texts']


def _select_numeric(frame):
    """Return the two numeric size columns."""
    return frame[['counts', 'lengths']]


get_text_data = FunctionTransformer(_select_text, validate=False)
get_numeric_data = FunctionTransformer(_select_numeric, validate=False)

# Unigram and bigram counts with English stop words removed.
vec_basic = CountVectorizer(ngram_range=(1, 2), stop_words='english')

# One balanced logistic regression per label column.
classifier = OneVsRestClassifier(estimator=LogisticRegression(class_weight='balanced'))

text_pipeline = Pipeline(steps=[
    ('selector', get_text_data),
    ('tokenize', vec_basic),
])

# Numeric branch only selects columns; no imputation step is applied.
numeric_pipeline = Pipeline(steps=[
    ('selector', get_numeric_data),
])

union_pipeline = FeatureUnion(transformer_list=[
    ('numeric', numeric_pipeline),
    ('text', text_pipeline),
])

pl = Pipeline(steps=[
    ('union', union_pipeline),
    ('classify', classifier),
])

pl.fit(X_train, y_train)
pl.score(X_test, y_test)

0.9765625

In [247]:
# Turn the predicted indicator matrix into a long table with one row per
# (test row, predicted label) pair.
y_pred = pl.predict(X_test)

# Label names come from the target frame rather than a hand-written list, so
# this cell keeps working if the set of labels in the data changes.
label_columns = list(y.columns)

test_df = pd.DataFrame(y_pred, columns=label_columns)

# Re-attach the original row index so each prediction can be traced back to its
# source row, then add the text and the row number as ordinary columns.
test_df3 = (
    test_df
    .set_index(X_test.index)
    .assign(texts=X_test['texts'], row_number=X_test.index.values)
)

test_df4 = pd.melt(
    frame=test_df3,
    id_vars=['texts', 'row_number'],
    value_vars=label_columns,
    var_name='labels',
)

# Keep only the labels that were actually predicted (indicator == 1).
test_df5 = test_df4[test_df4['value'] == 1].drop('value', axis=1)

test_df5

Unnamed: 0,texts,row_number,labels
0,Major periodic maintenance works completed aga...,561,deliverable
2,Total output cost $ million,138,deliverable
5,"Develop, implement and review overarching fish...",82,deliverable
9,Significant interactions with Victorian agri-f...,332,deliverable
11,Risk-based recreational vessel inspections und...,606,deliverable
12,Minimum number of Uniformed Fisheries Officers...,85,deliverable
13,Regulatory audits completed within agreed time...,68,deliverable
15,All facility safety audits conducted number,179,deliverable
16,Safer Cyclists and Pedestrians Fund allocated ...,479,deliverable
17,Enhance levels of community participation in a...,83,deliverable


In [249]:
# Show only the test rows whose predicted label is 'program'.
is_program = test_df5['labels'].eq('program')
test_df5[is_program]

Unnamed: 0,texts,row_number,labels
586,Road Asset Management nan,413,program
588,Trade nan,326,program
605,Jobs and Investment nan,262,program
