In [8]:
import pandas as pd
import os
import re
import numpy as np

# Load the raw Excel file
file_name = 'manual-Department-of-Economic-Development-Jobs-Transport-and-Resources-Output-Performance-Measures-2017-18.xlsx'
file_path = os.path.abspath(os.path.join(os.getcwd(), '../raw_data/budgets/', file_name))
data = pd.read_excel(file_path, header=None, thousands=' ')

data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,department,"Department of Economic Development, Jobs, Tran...",,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,irrelevant,Major Outputs/Deliverables,Unit of measure,2017-18 target,2016-17 expected outcome,2016-17 target,2015-16 actual,2015-16 target,2014-15 actual,2014-15 target,...,2011-12 actual,2011-12 target,2010-11 actual,2010-11 target,2009-10 actual,2009-10 target,2008-09 actual,2008-09 target,2007-08 actual,Notes
3,irrelevant,Performance measures,,,,,,,,,...,,,,,,,,,,
4,category,"More productive, competitive, sustainable and ...",,,,,,,,,...,,,,,,,,,,
5,description,The objective creates the conditions for and s...,,,,,,,,,...,,,,,,,,,,
6,program,Agriculture,,,,,,,,,...,,,,,,,,,,
7,description,This output delivers effective and efficient r...,,,,,,,,,...,,,,,,,,,,
8,metric_type,Quantity,,,,,,,,,...,,,,,,,,,,
9,deliverable,"Animal pest, disease and residue control progr...",number,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,


In [9]:
df = data.copy()
department_label = df.iloc[0, 0]

df2 = df.drop([0, 2])
df3 = df2.dropna(how='all')
text_df = df3.iloc[:, [0, 1]]

text_df


Unnamed: 0,0,1
3,irrelevant,Performance measures
4,category,"More productive, competitive, sustainable and ..."
5,description,The objective creates the conditions for and s...
6,program,Agriculture
7,description,This output delivers effective and efficient r...
8,metric_type,Quantity
9,deliverable,"Animal pest, disease and residue control progr..."
10,deliverable,Applications for intellectual property protection
11,deliverable,Client interactions with land health services
12,irrelevant,The lower 2017-18 target reflects seasonal con...


In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

X = text_df.iloc[:, 1]
y = pd.get_dummies(text_df.iloc[:, 0], drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vec_basic = CountVectorizer(ngram_range=(1, 2))
classifier = OneVsRestClassifier(LogisticRegression())
pl = Pipeline([
    ('tokenize', vec_basic),
    ('classify', classifier)
])

pl.fit(X_train, y_train)
pl.score(X_test, y_test)


0.9375

In [72]:
y_pred = pl.predict(X_test)

# df_columns = np.insert(y.columns.values, 0, 'text')
# df_dict = {'text': X_test}

test_df = pd.DataFrame(y_pred, columns=y.columns.values)
test_df2 = X_test.to_frame(name='text')

test_df3 = test_df.set_index(test_df2.index)
test_df3['text'] = X_test
test_df3['row_number'] = test_df3.index.values

test_df3

Unnamed: 0,deliverable,description,irrelevant,metric_type,program,protram,text,row_number
91,1,0,0,0,0,0,"Undertake activities to detect, disrupt and di...",91
372,1,0,0,0,0,0,Total output cost,372
596,1,0,0,0,0,0,Total output cost,596
221,0,0,1,0,0,0,The 2016-17 expected outcome is higher than th...,221
653,1,0,0,0,0,0,Road safety programmed works completed within ...,653
574,1,0,0,0,0,0,Level access tram stop upgraded,574
34,1,0,0,0,0,0,Satisfaction rating of industry investors in a...,34
565,0,0,1,0,0,0,The 2016-17 expected outcome is lower than the...,565
212,0,0,1,0,0,0,This performance measure renames the 2016-17 p...,212
248,0,0,1,0,0,0,The higher 2017-18 target reflects additional ...,248


In [73]:
test_df4 = pd.melt(frame=test_df3, id_vars=['text', 'row_number'], value_vars=['deliverable', 'description', 'irrelevant', 'metric_type', 'program'], var_name='label')

test_df5 = test_df4[test_df4['value'] == 1].drop('value', axis=1)

test_df5

Unnamed: 0,text,row_number,label
0,"Undertake activities to detect, disrupt and di...",91,deliverable
1,Total output cost,372,deliverable
2,Total output cost,596,deliverable
4,Road safety programmed works completed within ...,653,deliverable
5,Level access tram stop upgraded,574,deliverable
6,Satisfaction rating of industry investors in a...,34,deliverable
10,Clients engaged with agriculture productivity ...,13,deliverable
11,Scheduled services delivered: metropolitan train,542,deliverable
12,Actual export sales generated as a result of p...,329,deliverable
13,Road vehicle and driver regulation: vehicle an...,619,deliverable


In [78]:
test_df5[test_df5['label'] == 'deliverable']

Unnamed: 0,text,row_number,label
0,"Undertake activities to detect, disrupt and di...",91,deliverable
1,Total output cost,372,deliverable
2,Total output cost,596,deliverable
4,Road safety programmed works completed within ...,653,deliverable
5,Level access tram stop upgraded,574,deliverable
6,Satisfaction rating of industry investors in a...,34,deliverable
10,Clients engaged with agriculture productivity ...,13,deliverable
11,Scheduled services delivered: metropolitan train,542,deliverable
12,Actual export sales generated as a result of p...,329,deliverable
13,Road vehicle and driver regulation: vehicle an...,619,deliverable
