In [13]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizerFast
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW, TextClassificationPipeline
le = LabelEncoder()
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=18)
model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [2]:
class DatasetBuilding(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [6]:
# config params
data_path = "DATA/Cost_Category_TrainingData.xlsx"
data_sheet = "TrainingData0421"

In [25]:
raw_data = pd.read_excel(data_path, sheet_name=data_sheet)[['Category Description','GWS Insights Benchmarking Categories']].rename(columns={'Category Description':'input','GWS Insights Benchmarking Categories':'cost_category'}).drop_duplicates().reset_index().drop('index', axis=1).dropna()
raw_data.head()

  warn(msg)


Unnamed: 0,input,cost_category
0,Salaries-Cleaning,Cleaning
1,Daily Cleaning Servi,Cleaning
2,Day Porter & Matron,Cleaning
3,Window Cleaning,Cleaning
4,Carpet & Door Mat Clean,Cleaning


In [26]:
raw_data.shape

(1082, 2)

In [27]:
len(raw_data.cost_category.unique())

17

In [28]:
raw_data['label'] = le.fit_transform(raw_data['cost_category'])
raw_data.head()

Unnamed: 0,input,cost_category,label
0,Salaries-Cleaning,Cleaning,0
1,Daily Cleaning Servi,Cleaning,0
2,Day Porter & Matron,Cleaning,0
3,Window Cleaning,Cleaning,0
4,Carpet & Door Mat Clean,Cleaning,0


In [29]:
required_data = raw_data[['input','label']]

In [30]:
train_texts, test_texts, train_labels, test_labels = train_test_split(list(required_data['input']), list(required_data['label']), test_size=.2, shuffle=True)

In [31]:
#train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)
#test_texts = list(required_data['input'])
#test_labels = list(required_data['label'])

In [32]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
#val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [33]:
class BuildDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = BuildDataset(train_encodings, train_labels)
test_dataset = BuildDataset(test_encodings, test_labels)

In [34]:
model.train()
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
optim = AdamW(model.parameters(), lr=5e-5)
for epoch in range(5):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()

model.eval()



DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [35]:
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)



In [36]:
import pickle
with open("embedding_model_5_3.pkl","wb") as f:
    pickle.dump(model, f)

In [37]:
import pickle
with open("label_enc_5_3.pkl","wb") as f:
    pickle.dump(le, f)

In [38]:
predicted_labels = []
for single_text in test_texts:
    predicted_labels.append(int(sorted(pipe(single_text)[0], key=lambda item: item["score"], reverse=True)[0]['label'].split('_')[1]))

In [39]:
res_dict = {'input':test_texts, 'predicted':predicted_labels, 'actual':test_labels}
evaluation_dataframe = pd.DataFrame(res_dict)
evaluation_dataframe['match'] = evaluation_dataframe['predicted']==evaluation_dataframe['actual']

In [40]:
correct_count = sum(evaluation_dataframe['match'].values)
actual_count = len(evaluation_dataframe)
accuracy = correct_count/actual_count
accuracy

0.7004608294930875

In [17]:
# Training 1 = 0.6444444444444445
# Training 2 = 0.7
# Training 3 = 0.6333333333333333
# Training 4 = 0.6888888888888889
# Training 5 = 0.6444444444444445

In [38]:
#(0.6444444444444445+0.7+0.6333333333333333+0.6888888888888889+0.6444444444444445)/5

In [41]:
evaluation_dataframe.to_excel('test_data_results_5_3.xlsx', index=False)

In [42]:
failed_data = evaluation_dataframe[evaluation_dataframe['match']==0]

In [43]:
failed_data['predicted_cat'] =failed_data['predicted'].apply(lambda x: le.inverse_transform([x])[0])
failed_data['actual_cat'] =failed_data['actual'].apply(lambda x: le.inverse_transform([x])[0])
failed_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  failed_data['predicted_cat'] =failed_data['predicted'].apply(lambda x: le.inverse_transform([x])[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  failed_data['actual_cat'] =failed_data['actual'].apply(lambda x: le.inverse_transform([x])[0])


Unnamed: 0,input,predicted,actual,match,predicted_cat,actual_cat
3,Business Services,8,9,False,Other Business Services,Other Office Services
8,Powerwashing,0,6,False,Cleaning,General R&M
11,Landscape Design and Installation,6,12,False,General R&M,Roads & Grounds
13,Data / Internet,9,14,False,Other Office Services,Telecommunications
23,Video Conferencing Providers,14,9,False,Telecommunications,Other Office Services


In [44]:
failed_data.to_excel('full_failed_data_5_3.xlsx', index=False)

In [45]:
evaluation_dataframe['predicted_cat'] =evaluation_dataframe['predicted'].apply(lambda x: le.inverse_transform([x])[0])
evaluation_dataframe['actual_cat'] =evaluation_dataframe['actual'].apply(lambda x: le.inverse_transform([x])[0])
evaluation_dataframe.head()

Unnamed: 0,input,predicted,actual,match,predicted_cat,actual_cat
0,Mobile Vending,9,9,True,Other Office Services,Other Office Services
1,Travel,8,8,True,Other Business Services,Other Business Services
2,Fire Materials,5,5,True,Fire/Life/Safety,Fire/Life/Safety
3,Business Services,8,9,False,Other Business Services,Other Office Services
4,N/R Supplies/Materials,6,6,True,General R&M,General R&M


In [46]:
evaluation_dataframe.to_excel('full_data_results_with_labels_5_3.xlsx', index=False)

In [47]:
def converter(label_list):
    new_list = []
    for val in label_list:
        new_dict = {}
        new_dict[le.inverse_transform([int(val['label'].split('_')[1])])[0]] = round(val['score']*100,2)
        new_list.append(new_dict)
    return new_list

In [48]:
multiple_predictions = failed_data[['input','actual_cat']]
multiple_predictions['all_predictions'] = multiple_predictions['input'].apply(lambda x:converter(sorted(pipe(x)[0], key=lambda item: item["score"], reverse=True)[:5]))
multiple_predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multiple_predictions['all_predictions'] = multiple_predictions['input'].apply(lambda x:converter(sorted(pipe(x)[0], key=lambda item: item["score"], reverse=True)[:5]))


Unnamed: 0,input,actual_cat,all_predictions
3,Business Services,Other Office Services,"[{'Other Business Services': 94.05}, {'Real Es..."
8,Powerwashing,General R&M,"[{'Cleaning': 53.84}, {'Other Office Services'..."
11,Landscape Design and Installation,Roads & Grounds,"[{'General R&M': 67.51}, {'Roads & Grounds': 2..."
13,Data / Internet,Telecommunications,"[{'Other Office Services': 39.22}, {'Telecommu..."
23,Video Conferencing Providers,Other Office Services,"[{'Telecommunications': 65.69}, {'Other Office..."
...,...,...,...
201,Waste Management,Water & Sewer,"[{'Waste': 63.98}, {'Other Office Services': 1..."
204,N/R HVAC Contracts & Ma,HVAC,"[{'General R&M': 96.42}, {'HVAC': 1.41}, {'Con..."
210,Battery Recycling,Waste,"[{'Other Office Services': 67.98}, {'Electrica..."
213,HVAC Supplies,HVAC,"[{'General R&M': 45.06}, {'HVAC': 32.37}, {'Ot..."


In [49]:
multiple_predictions = multiple_predictions.explode('all_predictions')
multiple_predictions.head()

Unnamed: 0,input,actual_cat,all_predictions
3,Business Services,Other Office Services,{'Other Business Services': 94.05}
3,Business Services,Other Office Services,{'Real Estate Expense': 2.92}
3,Business Services,Other Office Services,{'Other Office Services': 0.95}
3,Business Services,Other Office Services,{'Telecommunications': 0.41}
3,Business Services,Other Office Services,{'Security': 0.37}


In [54]:
multiple_predictions['predicted_category'] = multiple_predictions['all_predictions'].apply(lambda x:str(x.keys()).replace('dict_keys([\'','').replace('\'])',''))
multiple_predictions['prediction_probability'] = multiple_predictions['all_predictions'].apply(lambda x:str(x.values()).replace('dict_values([','').replace('])',''))
multiple_predictions.head()

Unnamed: 0,input,actual_cat,all_predictions,predicted_category,prediction_probability
3,Business Services,Other Office Services,{'Other Business Services': 94.05},Other Business Services,94.05
3,Business Services,Other Office Services,{'Real Estate Expense': 2.92},Real Estate Expense,2.92
3,Business Services,Other Office Services,{'Other Office Services': 0.95},Other Office Services,0.95
3,Business Services,Other Office Services,{'Telecommunications': 0.41},Telecommunications,0.41
3,Business Services,Other Office Services,{'Security': 0.37},Security,0.37


In [55]:
multiple_predictions[['input', 'actual_cat','predicted_category','prediction_probability']].to_excel("New_Model_Test_Failed_5_3.xlsx", index=False)