In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizerFast
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW, TextClassificationPipeline
le = LabelEncoder()
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=16)
model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [2]:
class DatasetBuilding(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [3]:
# config params
data_path = "DATA/Property Type ML model Training Data 20230413.xlsx"

In [4]:
raw_data = pd.read_excel(data_path)#[['sub_type_1','major_cat']].rename(columns={'input':'input_phrase','property_type':'text_label'}).drop_duplicates().reset_index().drop('index', axis=1)
raw_data.head()

Unnamed: 0,input,property_type
0,Agribusiness,Agriculture
1,Agricultural,Agriculture
2,Agriculture,Agriculture
3,Agriculture - TBD,Agriculture
4,Aquaculture,Agriculture


In [5]:
raw_data['label'] = le.fit_transform(raw_data['property_type'])
raw_data.head()

Unnamed: 0,input,property_type,label
0,Agribusiness,Agriculture,0
1,Agricultural,Agriculture,0
2,Agriculture,Agriculture,0
3,Agriculture - TBD,Agriculture,0
4,Aquaculture,Agriculture,0


In [6]:
required_data = raw_data[['input','label']]

In [7]:
train_texts, test_texts, train_labels, test_labels = train_test_split(list(required_data['input']), list(required_data['label']), test_size=.1, shuffle=True)

In [8]:
#train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)

In [32]:
#test_texts = list(required_data['input'])

In [33]:
#test_labels = list(required_data['label'])

In [8]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
#val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [9]:
class BuildDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = BuildDataset(train_encodings, train_labels)
test_dataset = BuildDataset(test_encodings, test_labels)

In [10]:
model.train()
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
optim = AdamW(model.parameters(), lr=5e-5)
for epoch in range(5):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()

model.eval()



DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [11]:
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)



In [12]:
import pickle
with open("embedding_model_4_20.pkl","wb") as f:
    pickle.dump(model, f)

In [13]:
import pickle
with open("label_enc_4_20.pkl","wb") as f:
    pickle.dump(le, f)

In [14]:
predicted_labels = []
for single_text in test_texts:
    predicted_labels.append(int(sorted(pipe(single_text)[0], key=lambda item: item["score"], reverse=True)[0]['label'].split('_')[1]))

In [15]:
res_dict = {'input':test_texts, 'predicted':predicted_labels, 'actual':test_labels}
evaluation_dataframe = pd.DataFrame(res_dict)
evaluation_dataframe['match'] = evaluation_dataframe['predicted']==evaluation_dataframe['actual']

In [16]:
correct_count = sum(evaluation_dataframe['match'].values)
actual_count = len(evaluation_dataframe)
accuracy = correct_count/actual_count
accuracy

0.9134615384615384

In [17]:
# Training 1 = 0.6444444444444445
# Training 2 = 0.7
# Training 3 = 0.6333333333333333
# Training 4 = 0.6888888888888889
# Training 5 = 0.6444444444444445

In [38]:
#(0.6444444444444445+0.7+0.6333333333333333+0.6888888888888889+0.6444444444444445)/5

In [18]:
evaluation_dataframe.to_excel('test_data_results_4_20.xlsx', index=False)

In [19]:
failed_data = evaluation_dataframe[evaluation_dataframe['match']==0]

In [20]:
failed_data['predicted_cat'] =failed_data['predicted'].apply(lambda x: le.inverse_transform([x])[0])
failed_data['actual_cat'] =failed_data['actual'].apply(lambda x: le.inverse_transform([x])[0])
failed_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  failed_data['predicted_cat'] =failed_data['predicted'].apply(lambda x: le.inverse_transform([x])[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  failed_data['actual_cat'] =failed_data['actual'].apply(lambda x: le.inverse_transform([x])[0])


Unnamed: 0,input,predicted,actual,match,predicted_cat,actual_cat
9,Ancillary Space,10,15,False,Office,Special Purpose
23,Coop,0,12,False,Agriculture,Residential
41,Call Center,13,10,False,Retail,Office
42,Themed,5,13,False,Hospitality,Retail
54,Grain Elevator,6,0,False,Industrial & Logistics,Agriculture


In [21]:
failed_data.to_excel('full_failed_data_4_20.xlsx', index=False)

In [22]:
evaluation_dataframe['predicted_cat'] =evaluation_dataframe['predicted'].apply(lambda x: le.inverse_transform([x])[0])
evaluation_dataframe['actual_cat'] =evaluation_dataframe['actual'].apply(lambda x: le.inverse_transform([x])[0])
evaluation_dataframe.head()

Unnamed: 0,input,predicted,actual,match,predicted_cat,actual_cat
0,Other Shopping Center (Attached),13,13,True,Retail,Retail
1,Special Purpose - TBD,15,15,True,Special Purpose,Special Purpose
2,Bed & Breakfast,5,5,True,Hospitality,Hospitality
3,Hybrid Car Wash,13,13,True,Retail,Retail
4,Distribution,6,6,True,Industrial & Logistics,Industrial & Logistics


In [23]:
evaluation_dataframe.to_excel('full_data_results_with_labels_4_20.xlsx', index=False)

In [24]:
def converter(label_list):
    new_list = []
    for val in label_list:
        new_dict = {}
        new_dict[le.inverse_transform([int(val['label'].split('_')[1])])[0]] = round(val['score']*100,2)
        new_list.append(new_dict)
    return new_list

In [25]:
converter(sorted(pipe("office")[0], key=lambda item: item["score"], reverse=True)[:5])

[{'Office': 98.17},
 {'Public Institution': 0.3},
 {'Healthcare': 0.24},
 {'Data Center': 0.23},
 {'Hospitality': 0.17}]

In [26]:
multiple_predictions = failed_data[['input','actual_cat']]
multiple_predictions['all_predictions'] = multiple_predictions['input'].apply(lambda x:converter(sorted(pipe(x)[0], key=lambda item: item["score"], reverse=True)[:5]))
multiple_predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multiple_predictions['all_predictions'] = multiple_predictions['input'].apply(lambda x:converter(sorted(pipe(x)[0], key=lambda item: item["score"], reverse=True)[:5]))


Unnamed: 0,input,actual_cat,all_predictions
9,Ancillary Space,Special Purpose,"[{'Office': 88.15}, {'Residential': 4.48}, {'L..."
23,Coop,Residential,"[{'Agriculture': 44.47}, {'Residential': 27.94..."
41,Call Center,Office,"[{'Retail': 66.66}, {'Office': 17.08}, {'Hospi..."
42,Themed,Retail,"[{'Hospitality': 51.36}, {'Leisure': 38.8}, {'..."
54,Grain Elevator,Agriculture,"[{'Industrial & Logistics': 98.74}, {'Retail':..."
63,Agribusiness,Agriculture,"[{'Industrial & Logistics': 82.33}, {'Retail':..."
73,Accounting,Industrial & Logistics,"[{'Office': 91.39}, {'Data Center': 1.84}, {'I..."
74,Single-Care,Senior Housing,"[{'Healthcare': 94.83}, {'Senior Housing': 2.3..."
75,Multi-Let Estates,Industrial & Logistics,"[{'Land': 84.35}, {'Residential': 11.95}, {'Ag..."


In [27]:
multiple_predictions = multiple_predictions.explode('all_predictions')
multiple_predictions.head()

Unnamed: 0,input,actual_cat,all_predictions
9,Ancillary Space,Special Purpose,{'Office': 88.15}
9,Ancillary Space,Special Purpose,{'Residential': 4.48}
9,Ancillary Space,Special Purpose,{'Land': 2.07}
9,Ancillary Space,Special Purpose,{'Special Purpose': 1.73}
9,Ancillary Space,Special Purpose,{'Hospitality': 0.55}


In [28]:
multiple_predictions['predicted_category'] = multiple_predictions['all_predictions'].apply(lambda x:x.keys())
multiple_predictions['prediction_probability'] = multiple_predictions['all_predictions'].apply(lambda x:x.values())
multiple_predictions.head()

Unnamed: 0,input,actual_cat,all_predictions,predicted_category,prediction_probability
9,Ancillary Space,Special Purpose,{'Office': 88.15},(Office),(88.15)
9,Ancillary Space,Special Purpose,{'Residential': 4.48},(Residential),(4.48)
9,Ancillary Space,Special Purpose,{'Land': 2.07},(Land),(2.07)
9,Ancillary Space,Special Purpose,{'Special Purpose': 1.73},(Special Purpose),(1.73)
9,Ancillary Space,Special Purpose,{'Hospitality': 0.55},(Hospitality),(0.55)


In [29]:
multiple_predictions[['input', 'actual_cat','predicted_category','prediction_probability']].to_excel("New_Model_Test_Failed_4_20.xlsx", index=False)