# Imports

In [19]:
import os
import pandas as pd
import json
import torch
import sys
# Put the S3BERT source directory on the module search path before the
# project-local imports run. NOTE(review): presumably `training_utils` and the
# other local modules imported in this notebook live there — confirm. The path
# is relative, so it depends on the kernel's working directory.
sys.path.append('../S3BERT/src')
from training_utils import create_weighted_avg, normalize_column, get_data, train_combined

In [20]:
import custom_losses
import custom_evaluators
from sentence_transformers import SentenceTransformer, InputExample
import model_freeze as freeze
import prediction_helpers as ph
import ast
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from transformers import AutoModel
from training_utils import get_data, create_weighted_avg, normalize_column, train, infer, get_mse_scores, get_spearman_scores
from training_utils import prepare_for_training as similarity_prepare_for_training
from classification_training_utils import prepare_for_training as classification_prepare_for_training, get_big_consulting_df, collect_classification_labels, get_top_values, get_relevant_classifications, filter_relevant_classifications
from sentence_transformers import losses
from sentence_transformers import evaluation
from utils import load_model

# Select the compute device: CUDA when torch reports it as available, CPU otherwise.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Report how many CUDA devices torch can see and which device was chosen.
print(torch.cuda.device_count())
print("Device: ", device)

0
Device:  cpu


# Training

In [21]:
# Single configuration dict handed to the data-loading, preparation and
# training helpers used in this notebook.
params = {
    # --- general training settings ---
    "EPOCHS": 3,
    "SAVE_PATH": "outputs/",  # alternative: "s3bert_" + params["SBERT_INIT"] + "/"
    "BATCH_SIZE": 32,
    "WARMUP_STEPS": 10,
    "EVAL_STEPS": 100,
    "LEARNING_RATE": 2e-5,
    # --- similarity training specific ---
    "USE_WEIGHTED_SIMILARITY": True,
    # True: use the manually created val/test labels; False: use a similarity metric.
    "USE_MANUAL_LABELS": True,
    "LOSSES": ["cosine"],  # options: ["consistency", "distill", "distill2", "cosine"]
    "FEATURES": ["input1", "input2"],
}

# Distillation is switched on as soon as any configured loss name contains "distill".
params["USE_DISTILL"] = any("distill" in loss_name for loss_name in params["LOSSES"])
if params["USE_DISTILL"]:
    params["FEATURES"].extend(["company", "country", "classification", "keywords"])
# Optional extra feature (currently disabled): 'similarity'
params["N"] = len(params["FEATURES"]) - 2  # number of metrics (features beyond the two inputs)
params["FEATURE_DIM"] = 16 if params["USE_DISTILL"] else 0

# --- classification training specific ---
params.update(
    {
        "USE_REPLACE_DATA": False,
        "USE_ORIGINAL_DATA": True,
        "UNFREEZE_LAYERS": 2,
        "EXCLUDE_ENTITY_OTHER": True,
        # None selects the default model; alternatives tried:
        # "brjezierski/S3BERT", "intfloat/e5-small-v2"
        "INITIALIZED_MODEL": None,
        "OCCURENCE_CUTOFF": 2,
        "CREATE_NEW_SPLIT": False,
    }
)

# Directory passed to the classification preparation step further down.
dataset_dir = "../classification-training-data/"

# Load the explicitly configured checkpoint when one is set; otherwise fall
# back to load_model()'s own default.
if params["INITIALIZED_MODEL"]:
    model = load_model(model=params["INITIALIZED_MODEL"])
else:
    model = load_model()

In [22]:
# Columns handed to get_data together with their proportions.
# NOTE(review): column_proportions has five entries for four columns —
# presumably the first one weights the overall similarity; confirm against get_data.
column_list = ['country', 'company', 'keywords', 'classification']
column_proportions = (10, 2, 1, 1, 1)

similarity_train_df, similarity_val_df, similarity_test_df = get_data(
    column_list,
    column_proportions,
    params,
    current_prefix='../similarity-training-data/sbert-company/',
)

# Show only the first training row as a sanity check.
similarity_train_df.head(1)

Unnamed: 0,snippet1,snippet2,similarity,country_similarity,country1,country2,classification_similarity,classification1,classification2,keywords_similarity,keywords1,keywords2,company_similarity,company1,company2
0,at KPMG where he focused on complex financial ...,LatentView has been recognized as an industry ...,0.296189,0.536794,US,IN,0.347584,"['strategy', 'company info']",['leadership'],0.299588,['he'],"['leader', 'industry leader']",0.632063,KPMG A/S,Gartner Inc.


In [23]:
# Load the consulting dataframe, pass it through label collection and
# get_top_values, then split off the relevant classifications.
big_consulting_df = get_top_values(
    collect_classification_labels(
        get_big_consulting_df(params),
        verbose=True,
    ),
    params,
)

all_classifications, relevant_classifications = get_relevant_classifications(
    big_consulting_df,
    params,
)

big_consulting_df, classifications = filter_relevant_classifications(
    big_consulting_df,
    all_classifications,
    relevant_classifications,
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column].loc[df[column].isnull()] = df[column].loc[df[column].isnull()


['Penalties', 'Supply Deals', 'Market Regions', 'Product Investments', 'Critics', 'Deals', 'Financial Problems', 'Expanding Facilities', 'Efficiency', 'Events', 'Grow', 'Procure Supply', 'Funiding Investment', 'Fighting Abuse', 'Market Coverage', 'Selling Assets', 'Markets', 'Layoff Staff', 'Products', 'Product Launch', 'Hosting Events', 'Partner for Event', 'Operational Regions', 'Resellers', 'PR', 'Product Growth', 'Investments', 'Award-Sponsoring', 'Mergers', 'Coworks', 'Product Advertising', 'Service Agreements', 'Strategy', 'Using', 'Company Info', 'Decline', 'Market Share', 'Internals', 'Celebrations', 'IP Penalties', 'Deliver', 'Activities', 'Solved Problems', 'Promotions', 'Law Infringement', 'Being Rated Financially', 'Employee Growth', 'Finished Sueings', 'Competition', 'Product Stop', 'Milestones', 'Unit Sold', 'Awards', 'Leadership', 'Corporate Reports', 'Founding', 'Received Investment', 'Outgoing Employees', 'Regulatory', 'Launches', 'Mediascore', 'Closing Facilities', 'B

In [24]:
# Similarity task: objectives and evaluator come from the train/val frames.
similarity_train_objectives, similarity_evaluator = similarity_prepare_for_training(
    model,
    params,
    similarity_train_df,
    similarity_val_df,
)

# Classification task: objectives plus the two evaluators returned alongside them.
(
    classification_train_objectives,
    dev_classification_evaluator,
    test_similarity_evaluator,
) = classification_prepare_for_training(
    big_consulting_df,
    classifications,
    params,
    dataset_dir,
    model,
)

# Disabled experiment: reuse the classification objectives in the similarity slot.
# similarity_train_objectives = classification_train_objectives

['similarity']
['similarity']
e=3 Using original data, unfreezing 2 last layers, excluding Entity and Other labels, only including words that occur at least 2 times

Training data size 2907
Validation data size 100
Test data size 100


In [25]:
# Call train_combined with both sets of objectives and both evaluators.
# Kept as the cell's bare last expression so its return value is displayed.
train_combined(
    classification_train_objectives,
    similarity_train_objectives,
    similarity_evaluator,
    dev_classification_evaluator,
    params,
    model,
)

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

Score at epoch 0, step -1: 1.7839489007658331


Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

Score at epoch 1, step -1: 1.8339126088867568


Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

Score at epoch 2, step -1: 1.8445235221845468


(None,
 SentenceTransformer(
   (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
   (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
   (2): Normalize()
 ))

In [8]:
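# NOTE(review): unexplained value, presumably a score from an earlier training run — confirm and label it, or delete this cell: 1.8356546190824357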
