In [62]:
import importlib
import classification_training_utils
importlib.reload(classification_training_utils)
import utils
importlib.reload(utils)

import sys
sys.path.append('../')

import pandas as pd
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util
from sentence_transformers.datasets import SentenceLabelDataset
from sentence_transformers.readers import InputExample
from sentence_transformers.evaluation import TripletEvaluator
from sklearn.metrics.pairwise import cosine_similarity
from utils import load_model, replace_nan_with, load_big_consulting_export, callback
import numpy as np
import random
from torch.utils.data import DataLoader
from datetime import datetime
import os
from collections import defaultdict
import S3BERT.src.model_freeze as freeze
import matplotlib.pyplot as plt
plt.style.use('ggplot')


from classification_training_utils import get_big_consulting_df, collect_classification_labels, get_relevant_classifications, train, filter_relevant_classifications, get_top_values, get_news_df
from transformers import EarlyStoppingCallback
import math
import torch
from tqdm import tqdm
tqdm.pandas()
import pickle

In [63]:
params = {}
params["EPOCHS"] = 2
params["USE_REPLACE_DATA"] = False
params["USE_ORIGINAL_DATA"] = True
params["UNFREEZE_LAYERS"] = 2
params["EXCLUDE_ENTITY_OTHER"] = True
params["INITIALIZED_MODEL"] = None #"brjezierski/S3BERT" # "intfloat/e5-small-v2"
params["OCCURENCE_CUTOFF"] = 2
params["CREATE_NEW_SPLIT"] = False
params["BATCH_SIZE"] = 32

dataset_dir = "../classification-training-data/"
sbert_model = load_model() if not params["INITIALIZED_MODEL"] else load_model(
        model=params["INITIALIZED_MODEL"])

In [64]:
def load_ai_news():
    prefix = '../glanos-data/datasets/'
    tsv_path = f"{prefix}ai_news.tsv"

    df = pd.read_csv(tsv_path, delimiter='\t')
    df.dropna(subset=['snippet'], inplace=True)
    df = df.sort_values(['snippet', 'classification'])
    df.drop_duplicates(subset=['snippet'], keep='first', inplace=True)
    df = df.sort_index().reindex()

    return df

def load_car_news():
    prefix = '../glanos-data/datasets/'
    tsv_path = f"{prefix}car_news.tsv"

    df = pd.read_csv(tsv_path, delimiter='\t')
    df.dropna(subset=['snippet'], inplace=True)
    df = df.sort_values(['snippet', 'classification'])
    df.drop_duplicates(subset=['snippet'], keep='first', inplace=True)
    df = df.sort_index().reindex()

    return df

def get_news_df(params, dataset_name):
    prefix = '../glanos-data/embeddings/'
    replace_data_path = f'{prefix}{dataset_name}_replace.pickle'
    original_data_path = f'{prefix}{dataset_name}_snippet.pickle'

    sbert_model = load_model()

    def encode_w_sbert(snippet):
        return sbert_model.encode(snippet)

    news_df = load_ai_news() if dataset_name == 'ai_news' else load_car_news()
    if params["USE_ORIGINAL_DATA"] and params["USE_REPLACE_DATA"] and os.path.exists(replace_data_path):
        print("Not supported")
        return None
        # with open(original_data_path, 'rb') as f:
        #     news_df_r = pickle.load(f)
        # news_df = pd.concat(
        #     [news_df, news_df_r], axis=0).reset_index(drop=True)
    elif params["USE_ORIGINAL_DATA"]:
        if os.path.exists(original_data_path):
            with open(original_data_path, 'rb') as f:
                original_dict = pickle.load(f)
            news_df['embedding'] = news_df['snippet'].map(original_dict)
        else:
            news_df['embedding'] = news_df.progress_apply(
                lambda row: encode_w_sbert(row['snippet']), axis=1)
    elif params["USE_REPLACE_DATA"]:
        if os.path.exists(replace_data_path):
            with open(replace_data_path, 'rb') as f:
                replace_dict = pickle.load(f)
                print(list(replace_dict.keys())[:10])
            news_df['embedding'] = news_df['replace'].map(replace_dict)
        else:
            news_df['embedding'] = news_df.progress_apply(
                lambda row: encode_w_sbert(row['replace']), axis=1)
    else:
        print("Not supported")
        return None

    news_df.dropna(subset=['embedding'], inplace=True)
    return news_df

In [60]:
big_consulting_df = get_big_consulting_df(params)
ai_news_df = get_news_df(params, 'ai_news')
car_news_df = get_news_df(params, 'car_news')
df = pd.concat([big_consulting_df, ai_news_df, car_news_df], axis=0).reset_index(drop=True)
len(ai_news_df), len(car_news_df)

(262523, 177908)

In [41]:
df = collect_classification_labels(df, verbose=True)



In [42]:
df = get_top_values(df, params)
df

Unnamed: 0,date,company,classification,snippet,replace,url,embedding,keywords,top_classification
0,2023-06-04,IBM Corp.,[ENTITY],IBM’s CEO recently said it would pause hiring ...,"In fact, #COMPANY's #JOBTITLE recently said it...",https://app.news-monitor.ai/shared/news/newsId...,"[-0.035979703, -0.063728906, 0.0027139399, -0....",,
1,2023-06-04,IBM Corp.,[ENTITY],"In fact, IBM’s CEO recently said","In fact, #COMPANY's #JOBTITLE recently said it...",https://app.news-monitor.ai/shared/news/newsId...,"[-0.035979703, -0.063728906, 0.0027139399, -0....",,
2,2023-06-04,Infosys Limited,"[OTHER, ENTITY]",IT services giant Infosys Ltd. appointed Helen...,IT services giant #COMPANY appointed #PERSON P...,https://app.news-monitor.ai/shared/news/newsId...,"[-0.035946183, 0.077817865, -0.0075995466, -0....",,
3,2023-06-04,IBM Corp.,"[OTHER, ENTITY]","such as Polkadot, Pantera, CoinDesk, The Graph...","such as Polkadot, #COMPANY, #COMPANY, The Grap...",https://app.news-monitor.ai/shared/news/newsId...,"[-0.053204842, -0.07888795, 0.028639238, -0.03...",,
4,2023-06-04,Wipro Limited,[ENTITY],"Wipro Ltd has fixed Friday, June 16, 2023 as t...","#COMPANY has fixed Friday, #DATE as the Record...",https://app.news-monitor.ai/shared/news/newsId...,"[-0.1023133, 0.04731851, -0.03809882, -0.01436...",,
...,...,...,...,...,...,...,...,...,...
450430,,,[ENTITY],BMW’s premium hatchback will get a complete re...,#COMPANY’s premium hatchback will get a comple...,,"[-0.04795007, 0.07876511, 0.040450905, 0.00488...",bmw,
450431,,,[ENTITY],Volkswagen has announced a new limited edition...,#COMPANY has announced a new limited edition v...,,"[-0.044824775, 0.10501017, 0.14470142, 0.00046...",volkswagen,
450432,,,[ENTITY],"Volkswagen’s premium SUV, the Touareg, is now ...","#COMPANY’s premium SUV, the Touareg, is now on...",,"[-0.0110210255, 0.06328671, -0.011485885, 0.03...",volkswagen,
450433,,,[ENTITY],"The VW Touareg is available in Elegance, Black...","The #COMPANY Touareg is available in Elegance,...",,"[-0.088718735, -0.006033144, 0.00040222268, 0....",volkswagen,


In [43]:
all_classifications, relevant_classifications = get_relevant_classifications(df, params)
relevant_classifications

Share of rows with a non-empty list in 'classification': 109422
Share of rows with a list of len 2 in 'classification': 601
# of rows with a non-empty list in top_classification: 109422
product 23000
employ 22635
state 11806
productlaunch 5769
partner 4071
use 3471
growneutral 3188
invest 2945
hq_loc 1969
tech 1950
op_loc 1833
part_of 1376
sell 1196
grow 1104
funding 1102
supply 1040
employstart 1019
growmarket 975
regulatory 967
leadership 924
market_loc 917
problem 880
techpatent 770
investor 759
acquire 692
employstop 624
general 623
pr 595
founder 582
deal 575
sue 573
agreement 553
service 524
report 451
productadv 451
market 430
growprod 407
staffinglayoffs 387
deliver 376
launchfacility 358
efficiency 351
rights 315
decline 278
productstop 271
event 258
career 226
compete 208
staffing 197
award 172
launch 168
milestone 160
owner 157
financial 156
marketcoverage 151
growforecast 148
prod_loc 133
competence 124
monetary 108
success 89
marketshare 83
charity 83
cowork 81
mediascore 

['product',
 'employ',
 'state',
 'productlaunch',
 'partner',
 'use',
 'growneutral',
 'invest',
 'hq_loc',
 'tech',
 'op_loc',
 'part_of',
 'sell',
 'grow',
 'funding',
 'supply',
 'employstart',
 'growmarket',
 'regulatory',
 'leadership',
 'market_loc',
 'problem',
 'techpatent',
 'investor',
 'acquire',
 'employstop',
 'general',
 'pr',
 'founder',
 'deal',
 'sue',
 'agreement',
 'service',
 'report',
 'productadv',
 'market',
 'growprod',
 'staffinglayoffs',
 'deliver',
 'launchfacility',
 'efficiency',
 'rights',
 'decline',
 'productstop',
 'event',
 'career',
 'compete',
 'staffing',
 'award',
 'launch',
 'milestone',
 'owner',
 'financial',
 'marketcoverage',
 'growforecast',
 'prod_loc',
 'competence',
 'monetary',
 'success',
 'marketshare',
 'charity',
 'cowork',
 'mediascore',
 'sellto',
 'sued',
 'closefacility',
 'hire',
 'eventhost',
 'awardspons',
 'tender',
 'growfacility',
 'corp_report',
 'sueend',
 'internal',
 'abuselaw',
 'acquireasset',
 'negotiate',
 'problems

In [44]:
df, classifications = filter_relevant_classifications(df, all_classifications, relevant_classifications)
params["WARMUP_STEPS"] = int(len(df) * params["EPOCHS"] * 0.1)  # 10% of train data
df

Unnamed: 0,index,date,company,classification,snippet,replace,url,embedding,keywords,top_classification,id
0,5,2023-06-04,Aon PLC Holding,"[PARTNER, ENTITY, PRODUCT]",Aon has announced a partnership with Praedicat...,#COMPANY has announced a partnership with #COM...,https://app.news-monitor.ai/shared/news/newsId...,"[-0.076906666, -0.07068141, -0.011610352, 0.02...",,partner,0
1,6,2023-06-04,Aon PLC Holding,[PARTNER],Praedicat is delighted to expand our collabora...,#COMPANY is delighted to expand our collaborat...,https://app.news-monitor.ai/shared/news/newsId...,"[-0.07951004, 0.0024159628, 0.036889132, 0.002...",,partner,1
2,8,2023-06-04,IBM Corp.,"[SUPPLY, COWORK, GENERAL]",working with IBM Consulting to build a convers...,"For example, working with #COMPANY to build a ...",https://app.news-monitor.ai/shared/news/newsId...,"[-0.051184747, -0.06896829, -0.00018741902, -0...",,supply,2
3,10,2023-06-04,IBM Corp.,"[PRODUCT, STATE]","IBM is not obligated to provide any material, ...",#COMPANY is not obligated to provide any mater...,https://app.news-monitor.ai/shared/news/newsId...,"[-0.029711233, 0.053124078, 0.0028441548, -0.0...",,product,3
4,19,2023-06-03,IBM Corp.,[STAFFING],IBM's CEO said it was halting hiring for almos...,"Within weeks, #COMPANY's #JOBTITLE said it was...",https://app.news-monitor.ai/shared/news/newsId...,"[-0.041291106, -0.063835, -0.021371098, -0.039...",,staffing,4
...,...,...,...,...,...,...,...,...,...,...,...
109110,450402,,,[ACQUIRE],While it might prove difficult to persuade buy...,While it might prove difficult to persuade buy...,,"[0.046328686, 0.040803503, 0.018272437, -0.034...",audi|bmw,acquire,109110
109111,450415,,,[AWARD],The Hyundai Tucson bagged our Best Family Car ...,The #COMPANY bagged our Best Family Car at the...,,"[-0.004783157, 0.15697657, 0.08591719, -0.0348...",hyundai,award,109111
109112,450419,,,[PRODUCT-LAUNCH],"Audi introduced further updates in 2020, with ...","#COMPANY introduced further updates in #DATE, ...",,"[-0.07528738, 0.04278491, 0.041177336, 0.03218...",audi,productlaunch,109112
109113,450425,,,[PARTNER],The function is currently limited to a handfu...,"a selection of recent #COMPANY models, but it’...",,"[-0.030761916, 0.02911859, 0.042076293, -0.026...",hyundai|volkswagen,partner,109113


In [9]:
# ai_news_df = collect_classification_labels(ai_news_df, verbose=True)
# ai_news_df = get_top_values(ai_news_df, params)
# ai_news_df, ai_news_classifications = filter_relevant_classifications(ai_news_df, all_classifications, relevant_classifications)

# car_news_df = collect_classification_labels(car_news_df, verbose=True)
# car_news_df = get_top_values(car_news_df, params)
# car_news_df, car_news_classifications = filter_relevant_classifications(car_news_df, all_classifications, relevant_classifications)





In [48]:
model_fit, test_evaluator, sbert_model = train(df, classifications, params, dataset_dir, sbert_model)
# model_fit, test_evaluator, sbert_model = train([ai_news_df, car_news_df], [ai_news_classifications, ai_news_classifications], params, dataset_dir, sbert_model)

# Accuracy Cosine Distance
# baseline - 0.77
# e=2 Using original data - 0.83
# BEST e=2 Using original data, unfreezing 2 last layers, only including words that occur at least 2 times - 0.93
# e=2 Using original data, unfreezing 2 last layers, only including words that occur at least 3 times - 0.93
# e=2 Using original data, unfreezing 2 last layers, only including words that occur at least 4 times - 0.92
# e=1 Using original data, unfreezing 2 last layers, starting with brjezierski/S3BERT, only including words that occur at least 2 times - 0.89
# e=2 Using original data, unfreezing 2 last layers, starting with brjezierski/S3BERT, only including words that occur at least 2 times - 0.92

# e=2 Using original and replacement data, unfreezing 2 last layers, excluding Entity and Other labels, starting with brjezierski/S3BERT, only including words that occur at least 2 times - 0.91
# e=2 Using original and replacement data, unfreezing 2 last layers, starting with brjezierski/S3BERT, only including words that occur at least 2 times - 0.88
# e=1 Using original and replacement data, unfreezing 2 last layers, excluding Entity and Other labels, starting with brjezierski/S3BERT, only including words that occur at least 2 times - 0.92

# e=2 Using original data, unfreezing 2 last layers, starting with intfloat/e5-small-v2, only including words that occur at least 2 times - 0.91
# e=2 Using original and replacement data, unfreezing 2 last layers, excluding Entity and Other labels, starting with intfloat/e5-small-v2, only including words that occur at least 2 times - 0.89

# e=2 Using replacement data, creating a new train-dev-test split, starting with brjezierski/S3BERT, only including words that occur at least 2 times - 0.96, 0.88 (baseline)
# e=1 Using replacement data, creating a new train-dev-test split, starting with brjezierski/S3BERT, only including words that occur at least 2 times - 0.94
# e=2 Using replacement data, creating a new train-dev-test split, excluding Entity and Other labels, starting with brjezierski/S3BERT, only including words that occur at least 2 times - 0.94
# e=1 Using replacement data, creating a new train-dev-test split, excluding Entity and Other labels, starting with brjezierski/S3BERT, only including words that occur at least 2 times - 0.96

# e=1 Using replacement data, unfreezing 2 last layers, excluding Entity and Other labels, only including words that occur at least 2 times, training data size 67325+38639

e=2 Using replacement data, creating a new train-dev-test split, unfreezing 2 last layers, excluding Entity and Other labels, only including words that occur at least 2 timesTraining data size108915

Training data size 108915
Validation data size 100
Test data size 100
Performance before fine-tuning:


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3403 [00:00<?, ?it/s]

Score at epoch 0, step -1: 0.82


Iteration:   0%|          | 0/3403 [00:00<?, ?it/s]

In [65]:
big_consulting_df = get_big_consulting_df(params)
df = collect_classification_labels(big_consulting_df, verbose=True)
df = get_top_values(df, params)
all_classifications, relevant_classifications = get_relevant_classifications(df, params)
df, classifications = filter_relevant_classifications(df, all_classifications, relevant_classifications)
params["WARMUP_STEPS"] = int(len(df) * params["EPOCHS"] * 0.1)  # 10% of train data
model_fit, test_evaluator, sbert_model = train(df, classifications, params, dataset_dir, sbert_model)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column].loc[df[column].isnull()] = df[column].loc[df[column].isnull()


['Markets', 'Layoff', 'Monetary', 'Using Products', 'External Hirings', 'Employee Growth', 'Product Growth', 'Corporate Reports', 'Being used', 'Ownerships', 'Coworks', 'Successes', 'Critics', 'Publishing Numbers', 'Unit Acquistion', 'Being sold', 'Events', 'Mandate service', 'Activities', 'Insolvency', 'Resellers', 'Product Investments', 'New Employees', 'Rights', 'IP Penalties', 'Employee Careers', 'Technology', 'Product Advertising', 'Competition', 'Products', 'Investments', 'Charity', 'Procure Supply', 'Facility Launches', 'Celebrations', 'Market Coverage', 'Promotions', 'Acquired', 'Launches', 'Leadership', 'Operations', 'Layoff Staff', 'Acquistions', 'Product Problems', 'Employee Misconduct', 'Launched Operations', 'Founding', 'Service Agreements', 'Financial Problems', 'Deals', 'Grow', 'Employees', 'Fighting Abuse', 'Failed Competence', 'Using', 'Staffing', 'Milestones', 'Hosting Events', 'Product Stop', 'Company Info', 'Market Share', 'Penalties', 'Operational Regions', 'Patent

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/97 [00:00<?, ?it/s]

Score at epoch 0, step -1: 0.8


Iteration:   0%|          | 0/97 [00:00<?, ?it/s]

Score at epoch 1, step -1: 0.89


In [66]:
sbert_model.evaluate(test_evaluator)

0.89

In [67]:
frozen_model = load_model()
frozen_model.evaluate(test_evaluator)

0.77

In [None]:
# 0.93 0.77