In [26]:
import importlib
import classification_training_utils
importlib.reload(classification_training_utils)
import utils
importlib.reload(utils)

import sys
sys.path.append('../')

import pandas as pd
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util
from sentence_transformers.datasets import SentenceLabelDataset
from sentence_transformers.readers import InputExample
from sentence_transformers.evaluation import TripletEvaluator
from sklearn.metrics.pairwise import cosine_similarity
from utils import load_model, replace_nan_with, load_big_consulting_export, callback
import numpy as np
import random
from torch.utils.data import DataLoader
from datetime import datetime
import os
from collections import defaultdict
import S3BERT.src.model_freeze as freeze
import matplotlib.pyplot as plt
plt.style.use('ggplot')


from classification_training_utils import get_big_consulting_df, collect_classification_labels, get_relevant_classifications, train, filter_relevant_classifications, get_top_values, get_news_df
from utils import create_replace_no_tags_embeddings
from transformers import EarlyStoppingCallback
import math
import torch
from tqdm import tqdm
tqdm.pandas()
import pickle

In [27]:
params = {}
params["EPOCHS"] = 4
params["USE_REPLACE_DATA"] = True
params["USE_ORIGINAL_DATA"] = False
params["UNFREEZE_LAYERS"] = 2
params["EXCLUDE_ENTITY_OTHER"] = True
params["INITIALIZED_MODEL"] = None #"brjezierski/S3BERT" # "intfloat/e5-small-v2"
params["OCCURENCE_CUTOFF"] = 2
params["CREATE_NEW_SPLIT"] = True
params["BATCH_SIZE"] = 32
params["VAL_DEV_SIZE"] = 400
params['SNIPPET_COLUMN_NAME'] = 'replace_no_tags'

dataset_dir = "../classification-training-data/15-9-23/"
sbert_model = load_model() if not params["INITIALIZED_MODEL"] else load_model(
        model=params["INITIALIZED_MODEL"])

In [28]:
def load_ai_news():
    prefix = '../glanos-data/datasets/'
    tsv_path = f"{prefix}ai_news.tsv"

    df = pd.read_csv(tsv_path, delimiter='\t')
    df.dropna(subset=['snippet'], inplace=True)
    df = df.sort_values(['snippet', 'classification'])
    df.drop_duplicates(subset=['snippet'], keep='first', inplace=True)
    df = df.sort_index().reindex()

    return df

def load_car_news():
    prefix = '../glanos-data/datasets/'
    tsv_path = f"{prefix}car_news.tsv"

    df = pd.read_csv(tsv_path, delimiter='\t')
    df.dropna(subset=['snippet'], inplace=True)
    df = df.sort_values(['snippet', 'classification'])
    df.drop_duplicates(subset=['snippet'], keep='first', inplace=True)
    df = df.sort_index().reindex()

    return df

def get_news_df(params, dataset_name):
    prefix = '../glanos-data/embeddings/'
    replace_data_path = f'{prefix}{dataset_name}_replace.pickle'
    original_data_path = f'{prefix}{dataset_name}_snippet.pickle'

    sbert_model = load_model()

    def encode_w_sbert(snippet):
        return sbert_model.encode(snippet)

    news_df = load_ai_news() if dataset_name == 'ai_news' else load_car_news()
    if params["USE_ORIGINAL_DATA"] and params["USE_REPLACE_DATA"] and os.path.exists(replace_data_path):
        print("Not supported")
        return None
        # with open(original_data_path, 'rb') as f:
        #     news_df_r = pickle.load(f)
        # news_df = pd.concat(
        #     [news_df, news_df_r], axis=0).reset_index(drop=True)
    elif params["USE_ORIGINAL_DATA"]:
        if os.path.exists(original_data_path):
            with open(original_data_path, 'rb') as f:
                original_dict = pickle.load(f)
            news_df['embedding'] = news_df['snippet'].map(original_dict)
        else:
            news_df['embedding'] = news_df.progress_apply(
                lambda row: encode_w_sbert(row['snippet']), axis=1)
    elif params["USE_REPLACE_DATA"]:
        if os.path.exists(replace_data_path):
            with open(replace_data_path, 'rb') as f:
                replace_dict = pickle.load(f)
                print(list(replace_dict.keys())[:10])
            news_df['embedding'] = news_df['replace'].map(replace_dict)
        else:
            news_df['embedding'] = news_df.progress_apply(
                lambda row: encode_w_sbert(row['replace']), axis=1)
    else:
        print("Not supported")
        return None

    news_df.dropna(subset=['embedding'], inplace=True)
    return news_df

In [37]:
big_consulting_df = get_big_consulting_df(params)
ai_news_df = get_news_df(params, 'ai_news')
car_news_df = get_news_df(params, 'car_news')
embeddings_prefix = '../glanos-data/embeddings/'
with open(f'{embeddings_prefix}big_consulting_2_replace_no_tags.pickle', 'rb') as f:
    replace_no_tags_embeddings = pickle.load(f)
with open(f'{embeddings_prefix}car_news_replace_no_tags.pickle', 'rb') as f:
    car_news_no_tags_embeddings = pickle.load(f)
with open(f'{embeddings_prefix}ai_news_replace_no_tags.pickle', 'rb') as f:
    ai_news_no_tags_embeddings = pickle.load(f)

big_consulting_df = create_replace_no_tags_embeddings(big_consulting_df, replace_no_tags_embeddings)
big_consulting_df['embedding'] = big_consulting_df['replace_no_tags'].map(replace_no_tags_embeddings)

ai_news_df = create_replace_no_tags_embeddings(ai_news_df, ai_news_no_tags_embeddings)
ai_news_df['embedding'] = ai_news_df['replace_no_tags'].map(ai_news_no_tags_embeddings)

car_news_df = create_replace_no_tags_embeddings(car_news_df, car_news_no_tags_embeddings)
car_news_df['embedding'] = car_news_df['replace_no_tags'].map(car_news_no_tags_embeddings)

df = pd.concat([big_consulting_df, ai_news_df, car_news_df], axis=0).reset_index(drop=True)
len(ai_news_df), len(car_news_df), len(big_consulting_df) # big_consulting_df is short 

(67514, 38834, 2744)

In [31]:
df = collect_classification_labels(df, verbose=False)
df = get_top_values(df, params)

In [18]:
all_classifications, relevant_classifications = get_relevant_classifications(df, params)
relevant_classifications

Share of rows with a non-empty list in 'classification': 109422
Share of rows with a list of len 2 in 'classification': 602
# of rows with a non-empty list in top_classification: 109422
product 22976
employ 22635
state 11820
productlaunch 5762
partner 4078
use 3472
growneutral 3188
invest 2949
hq_loc 1969
tech 1951
op_loc 1828
part_of 1376
sell 1196
grow 1104
funding 1102
supply 1040
employstart 1020
growmarket 975
regulatory 967
leadership 924
market_loc 911
problem 880
techpatent 770
investor 759
acquire 692
general 626
employstop 624
pr 596
founder 582
deal 578
sue 573
agreement 553
service 522
report 451
productadv 451
market 430
growprod 411
staffinglayoffs 387
deliver 376
launchfacility 357
efficiency 353
rights 315
decline 278
productstop 271
event 258
career 226
compete 208
staffing 197
award 172
launch 168
milestone 160
owner 157
financial 156
marketcoverage 151
growforecast 148
prod_loc 133
competence 124
monetary 108
success 89
charity 83
marketshare 83
mediascore 81
cowork 

['product',
 'employ',
 'state',
 'productlaunch',
 'partner',
 'use',
 'growneutral',
 'invest',
 'hq_loc',
 'tech',
 'op_loc',
 'part_of',
 'sell',
 'grow',
 'funding',
 'supply',
 'employstart',
 'growmarket',
 'regulatory',
 'leadership',
 'market_loc',
 'problem',
 'techpatent',
 'investor',
 'acquire',
 'general',
 'employstop',
 'pr',
 'founder',
 'deal',
 'sue',
 'agreement',
 'service',
 'report',
 'productadv',
 'market',
 'growprod',
 'staffinglayoffs',
 'deliver',
 'launchfacility',
 'efficiency',
 'rights',
 'decline',
 'productstop',
 'event',
 'career',
 'compete',
 'staffing',
 'award',
 'launch',
 'milestone',
 'owner',
 'financial',
 'marketcoverage',
 'growforecast',
 'prod_loc',
 'competence',
 'monetary',
 'success',
 'charity',
 'marketshare',
 'mediascore',
 'cowork',
 'sellto',
 'sued',
 'closefacility',
 'hire',
 'eventhost',
 'awardspons',
 'tender',
 'growfacility',
 'corp_report',
 'sueend',
 'internal',
 'abuselaw',
 'acquireasset',
 'negotiate',
 'problems

In [19]:
df, classifications = filter_relevant_classifications(df, all_classifications, relevant_classifications)
params["WARMUP_STEPS"] = int(len(df) * params["EPOCHS"] * 0.1)  # 10% of train data


In [49]:
big_consulting_df['replace']


0       #COMPANY has announced a partnership with #COM...
1       #COMPANY is delighted to expand our collaborat...
2       For example, working with #COMPANY to build a ...
3       #COMPANY is not obligated to provide any mater...
4       Within weeks, #COMPANY's #JOBTITLE said it was...
                              ...                        
2739    #COMPANY announced its acquisition of myInvenio,.
2740    #COMPANY and #COMPANY         announced the la...
2741    #COMPANY today announced a partnership between...
2742    While #COMPANY recently launched a new push to...
2743    #COMPANY makes up #PERCENT of #COMPANY's portf...
Name: replace, Length: 2744, dtype: object

In [38]:
ai_news_df = collect_classification_labels(ai_news_df)
ai_news_df = get_top_values(ai_news_df, params)
ai_news_all_classifications, ai_news_relevant_classifications = get_relevant_classifications(ai_news_df, params)
ai_news_df, ai_news_classifications = filter_relevant_classifications(ai_news_df, ai_news_all_classifications, ai_news_relevant_classifications)

# How about duplicating car news dataset?
car_news_df = collect_classification_labels(car_news_df)
car_news_df = get_top_values(car_news_df, params)
car_news_all_classifications, car_news_relevant_classifications = get_relevant_classifications(car_news_df, params)
car_news_df, car_news_classifications = filter_relevant_classifications(car_news_df, car_news_all_classifications, car_news_relevant_classifications)

big_consulting_df = collect_classification_labels(big_consulting_df)
big_consulting_df = get_top_values(big_consulting_df, params)
big_consulting_all_classifications, big_consulting_relevant_classifications = get_relevant_classifications(big_consulting_df, params)
big_consulting_df, big_consulting_classifications = filter_relevant_classifications(big_consulting_df, big_consulting_all_classifications, big_consulting_relevant_classifications)

params["WARMUP_STEPS"] = int(len(car_news_df) * params["EPOCHS"] * 0.1)

In [39]:
# model_fit, test_evaluator, sbert_model = train(df, classifications, params, dataset_dir, sbert_model)
model_fit, test_evaluator, sbert_model = train([ai_news_df, car_news_df], [ai_news_classifications, car_news_classifications], params, dataset_dir, sbert_model)

# Accuracy Cosine Distance
# baseline - 0.77
# e=2 Using original data - 0.83
# BEST e=2 Using original data, unfreezing 2 last layers, only including words that occur at least 2 times - 0.93
# e=2 Using original data, unfreezing 2 last layers, only including words that occur at least 3 times - 0.93
# e=2 Using original data, unfreezing 2 last layers, only including words that occur at least 4 times - 0.92
# e=1 Using original data, unfreezing 2 last layers, starting with brjezierski/S3BERT, only including words that occur at least 2 times - 0.89
# e=2 Using original data, unfreezing 2 last layers, starting with brjezierski/S3BERT, only including words that occur at least 2 times - 0.92

# e=2 Using original and replacement data, unfreezing 2 last layers, excluding Entity and Other labels, starting with brjezierski/S3BERT, only including words that occur at least 2 times - 0.91
# e=2 Using original and replacement data, unfreezing 2 last layers, starting with brjezierski/S3BERT, only including words that occur at least 2 times - 0.88
# e=1 Using original and replacement data, unfreezing 2 last layers, excluding Entity and Other labels, starting with brjezierski/S3BERT, only including words that occur at least 2 times - 0.92

# e=2 Using original data, unfreezing 2 last layers, starting with intfloat/e5-small-v2, only including words that occur at least 2 times - 0.91
# e=2 Using original and replacement data, unfreezing 2 last layers, excluding Entity and Other labels, starting with intfloat/e5-small-v2, only including words that occur at least 2 times - 0.89

# e=2 Using replacement data, creating a new train-dev-test split, starting with brjezierski/S3BERT, only including words that occur at least 2 times - 0.96, 0.88 (baseline)
# e=1 Using replacement data, creating a new train-dev-test split, starting with brjezierski/S3BERT, only including words that occur at least 2 times - 0.94
# e=2 Using replacement data, creating a new train-dev-test split, excluding Entity and Other labels, starting with brjezierski/S3BERT, only including words that occur at least 2 times - 0.94
# e=1 Using replacement data, creating a new train-dev-test split, excluding Entity and Other labels, starting with brjezierski/S3BERT, only including words that occur at least 2 times - 0.96

# e=1 Using replacement data, unfreezing 2 last layers, excluding Entity and Other labels, only including words that occur at least 2 times, training data size 67325+38639

Training with multiple objectives
e=4 Using replacement data, creating a new train-dev-test split, unfreezing 2 last layers, excluding Entity and Other labels, only including words that occur at least 2 times, training data size66714

Training data size 66714
Validation data size 100
Test data size 100
e=4 Using replacement data, creating a new train-dev-test split, unfreezing 2 last layers, excluding Entity and Other labels, only including words that occur at least 2 times, training data size38034

Training data size 38034
Validation data size 100
Test data size 100
Performance before fine-tuning:


Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1188 [00:00<?, ?it/s]

Score at epoch 0, step -1: 0.71


Iteration:   0%|          | 0/1188 [00:00<?, ?it/s]

Score at epoch 1, step -1: 0.81


Iteration:   0%|          | 0/1188 [00:00<?, ?it/s]

Score at epoch 2, step -1: 0.85


Iteration:   0%|          | 0/1188 [00:00<?, ?it/s]

Score at epoch 3, step -1: 0.85


In [42]:
sbert_model.evaluate(test_evaluator)

0.82

In [43]:
frozen_model = load_model()
frozen_model.evaluate(test_evaluator)

0.8

In [None]:
# 0.93 0.77

In [65]:
# Just the original one
big_consulting_df = get_big_consulting_df(params)
df = collect_classification_labels(big_consulting_df, verbose=True)
df = get_top_values(df, params)
all_classifications, relevant_classifications = get_relevant_classifications(df, params)
df, classifications = filter_relevant_classifications(df, all_classifications, relevant_classifications)
params["WARMUP_STEPS"] = int(len(df) * params["EPOCHS"] * 0.1)  # 10% of train data
model_fit, test_evaluator, sbert_model = train(df, classifications, params, dataset_dir, sbert_model)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column].loc[df[column].isnull()] = df[column].loc[df[column].isnull()


['Markets', 'Layoff', 'Monetary', 'Using Products', 'External Hirings', 'Employee Growth', 'Product Growth', 'Corporate Reports', 'Being used', 'Ownerships', 'Coworks', 'Successes', 'Critics', 'Publishing Numbers', 'Unit Acquistion', 'Being sold', 'Events', 'Mandate service', 'Activities', 'Insolvency', 'Resellers', 'Product Investments', 'New Employees', 'Rights', 'IP Penalties', 'Employee Careers', 'Technology', 'Product Advertising', 'Competition', 'Products', 'Investments', 'Charity', 'Procure Supply', 'Facility Launches', 'Celebrations', 'Market Coverage', 'Promotions', 'Acquired', 'Launches', 'Leadership', 'Operations', 'Layoff Staff', 'Acquistions', 'Product Problems', 'Employee Misconduct', 'Launched Operations', 'Founding', 'Service Agreements', 'Financial Problems', 'Deals', 'Grow', 'Employees', 'Fighting Abuse', 'Failed Competence', 'Using', 'Staffing', 'Milestones', 'Hosting Events', 'Product Stop', 'Company Info', 'Market Share', 'Penalties', 'Operational Regions', 'Patent

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/97 [00:00<?, ?it/s]

Score at epoch 0, step -1: 0.8


Iteration:   0%|          | 0/97 [00:00<?, ?it/s]

Score at epoch 1, step -1: 0.89
