In [6]:
import pickle
import pandas as pd
import plotly.offline as pyo
import plotly.express as px
import dimensionality_reduction
import json 
import random
from numpy import dot
from numpy.linalg import norm
import numpy as np
from utils import aggregate_embeddings, cosine_similarity, print_similarity_samples, replace_nan_with, load_model, load_big_consulting_export, split_into_list
from utils import load_ai_news
from tqdm import tqdm

tqdm.pandas()

In [7]:
def label_data(df):
    data_list = []
    for index, row in tqdm(df.iterrows()):
        context = row['snippet']

        qas_list = []

        def get_qas_item(question, company):
            answer_found = False
            if company is None:
                return {
                    'id': str(index).zfill(5),
                    'is_impossible': True,
                    'question': question,
                    'answers': []
                }
            company_words = company.split()

            for i in range(len(company_words), 0, -1):
                company_part = ' '.join(company_words[:i])
                if company_part in context:
                    answer_found = True
                    answer_text = company_part
        #             answer_start = context.find(answer_text)
                    break

            if not answer_found:
                answer_text = company_words[0]
            answer_start = context.find(answer_text)

            qas_item = {
                'id': str(index).zfill(5),
                'is_impossible': False,
                'question': question,
                'answers': [{
                    'text': answer_text,
                    'answer_start': answer_start
                }]
            }


            return qas_item

        # If 'relationEntityType' is 'company', remove the value in 'relationEntity'
        df.at[index, 'relationEntity'] = None


        qas_item = get_qas_item("What is the company's name?", row['company'])
        if row['relationEntityType'] == 'company' and not pd.isnull(row['relationEntity']):
            qas_item_2 = get_qas_item("What is the other company's name?", row['relationEntity'])
        else:
            qas_item_2 = get_qas_item("What is the other company's name?", None)
        qas_list.append(qas_item)
        qas_list.append(qas_item_2)

        data_item = {
            'context': context,
            'qas': qas_list
        }

        data_list.append(data_item)
    return data_list

big_consulting_df = load_big_consulting_export()
data_list = label_data(big_consulting_df)
data_list[:10]

12063it [00:00, 34767.50it/s]


[{'context': 'at KPMG where he focused on complex financial matters and financial trading, investment banking and lending businesses',
  'qas': [{'id': '00000',
    'is_impossible': False,
    'question': "What is the company's name?",
    'answers': [{'text': 'KPMG', 'answer_start': 3}]},
   {'id': '00000',
    'is_impossible': True,
    'question': "What is the other company's name?",
    'answers': []}]},
 {'context': 'LatentView has been recognized as an industry leader by Gartner',
  'qas': [{'id': '00001',
    'is_impossible': False,
    'question': "What is the company's name?",
    'answers': [{'text': 'Gartner', 'answer_start': 56}]},
   {'id': '00001',
    'is_impossible': True,
    'question': "What is the other company's name?",
    'answers': []}]},
 {'context': 'In his last role, Prashant was playing the role of Portfolio & Regional Head at Infosys, as a key leader for their Communications, Media and Entertainment business',
  'qas': [{'id': '00002',
    'is_impossible': 

In [8]:
random.seed(42)
random.shuffle(data_list)
test_data = data_list[:100]
val_data = data_list[100:200]
train_data = data_list[200:]

In [9]:
import json
import logging
from simpletransformers.question_answering import QuestionAnsweringModel, QuestionAnsweringArgs
 
model_type="bert"
model_name= "bert-base-cased"
 
train_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "use_cached_eval_features": True,
#     "output_dir": f"outputs/{model_type}",
#     "best_model_dir": f"{output_dir}/{model_type}/best_model",
    "evaluate_during_training": True,
    "max_seq_length": 128,
    "num_train_epochs": 2,
    "evaluate_during_training_steps": 100,
    "save_model_every_epoch": False,
    "save_eval_checkpoints": False,
    "n_best_size":8,
    "train_batch_size": 16,
    "eval_batch_size": 16
}
model = QuestionAnsweringModel(model_type,model_name, args=train_args, use_cuda=False)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# https://thinkinfi.com/fine-tune-bert-for-question-answering/
model.train_model(train_data, eval_data=test_data)
result, texts = model.eval_model(test_data)

convert squad examples to features:   0%|                                     | 0/23726 [00:00<?, ?it/s]Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'Gartner'
Could not find answer: '' vs. 'America'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'Fnts'
Could not find answer: '' vs. 'Aon'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'Gartner'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'Fti'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'Fti'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'PricewaterhouseCoo

Could not find answer: '' vs. 'Pcep'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'Fti'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. '41st'
Could not find answer: '' vs. 'Infosys'
Could not find answer: '' vs. 'Infosys'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'Ernst'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'Auditclub'
Could not find answer: '' vs. 'Fti'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'LW'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'Tata'
Co

Could not find answer: '' vs. 'Ernst'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'Xjera'
Could not find answer: '' vs. 'Fti'
Could not find answer: '' vs. 'Aon'
Could not find answer: '' vs. 'Infosys'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'Globalfoundries'
Could not find answer: '' vs. 'Tata'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'Tata'
Could not find answer: '' vs. 'Ey'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'Tata'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'Omron'
Could not find answer: '' vs. 'P

Could not find answer: '' vs. 'Linuxone'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'Tata'
Could not find answer: '' vs. 'SAS-ALC'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'Aon'
Could not find answer: '' vs. 'Booz'
Could not find answer: '' vs. 'Ernst'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'Infosys'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'Wipro'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'Fti'
Could not find answer: '' vs. 'Ernst'
Could not find answer: '' vs. 'Fti'
Could not find answer: '' vs. 'Ernst'
Could not find answer: '' vs. 'Fti'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'Tata'
Could not find answer: '' vs. 'Fti'
Could not find answer: '' vs. 'PricewaterhouseCoopers'


Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'Ey'
Could not find answer: '' vs. 'AlixPartners'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'Ey'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'Aon'
Could not find answer: '' vs. 'Aon'
Could not find answer: '' vs. 'Bain'
Could not find answer: '' vs. 'Tata'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'Aon'
Could not find answer: '' vs. 'Tata'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'Ernst'
Could not find answer: '' vs. 'Ernst'
Could not find answer: '' vs. 'Fti'
Could not find answer: '' vs. 'Aon'
Could not find answer: '' vs. 'Fti'
Could not find answer: '' vs. 'Tata'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could n

Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'Fti'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'Aon'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'Tata'
Could not find answer: '' vs. 'Ernst'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'Ernst'
Could not find answer: '' vs. 'Tata'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'E&Y'
Could not find answer: '' vs. 'Ernst'
Could not find answer: '' vs. 'Aon'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs.

Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'AlixPartners'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'Aon'
Could not find answer: '' vs. 'Gartner'
Could not find answer: '' vs. 'Tata'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'Tata'
Could not find answer: '' vs. 'Gartner'
Could not find answer: '' vs. 'Ernst'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'Tata'
Could not find answer: '' vs. 'Ernst'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'Ey'
Could not find answer: '' vs. 'Tata'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'Wal-Mart.com'
Coul

Could not find answer: '' vs. 'Tata'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'America'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'Aon'
Could not find answer: '' vs. 'Fti'
Could not find answer: '' vs. 'Tata'
Could not find answer: '' vs. 'Tata'
Could not find answer: '' vs. 'Xjera'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'Fti'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'Ernst'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'Ernst'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'Ernst'
Could not find answer: '

Could not find answer: '' vs. 'Aon'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'Global'
Could not find answer: '' vs. 'Fti'
Could not find answer: '' vs. 'Ernst'
Could not find answer: '' vs. 'Tata'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'Tata'
Could not find answer: '' vs. 'Aon'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'Aon'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
Could not find answer: '' vs. 'IBM'
Could not find answer: '' vs. 'PricewaterhouseCoopers'
convert squad examples to features: 100%|███████████████████████| 23726/23726 [00:04<00:00, 5661.09it/s]
add example index and unique id: 100%|███████████████████████| 23726/23726 [00:00<00:00, 1884200.64it/s]


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/1336 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/1336 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

In [11]:
result, texts

({'correct': 87,
  'similar': 12,
  'incorrect': 1,
  'eval_loss': -7.92989103610699},
 {'correct_text': {'04165': '',
   '05680': '',
   '08171': '',
   '07163': '',
   '05207': '',
   '13343': '',
   '10926': '',
   '11356': '',
   '10008': '',
   '11150': '',
   '02421': '',
   '13115': '',
   '06551': '',
   '09305': '',
   '06138': '',
   '01814': '',
   '11872': '',
   '00196': '',
   '11553': '',
   '05293': '',
   '08185': '',
   '04640': '',
   '13350': '',
   '07424': '',
   '08832': '',
   '00238': '',
   '10940': '',
   '04952': '',
   '07046': 'Threadgill Financial LLC',
   '10653': '',
   '04653': '',
   '09520': '',
   '07879': '',
   '12744': '',
   '03979': '',
   '04821': '',
   '11664': '',
   '06939': '',
   '09433': '',
   '01889': '',
   '08702': '',
   '03244': '',
   '11501': '',
   '09493': '',
   '11981': '',
   '10096': '',
   '11574': '',
   '04968': '',
   '12465': '',
   '11021': '',
   '12859': '',
   '09284': '',
   '01905': '',
   '07100': '',
   '02122

In [12]:
def get_inference_data(df):
    data_list = []
    for index, row in tqdm(df.iterrows()):
        context = row['snippet']

        qas_list = []

        def get_qas_inference_item(question):

            qas_item = {
                'id': str(index).zfill(5),
                'question': question,
            }

            return qas_item

        qas_item = get_qas_inference_item("What is the company's name?")
        qas_item_2 = get_qas_inference_item("What is the other company's name?")
        qas_list.append(qas_item)
        qas_list.append(qas_item_2)

        data_item = {
            'context': context,
            'qas': qas_list
        }

        data_list.append(data_item)
    return data_list

news_df = load_ai_news()
inference_data = get_inference_data(news_df)
inference_data[:10]

262523it [00:06, 40508.87it/s]


[{'context': '( Get Free Report CS Disco, Inc provides cloud-native and 1-powered legal solutions for ediscovery, legal document review, and case management for enterprises, law firms, legal services providers, and governments.',
  'qas': [{'id': '00000', 'question': "What is the company's name?"},
   {'id': '00000', 'question': "What is the other company's name?"}]},
 {'context': 'Some additional features SoMee offers include an 1 platform that offers 1 services for voiceovers, images, c',
  'qas': [{'id': '00001', 'question': "What is the company's name?"},
   {'id': '00001', 'question': "What is the other company's name?"}]},
 {'context': 'Exscientia plc, an 1-driven pharmatech company, engages in discovering, designing, and developing drugs.',
  'qas': [{'id': '00002', 'question': "What is the company's name?"},
   {'id': '00002', 'question': "What is the other company's name?"}]},
 {'context': "But what allows CrowdStrike to stand apart from its peers is Falcon , the company's clo

In [13]:
# Load model from training checkpoint
# from simpletransformers.question_answering import QuestionAnsweringModel, QuestionAnsweringArgs

# model = QuestionAnsweringModel("bert", "outputs/bert/best_model")


# Make predictions with the model
to_predict = [
    {
        "context": "Samsung Galaxy M14 5G (Smoky Teal, 6GB, 128GB Storage) | 50MP Triple Cam | 6000 mAh Battery | 5nm Octa-Core Processor | 12GB RAM with RAM Plus | Android 13 | Without Charger",
        "qas": [
            {
                "question": "What is the model name of the Samsung smartphone?",
                "id": "0",
            }
        ],
    }
]

answers, probabilities = model.predict(inference_data, n_best_size=None)
print(answers[:10])
# Run at night

convert squad examples to features: 100%|█████████████████████| 525046/525046 [01:55<00:00, 4549.81it/s]
add example index and unique id: 100%|█████████████████████| 525046/525046 [00:00<00:00, 1604751.83it/s]


Running Prediction:   0%|          | 0/32892 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [55]:
import pandas as pd

# Read the original file
input_file_path = '../glanos-data/clustering/big_consulting_export_replace.tsv'
df = pd.read_csv(input_file_path, sep='\t')

# Drop duplicates based on the 'replace' column
df_no_duplicates = df.drop_duplicates(subset='replace')

# Save the DataFrame to a new file
output_file_path = '../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv'
df_no_duplicates.to_csv(output_file_path, sep='\t', index=False)
