# WikiNER Project

### Abdullah Aljbab
### Adele Haghighat Hoseini
### Fulayjan Alanazi 

In [30]:
#@title Requirements
!pip install -r requirements.txt
# !pip install torch
# !pip install numpy==1.16.0
# !pip install mxnet-cu92
# !pip install mxnet
# !pip install TensorFlow
# !pip install bert-embedding
# !pip install transformers
# !pip install tqdm
# !pip install pandas
# !pip install itertools
# !pip install operator
# !pip install sklearn.metrics
# !pip install nltk
# !pip install argparse
# !pip install spacy
# !pip install zmq
# !pip install contextlib
# !pip install pathlib
# !pip install re
# !pip install subprocess
# !pip install ast

In [31]:
#@title Importing Libraries
import json
import csv
import urllib.request
from tqdm import tqdm
import pandas as pd
import itertools
from operator import itemgetter
from sklearn import metrics 
from sklearn.metrics import f1_score
import nltk
import os
import argparse
import time

import numpy as np
import torch
from transformers import BertTokenizerFast, BertForTokenClassification
from torch.utils.data import DataLoader
from torch.optim import SGD
# from bert_embedding import BertEmbedding

from transformers import BertTokenizer
from transformers import BertModel

import spacy
import torch
from transformers import BertModel

# from zmq.constants import NULL
# from contextlib import nullcontext

from pathlib import Path
import re

In [32]:
#@title Process
import subprocess
from ast import literal_eval

def run(command):
    """Run `command` through the shell and print what it wrote to stdout.

    Only stdout is captured; stderr is not piped. The captured bytes are
    decoded as UTF-8 and surrounding whitespace is stripped before printing.
    The function returns None.
    """
    completed = subprocess.run(command, shell=True, stdout=subprocess.PIPE)
    text = completed.stdout.decode('utf-8')
    print(text.strip())

# Print a short hardware / OS report for the machine running the notebook.
# Each entry is (heading to print, shell commands whose output follows it).
_REPORT_SECTIONS = [
    ('# CPU', [
        'cat /proc/cpuinfo | egrep -m 1 "^model name"',
        'cat /proc/cpuinfo | egrep -m 1 "^cpu MHz"',
        'cat /proc/cpuinfo | egrep -m 1 "^cpu cores"',
    ]),
    ('# RAM', [
        'cat /proc/meminfo | egrep "^MemTotal"',
    ]),
    ('# GPU', [
        'lspci | grep VGA',
    ]),
    ('# OS', [
        'uname -a',
    ]),
]

for _heading, _commands in _REPORT_SECTIONS:
    print(_heading)
    for _command in _commands:
        run(_command)

# CPU
model name	: Intel(R) Xeon(R) CPU @ 2.20GHz
cpu MHz		: 2199.998
cpu cores	: 2
# RAM
MemTotal:       26690640 kB
# GPU

# OS
Linux fe94ba42868e 5.10.133+ #1 SMP Fri Aug 26 08:44:51 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux


In [None]:
#@title Variables

# Download the gzip-compressed qrank.csv and unpack it into the working
# directory (shell commands executed by IPython, not Python statements).
!wget https://qrank.wmcloud.org/download/qrank.csv.gz
!gunzip qrank.csv.gz

# CSV read by ReadingCSV() and CSV written by ToOutput() in the main cell.
input_csv_path = 'qrank.csv'
output_csv_path = 'output.csv'

# CoNLL-formatted data files; only CoNLL_train is used by the visible main cell.
CoNLL_train='train.txt'
CoNLL_valid='valid.txt'
CoNLL_test='test.txt'

# Pretrained BERT model (configured to also return hidden states) and its tokenizer.
# NOTE(review): neither `model` nor `tokenizer` is referenced by the cells visible
# here; get_contextual_embeddings() loads its own model — confirm they are needed.
model = BertModel.from_pretrained('bert-base-uncased',output_hidden_states = True,)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [34]:
#@title Reading CSV
def ReadingCSV(iPath,iterations):
  """Read the first two columns of up to `iterations` data rows from a CSV file.

  The first row of the file is treated as a header and skipped.

  Args:
    iPath: path of the CSV file to read.
    iterations: maximum number of data rows to read after the header.

  Returns:
    A tuple `(first_column, second_column)` of two lists of strings, one
    entry per row read.

  Raises:
    IndexError: if a row that is read has fewer than two columns.
  """
  first_column, second_column = [], []
  with open(iPath, newline='') as handle:
    reader = csv.reader(handle)
    # Start at index 1 to skip the header; stop after `iterations` data rows.
    data_rows = itertools.islice(reader, 1, iterations + 1)
    for record in data_rows:
      first_column.append(record[0])
      second_column.append(record[1])
  return first_column, second_column

In [35]:
#@title wiki API: Returning List
def wikiAPI(Qn,iterations):
  """Fetch the English label of the first `iterations` entity ids from Wikidata.

  One `wbgetentities` request is issued per id, in order.

  Args:
    Qn: sequence of entity id strings (e.g. 'Q42').
    iterations: number of leading ids from `Qn` to look up.

  Returns:
    A list with one English label string per looked-up id, in the same order
    as `Qn`.

  Raises:
    IndexError: if `Qn` has fewer than `iterations` items.
    KeyError: if a response does not contain an English label for the id.
    urllib.error.URLError: if a request fails.
  """
  base_url = 'https://www.wikidata.org/w/api.php?action=wbgetentities&props=labels&ids='
  lis=[]
  for i in range(iterations):
    entity_id = Qn[i]
    url = base_url + entity_id + '&languages=en&format=json'
    # Close each HTTP response deterministically; with hundreds of sequential
    # requests, leaving them to the garbage collector leaks open connections.
    with urllib.request.urlopen(url) as response:
      payload = json.loads(response.read())
    lis.append(payload['entities'][entity_id]['labels']['en']['value'])

  return lis

In [36]:
#@title BIO

def BIO(file_path):
    """Parse a whitespace-separated, blank-line-delimited annotation file.

    The file is split into documents on blank lines (optionally containing a
    single tab). Every line of a document is split on whitespace; the first
    field is taken as the token and the fourth field as its label.

    Args:
        file_path: path (str or Path) of the file to read.

    Returns:
        A tuple `(token_docs, tag_docs, lable_docs)` of three parallel lists,
        one inner list per document:
          * token_docs - the first field of each line,
          * tag_docs   - the first character of the fourth field
                         (e.g. 'B' for 'B-ORG'),
          * lable_docs - the full fourth field.

    Raises:
        IndexError: if a line has fewer than four whitespace-separated fields.
    """
    contents = Path(file_path).read_text().strip()

    token_docs, tag_docs, lable_docs = [], [], []
    for block in re.split(r'\n\t?\n', contents):
        rows = [row.split() for row in block.split('\n')]
        token_docs.append([fields[0] for fields in rows])
        tag_docs.append([fields[3][:1] for fields in rows])
        lable_docs.append([fields[3] for fields in rows])

    return token_docs, tag_docs, lable_docs

In [37]:
#@title Exporting Results: ToOutput
def ToOutput(fileName,Col1,Col2,Col3,iterations):
  """Write the first `iterations` entries of three parallel columns to a CSV file.

  The file is (over)written as UTF-8 with the header row
  `Qnumber,Qrank,Lable`, followed by one row per index.

  Args:
    fileName: path of the CSV file to create.
    Col1: values for the 'Qnumber' column.
    Col2: values for the 'Qrank' column.
    Col3: values for the 'Lable' column.
    iterations: number of rows to write.

  Raises:
    IndexError: if any column has fewer than `iterations` items.
  """
  fieldnames = ['Qnumber','Qrank','Lable']
  # newline='' lets the csv module control line endings itself; without it,
  # platforms that translate '\n' on write end every row with '\r\r\n'.
  with open(fileName, 'w', encoding="UTF-8", newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for row_index in range(iterations):
      writer.writerow({
          'Qnumber': Col1[row_index],
          'Qrank': Col2[row_index],
          'Lable': Col3[row_index],
      })

In [38]:
#@title Test on CoNLL
def test_on_CoNLL(sentences, ground_truth_tag_sequences, NE_list, scheme="BIO"):
  """Average the per-sentence score of the list-based tagger over `sentences`.

  Each sentence is tagged with brutal_force_NER() and the predicted tags are
  scored against the matching ground-truth sequence with a_judge_function().

  Args:
    sentences: sentences to tag, each a single string.
    ground_truth_tag_sequences: one reference tag sequence per sentence.
    NE_list: named-entity strings handed to the tagger.
    scheme: forwarded unchanged to brutal_force_NER().

  Returns:
    The sum of the per-sentence scores divided by `len(sentences)`.

  Raises:
    ZeroDivisionError: if `sentences` is empty.
  """
  total_score = 0
  for text, gold_tags in zip(sentences, ground_truth_tag_sequences):
      predicted_tags = brutal_force_NER(text, NE_list, scheme=scheme)
      total_score = total_score + a_judge_function(predicted_tags, gold_tags)

  sentence_count = len(sentences)
  return total_score / sentence_count

In [39]:
#@title Brutal Force NER: Constructing [B-I-O]

# from zmq.constants import NULL
# from contextlib import nullcontext

def brutal_force_NER(sentence, NE_list, scheme="BIO"):
    """Tag the tokens of `sentence` by looking up entries of `NE_list`.

    The sentence is split on whitespace. For every entity in `NE_list`
    (in list order) each occurrence whose words line up with whole tokens is
    tagged 'B' on its first token and 'I' on the following ones. Tokens that
    already carry a tag from an earlier entity are never re-tagged, so entities
    listed first take priority. All other tokens are tagged 'O'.

    Matching is done on token positions rather than by rewriting the sentence
    text, so:
      * the result always has exactly one tag per whitespace-separated token;
      * literal tokens such as "B" in the text are not mistaken for tags;
      * an entity matching only part of a word (e.g. "New York" inside
        "New Yorker") does not produce a tag.

    Args:
        sentence: the sentence as one string with whitespace-separated tokens.
        NE_list: iterable of named-entity strings; order defines priority.
        scheme: accepted for interface compatibility; only the B/I/O tag set
            is produced regardless of its value.

    Returns:
        A list of 'B' / 'I' / 'O' strings, one per token of `sentence`.
    """
    tokens = sentence.split()
    tags = ['O'] * len(tokens)

    for NE in NE_list:
        # Cheap substring gate: most entities do not occur in the sentence at
        # all, so skip the token-level scan for them.
        if NE not in sentence:
            continue

        ne_tokens = NE.split()
        width = len(ne_tokens)
        if width == 0:
            # Empty / whitespace-only entity: nothing to tag.
            continue

        untagged = ['O'] * width
        last_start = len(tokens) - width
        start = 0
        while start <= last_start:
            stop = start + width
            if tags[start:stop] == untagged and tokens[start:stop] == ne_tokens:
                tags[start] = 'B'
                tags[start + 1:stop] = ['I'] * (width - 1)
                start = stop  # occurrences of the same entity do not overlap
            else:
                start += 1

    return tags

In [40]:
#@title Judge Function: Returning F1 score
def a_judge_function(predicted_tag_seq, grouth_truth_tag_seq):
    """Score one predicted tag sequence against its reference.

    Args:
        predicted_tag_seq: tags produced by the tagger.
        grouth_truth_tag_seq: reference tags for the same tokens.

    Returns:
        The weighted-average F1 score computed by sklearn's `f1_score`.
    """
    return f1_score(
        y_true=grouth_truth_tag_seq,
        y_pred=predicted_tag_seq,
        average="weighted",
    )

In [41]:
#@title Get Contextual Embeddings
def _load_bert_model():
  """Return the shared 'bert-base-uncased' model, loading it on first use."""
  cached = getattr(_load_bert_model, '_cached_model', None)
  if cached is None:
    cached = BertModel.from_pretrained('bert-base-uncased')
    # Evaluation mode, so the forward pass does not apply training-only behaviour.
    cached.eval()
    _load_bert_model._cached_model = cached
  return cached


def get_contextual_embeddings(tokens):
  """Run BERT over one token sequence and return its contextual embeddings.

  The pretrained model is loaded once and reused on later calls instead of
  being re-read from disk on every invocation.

  Args:
    tokens: a flat sequence of numbers that `torch.tensor` can convert;
      presumably vocabulary ids produced by a BERT tokenizer - TODO confirm
      against callers.

  Returns:
    The first element of the model output for a batch containing only
    `tokens` (for `BertModel` this is the last hidden state).
  """
  bert = _load_bert_model()

  # Wrap in a list to add the batch dimension expected by the model.
  tokens_tensor = torch.tensor([tokens])

  # No gradients are needed for pure feature extraction.
  with torch.no_grad():
    outputs = bert(tokens_tensor)
    embeddings = outputs[0]

  return embeddings

In [None]:
#@title Main
# Sizes of the top-N entity lists to evaluate, one full run per size.
loop_N = [100,1000,1500] # we could not run it with more than 2000
if __name__ == '__main__':
  # Iterate over the sizes directly; the previous version indexed with `i` and
  # then reused `i` as the inner sentence-loop variable, shadowing the outer one.
  for top_N in loop_N:
      start = time.time()

      # First top_N data rows of the input CSV and their English labels.
      Qnumber,Qrank=ReadingCSV(input_csv_path,top_N)
      Wiki_NE_List = wikiAPI(Qnumber, top_N)

      # The tagger works on whole-sentence strings, so re-join each token list.
      token_docs, GTTS, lables = BIO(CoNLL_train)
      Sents = [' '.join(words) for words in token_docs]

      ToOutput(output_csv_path,Qnumber,Qrank,Wiki_NE_List,top_N)

      # Score with the entity list in its original order.
      print ('Befor sorting: ',len(Wiki_NE_List),' of Wiki_NE_List')
      CoNLL= test_on_CoNLL(Sents,GTTS,Wiki_NE_List)
      print('Score:',"{:.2%}".format(CoNLL))

      # Score again with the longest entities first, so longer entities are
      # tried before shorter ones.
      Wiki_NE_List.sort(key=len, reverse=True)
      print ('\nAfter sorting: ',len(Wiki_NE_List),' of Wiki_NE_List')
      CoNLL= test_on_CoNLL(Sents,GTTS,Wiki_NE_List)
      print('Score:',"{:.2%}".format(CoNLL))

      elapsed_time = time.time() - start
      print('\nExecution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
      print('------------------------------------------------\n')