# COLAB_LabelledDataAnalysis



This is a notebook to prepare the labelled token dataset for HuggingFace.

## 1. Installs and Imports

In [50]:
# !pip install datasets
# !pip install transformers
# !pip install s3fs
# !pip install boto3
# !pip install sagemaker

In [85]:
import os
import json
import s3fs
import boto3
import sagemaker 
import transformers
import numpy as np
import pandas as pd
from ast import literal_eval
from collections import Counter
from datasets import load_dataset
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)

## 2. Permissions

In [3]:
system = "COLAB" #["AWS", "COLAB"]

In [4]:
if system=="AWS":
    fs = s3fs.S3FileSystem()    
    s3_bucket = "govuk-data-infrastructure-integration"
    DATA_DIR = f's3://{s3_bucket}/model-data/govner-data'
    for f in fs.ls(DATA_DIR):
        print(f)
    #Manage interactions with the Amazon SageMaker APIs and any other AWS services needed.
    # sagemaker session bucket -> used for uploading data, models and logs
    # sagemaker will automatically create this bucket if it not exists
    sess = sagemaker.Session() 
    sagemaker_session_bucket= s3_bucket
    if sagemaker_session_bucket is None and sess is not None:
        # set to default bucket if a bucket name is not given
        sagemaker_session_bucket = sess.default_bucket()
        
    role = sagemaker.get_execution_role()
    sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

    print(f"sagemaker role arn: {role}")
    print(f"sagemaker bucket: {sess.default_bucket()}")
    print(f"sagemaker session region: {sess.boto_region_name}")
elif system=="COLAB":
    from google.colab import drive
    drive.mount("/content/gdrive")
    #DATA_DIR = os.path.join("/content/gdrive/My Drive", "NER/Data")
    DATA_DIR = os.path.join("/content/gdrive/Shareddrives/", "GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data")


Mounted at /content/gdrive


In [5]:
DATA_DIR

'/content/gdrive/Shareddrives/GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Data'

## 3. Load Dataset

In [6]:
# s3 key prefix for the data
dataset_name = 'line_by_line_NER_data_combined.csv'

dataset_path = f'{DATA_DIR}/{dataset_name}'

In [7]:
df = pd.read_csv(dataset_path, sep='\t')

In [106]:
df.shape

(347216, 9)

In [105]:
df.head()

Unnamed: 0,text,text_token,labels,updated,original_labels,base_path,sampled,label_list,original_file
0,This is known as amount C. Add together : Do n...,"[This, is, known, as, amount, C., Add, togethe...","[[17, 23, FINANCE], [61, 66, FINANCE], [74, 79...",,,/guidance/pension-schemes-value-your-pension-f...,True,"[O, O, O, O, FINANCE, O, O, O, O, O, O, O, O, ...",line_by_line_NER_data_sampled_09062020_more_en...
1,"DWP is responsible for welfare , pensions and ...","[DWP, is, responsible, for, welfare, ,, pensio...","[[0, 3, ORGANIZATION], [33, 41, FINANCE], [46,...",,,/guidance/contact-the-department-for-work-and-...,True,"[ORGANIZATION, O, O, O, O, O, FINANCE, O, PERS...",line_by_line_NER_data_sampled_12062020_more_en...
2,Initial teacher training ( ITT ) : accreditati...,"[Initial, teacher, training, (, ITT, ), :, acc...","[[8, 15, PERSON], [16, 24, STATE]]",,,/guidance/initial-teacher-training-itt-accredi...,True,"[O, PERSON, STATE, O, O, O, O, O, O, O, O]",line_by_line_NER_data_sampled_12062020_more_en...
3,All procured projects are expected to be compl...,"[All, procured, projects, are, expected, to, b...","[[41, 50, STATE], [57, 62, DATE]]",,,/guidance/china-prosperity-fund-bidding-round,True,"[O, O, O, O, O, O, O, STATE, O, O, DATE, O, O]",line_by_line_NER_data_sampled_09062020_more_en...
4,Excise Notice 2002 section 14 gives further in...,"[Excise, Notice, 2002, section, 14, gives, fur...","[[14, 18, DATE], [44, 55, CONTACT], [62, 67, O...",,,/guidance/the-alcohol-wholesaler-registration-...,True,"[O, O, DATE, O, O, O, O, CONTACT, O, ORGANIZAT...",line_by_line_NER_data_sampled_12062020_more_en...


Literal eval

In [10]:
for i in ['text_token', 'labels', 'label_list']:
  df[i] = df[i].apply(lambda x: literal_eval(x))

In [11]:
print(df['text_token'][0])
print(df['text_token'][0][0])

['This', 'is', 'known', 'as', 'amount', 'C.', 'Add', 'together', ':', 'Do', 'not', 'include', 'the', 'value', 'of', 'any', 'death', 'benefit', 'in', 'this', 'calculation', '.']
This


## Labelled Counts

Now, get the labelled counts for each row.

In [13]:
with open(f'{DATA_DIR}/new_label_map.json') as f:
  data = json.load(f)

label_map = data

In [16]:
labels = list(label_map.keys())

In [55]:
labels

['O',
 'CONTACT',
 'DATE',
 'EVENT',
 'FINANCE',
 'FORM',
 'LOCATION',
 'MISC',
 'MONEY',
 'ORGANIZATION',
 'PERSON',
 'SCHEME',
 'STATE']

In [66]:
test_labs = ['O', 'O', 'FINANCE', 'PERSON', 'O', 'O', 'FINANCE', 'O']

In [101]:
df_mini = df.head(5)
df_mini

Unnamed: 0,text,text_token,labels,updated,original_labels,base_path,sampled,label_list,original_file
0,This is known as amount C. Add together : Do n...,"[This, is, known, as, amount, C., Add, togethe...","[[17, 23, FINANCE], [61, 66, FINANCE], [74, 79...",,,/guidance/pension-schemes-value-your-pension-f...,True,"[O, O, O, O, FINANCE, O, O, O, O, O, O, O, O, ...",line_by_line_NER_data_sampled_09062020_more_en...
1,"DWP is responsible for welfare , pensions and ...","[DWP, is, responsible, for, welfare, ,, pensio...","[[0, 3, ORGANIZATION], [33, 41, FINANCE], [46,...",,,/guidance/contact-the-department-for-work-and-...,True,"[ORGANIZATION, O, O, O, O, O, FINANCE, O, PERS...",line_by_line_NER_data_sampled_12062020_more_en...
2,Initial teacher training ( ITT ) : accreditati...,"[Initial, teacher, training, (, ITT, ), :, acc...","[[8, 15, PERSON], [16, 24, STATE]]",,,/guidance/initial-teacher-training-itt-accredi...,True,"[O, PERSON, STATE, O, O, O, O, O, O, O, O]",line_by_line_NER_data_sampled_12062020_more_en...
3,All procured projects are expected to be compl...,"[All, procured, projects, are, expected, to, b...","[[41, 50, STATE], [57, 62, DATE]]",,,/guidance/china-prosperity-fund-bidding-round,True,"[O, O, O, O, O, O, O, STATE, O, O, DATE, O, O]",line_by_line_NER_data_sampled_09062020_more_en...
4,Excise Notice 2002 section 14 gives further in...,"[Excise, Notice, 2002, section, 14, gives, fur...","[[14, 18, DATE], [44, 55, CONTACT], [62, 67, O...",,,/guidance/the-alcohol-wholesaler-registration-...,True,"[O, O, DATE, O, O, O, O, CONTACT, O, ORGANIZAT...",line_by_line_NER_data_sampled_12062020_more_en...


In [102]:
df_mini

Unnamed: 0,text,text_token,labels,updated,original_labels,base_path,sampled,label_list,original_file
0,This is known as amount C. Add together : Do n...,"[This, is, known, as, amount, C., Add, togethe...","[[17, 23, FINANCE], [61, 66, FINANCE], [74, 79...",,,/guidance/pension-schemes-value-your-pension-f...,True,"[O, O, O, O, FINANCE, O, O, O, O, O, O, O, O, ...",line_by_line_NER_data_sampled_09062020_more_en...
1,"DWP is responsible for welfare , pensions and ...","[DWP, is, responsible, for, welfare, ,, pensio...","[[0, 3, ORGANIZATION], [33, 41, FINANCE], [46,...",,,/guidance/contact-the-department-for-work-and-...,True,"[ORGANIZATION, O, O, O, O, O, FINANCE, O, PERS...",line_by_line_NER_data_sampled_12062020_more_en...
2,Initial teacher training ( ITT ) : accreditati...,"[Initial, teacher, training, (, ITT, ), :, acc...","[[8, 15, PERSON], [16, 24, STATE]]",,,/guidance/initial-teacher-training-itt-accredi...,True,"[O, PERSON, STATE, O, O, O, O, O, O, O, O]",line_by_line_NER_data_sampled_12062020_more_en...
3,All procured projects are expected to be compl...,"[All, procured, projects, are, expected, to, b...","[[41, 50, STATE], [57, 62, DATE]]",,,/guidance/china-prosperity-fund-bidding-round,True,"[O, O, O, O, O, O, O, STATE, O, O, DATE, O, O]",line_by_line_NER_data_sampled_09062020_more_en...
4,Excise Notice 2002 section 14 gives further in...,"[Excise, Notice, 2002, section, 14, gives, fur...","[[14, 18, DATE], [44, 55, CONTACT], [62, 67, O...",,,/guidance/the-alcohol-wholesaler-registration-...,True,"[O, O, DATE, O, O, O, O, CONTACT, O, ORGANIZAT...",line_by_line_NER_data_sampled_12062020_more_en...


In [94]:
mlb = MultiLabelBinarizer()
df_mini['label_list_enc'] = df_mini['label_list']
df_mini = df_mini.join(pd.DataFrame(mlb.fit_transform(df_mini.pop('label_list_enc')),
                          columns=mlb.classes_,
                          index=df_mini.index))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [95]:
df_mini

Unnamed: 0,text,text_token,labels,updated,original_labels,base_path,sampled,label_list,original_file,CONTACT,DATE,EVENT,FINANCE,O,ORGANIZATION,PERSON,STATE
0,This is known as amount C. Add together : Do n...,"[This, is, known, as, amount, C., Add, togethe...","[[17, 23, FINANCE], [61, 66, FINANCE], [74, 79...",,,/guidance/pension-schemes-value-your-pension-f...,True,"[O, O, O, O, FINANCE, O, O, O, O, O, O, O, O, ...",line_by_line_NER_data_sampled_09062020_more_en...,0,0,1,1,1,0,0,0
1,"DWP is responsible for welfare , pensions and ...","[DWP, is, responsible, for, welfare, ,, pensio...","[[0, 3, ORGANIZATION], [33, 41, FINANCE], [46,...",,,/guidance/contact-the-department-for-work-and-...,True,"[ORGANIZATION, O, O, O, O, O, FINANCE, O, PERS...",line_by_line_NER_data_sampled_12062020_more_en...,0,0,0,1,1,1,1,0
2,Initial teacher training ( ITT ) : accreditati...,"[Initial, teacher, training, (, ITT, ), :, acc...","[[8, 15, PERSON], [16, 24, STATE]]",,,/guidance/initial-teacher-training-itt-accredi...,True,"[O, PERSON, STATE, O, O, O, O, O, O, O, O]",line_by_line_NER_data_sampled_12062020_more_en...,0,0,0,0,1,0,1,1
3,All procured projects are expected to be compl...,"[All, procured, projects, are, expected, to, b...","[[41, 50, STATE], [57, 62, DATE]]",,,/guidance/china-prosperity-fund-bidding-round,True,"[O, O, O, O, O, O, O, STATE, O, O, DATE, O, O]",line_by_line_NER_data_sampled_09062020_more_en...,0,1,0,0,1,0,0,1
4,Excise Notice 2002 section 14 gives further in...,"[Excise, Notice, 2002, section, 14, gives, fur...","[[14, 18, DATE], [44, 55, CONTACT], [62, 67, O...",,,/guidance/the-alcohol-wholesaler-registration-...,True,"[O, O, DATE, O, O, O, O, CONTACT, O, ORGANIZAT...",line_by_line_NER_data_sampled_12062020_more_en...,1,1,0,0,1,1,0,0


In [98]:
for i in range(len(df_mini)):
  print(df_mini['label_list'][i])

['O', 'O', 'O', 'O', 'FINANCE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'FINANCE', 'O', 'O', 'EVENT', 'FINANCE', 'O', 'O', 'O', 'O']
['ORGANIZATION', 'O', 'O', 'O', 'O', 'O', 'FINANCE', 'O', 'PERSON', 'O', 'O', 'O']
['O', 'PERSON', 'STATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'STATE', 'O', 'O', 'DATE', 'O', 'O']
['O', 'O', 'DATE', 'O', 'O', 'O', 'O', 'CONTACT', 'O', 'ORGANIZATION', 'O', 'O']


In [100]:
def occurence_flag(df, column):
  mlb = MultiLabelBinarizer()
  df_copy = df
  df_copy[f"{column}_cop"] = df_copy[f"{column}"]
  df_copy = df_copy.join(pd.DataFrame(mlb.fit_transform(df_copy.pop(f"{column}_cop")),
                          columns=mlb.classes_,
                          index=df_copy.index))
  return df_copy

In [103]:
df_counts = occurence_flag(df_mini, column='label_list')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [104]:
df_counts

Unnamed: 0,text,text_token,labels,updated,original_labels,base_path,sampled,label_list,original_file,CONTACT,DATE,EVENT,FINANCE,O,ORGANIZATION,PERSON,STATE
0,This is known as amount C. Add together : Do n...,"[This, is, known, as, amount, C., Add, togethe...","[[17, 23, FINANCE], [61, 66, FINANCE], [74, 79...",,,/guidance/pension-schemes-value-your-pension-f...,True,"[O, O, O, O, FINANCE, O, O, O, O, O, O, O, O, ...",line_by_line_NER_data_sampled_09062020_more_en...,0,0,1,1,1,0,0,0
1,"DWP is responsible for welfare , pensions and ...","[DWP, is, responsible, for, welfare, ,, pensio...","[[0, 3, ORGANIZATION], [33, 41, FINANCE], [46,...",,,/guidance/contact-the-department-for-work-and-...,True,"[ORGANIZATION, O, O, O, O, O, FINANCE, O, PERS...",line_by_line_NER_data_sampled_12062020_more_en...,0,0,0,1,1,1,1,0
2,Initial teacher training ( ITT ) : accreditati...,"[Initial, teacher, training, (, ITT, ), :, acc...","[[8, 15, PERSON], [16, 24, STATE]]",,,/guidance/initial-teacher-training-itt-accredi...,True,"[O, PERSON, STATE, O, O, O, O, O, O, O, O]",line_by_line_NER_data_sampled_12062020_more_en...,0,0,0,0,1,0,1,1
3,All procured projects are expected to be compl...,"[All, procured, projects, are, expected, to, b...","[[41, 50, STATE], [57, 62, DATE]]",,,/guidance/china-prosperity-fund-bidding-round,True,"[O, O, O, O, O, O, O, STATE, O, O, DATE, O, O]",line_by_line_NER_data_sampled_09062020_more_en...,0,1,0,0,1,0,0,1
4,Excise Notice 2002 section 14 gives further in...,"[Excise, Notice, 2002, section, 14, gives, fur...","[[14, 18, DATE], [44, 55, CONTACT], [62, 67, O...",,,/guidance/the-alcohol-wholesaler-registration-...,True,"[O, O, DATE, O, O, O, O, CONTACT, O, ORGANIZAT...",line_by_line_NER_data_sampled_12062020_more_en...,1,1,0,0,1,1,0,0


Apply to full dataframe

In [124]:
df_counts = occurence_flag(df, column='label_list')

In [125]:
df_counts.head(20)

Unnamed: 0,text,text_token,labels,updated,original_labels,base_path,sampled,label_list,original_file,CONTACT,DATE,EVENT,FINANCE,FORM,LOCATION,MISC,MONEY,O,ORGANIZATION,PERSON,SCHEME,STATE
0,This is known as amount C. Add together : Do n...,"[This, is, known, as, amount, C., Add, togethe...","[[17, 23, FINANCE], [61, 66, FINANCE], [74, 79...",,,/guidance/pension-schemes-value-your-pension-f...,True,"[O, O, O, O, FINANCE, O, O, O, O, O, O, O, O, ...",line_by_line_NER_data_sampled_09062020_more_en...,0,0,1,1,0,0,0,0,1,0,0,0,0
1,"DWP is responsible for welfare , pensions and ...","[DWP, is, responsible, for, welfare, ,, pensio...","[[0, 3, ORGANIZATION], [33, 41, FINANCE], [46,...",,,/guidance/contact-the-department-for-work-and-...,True,"[ORGANIZATION, O, O, O, O, O, FINANCE, O, PERS...",line_by_line_NER_data_sampled_12062020_more_en...,0,0,0,1,0,0,0,0,1,1,1,0,0
2,Initial teacher training ( ITT ) : accreditati...,"[Initial, teacher, training, (, ITT, ), :, acc...","[[8, 15, PERSON], [16, 24, STATE]]",,,/guidance/initial-teacher-training-itt-accredi...,True,"[O, PERSON, STATE, O, O, O, O, O, O, O, O]",line_by_line_NER_data_sampled_12062020_more_en...,0,0,0,0,0,0,0,0,1,0,1,0,1
3,All procured projects are expected to be compl...,"[All, procured, projects, are, expected, to, b...","[[41, 50, STATE], [57, 62, DATE]]",,,/guidance/china-prosperity-fund-bidding-round,True,"[O, O, O, O, O, O, O, STATE, O, O, DATE, O, O]",line_by_line_NER_data_sampled_09062020_more_en...,0,1,0,0,0,0,0,0,1,0,0,0,1
4,Excise Notice 2002 section 14 gives further in...,"[Excise, Notice, 2002, section, 14, gives, fur...","[[14, 18, DATE], [44, 55, CONTACT], [62, 67, O...",,,/guidance/the-alcohol-wholesaler-registration-...,True,"[O, O, DATE, O, O, O, O, CONTACT, O, ORGANIZAT...",line_by_line_NER_data_sampled_12062020_more_en...,1,1,0,0,0,0,0,0,1,1,0,0,0
5,It ’ s a good idea to make sure any charges re...,"[It, ’, s, a, good, idea, to, make, sure, any,...","[[36, 43, FINANCE], [44, 54, STATE], [68, 75, ...",,,/guidance/registering-a-charge-mortgage-for-a-...,True,"[O, O, O, O, O, O, O, O, O, O, FINANCE, STATE,...",line_by_line_NER_data_sampled_12062020_more_en...,0,0,0,1,0,0,0,0,1,1,0,0,1
6,This does not mean that HMRC will not agree su...,"[This, does, not, mean, that, HMRC, will, not,...","[[24, 28, ORGANIZATION], [55, 60, FORM]]",,,/guidance/self-assessment-expenses-and-benefit...,True,"[O, O, O, O, O, ORGANIZATION, O, O, O, O, FORM...",line_by_line_NER_data_sampled_12062020_more_en...,0,0,0,0,1,0,0,0,1,1,0,0,0
7,For journalists Email newsdesk @ fco.gov.uk Fo...,"[For, journalists, Email, newsdesk, @, fco.gov...","[[4, 15, PERSON], [16, 21, CONTACT], [73, 80, ...",,,/government/speeches/foreign-secretarys-statem...,True,"[O, PERSON, CONTACT, O, O, O, O, O, O, O, O, M...",line_by_line_NER_data_sampled_12062020_more_en...,1,0,0,0,0,0,1,0,1,0,1,0,0
8,This booklet is designed to help businesses lo...,"[This, booklet, is, designed, to, help, busine...","[[5, 12, CONTACT], [33, 43, ORGANIZATION], [13...",,,/government/publications/ip-health-check-agree...,True,"[O, CONTACT, O, O, O, O, ORGANIZATION, O, O, O...",line_by_line_NER_data_sampled_12062020_more_en...,1,0,0,1,0,0,0,0,1,1,0,0,0
9,Telephone : +44 300 790 6801 Textphone : 18001...,"[Telephone, :, +44, 300, 790, 6801, Textphone,...","[[0, 9, CONTACT], [29, 38, CONTACT], [70, 77, ...",,,/guidance/notarial-and-documentary-services-gu...,True,"[CONTACT, O, O, O, O, O, CONTACT, O, O, O, O, ...",line_by_line_NER_data_sampled_12062020_more_en...,1,1,0,0,0,1,0,0,1,0,0,0,1


## Inspect Elements

In [146]:
main_df = pd.DataFrame()

In [159]:
main_df = pd.DataFrame()
for l in labels:
  print(l)
  new_df = df_counts[df_counts[l] == 1]
  new_df = new_df.sample(2000)
  new_df['sample'] = l
  main_df = main_df.append(new_df)

O
CONTACT
DATE
EVENT
FINANCE
FORM
LOCATION
MISC
MONEY
ORGANIZATION
PERSON
SCHEME
STATE


In [160]:
main_df.shape

(26000, 23)

In [168]:
main_df['zip_tok_ent'] = main_df.apply(lambda x: list(zip(x.text_token,x.label_list)), axis=1)

In [169]:
main_df = main_df.reset_index()
main_df = main_df.drop(['index'], axis=1)
main_df

Unnamed: 0,text,text_token,labels,updated,original_labels,base_path,sampled,label_list,original_file,CONTACT,DATE,EVENT,FINANCE,FORM,LOCATION,MISC,MONEY,O,ORGANIZATION,PERSON,SCHEME,STATE,sample,zip_tok_ent
0,You don ’ t have to report and record the move...,"[You, don, ’, t, have, to, report, and, record...","[[20, 26, CONTACT], [31, 37, FORM], [113, 120,...",,,/guidance/keeping-a-pet-pig-or-micropig,True,"[O, O, O, O, O, O, CONTACT, O, FORM, O, O, O, ...",line_by_line_NER_data_sampled_12062020_more_en...,1,0,0,0,1,0,0,0,1,0,0,0,0,O,"[(You, O), (don, O), (’, O), (t, O), (have, O)..."
1,"Syria has repeatedly denied any role , and has...","[Syria, has, repeatedly, denied, any, role, ,,...","[[0, 5, LOCATION], [82, 95, EVENT]]",,,Sentence: 32705,False,"[LOCATION, O, O, O, O, O, O, O, O, O, O, O, O,...",line_by_line_NER_data_sampled_09062020_more_en...,0,0,1,0,0,1,0,0,1,0,0,0,0,O,"[(Syria, LOCATION), (has, O), (repeatedly, O),..."
2,Initial evidence suggests that some of these w...,"[Initial, evidence, suggests, that, some, of, ...","[[8, 16, FORM], [65, 71, DATE]]",,,/government/news/freight-train-derailment-at-e...,True,"[O, FORM, O, O, O, O, O, O, O, O, DATE, O, O, O]",line_by_line_NER_data_sampled_09062020_more_en...,0,1,0,0,1,0,0,0,1,0,0,0,0,O,"[(Initial, O), (evidence, FORM), (suggests, O)..."
3,For further information on what benefits you c...,"[For, further, information, on, what, benefits...","[[12, 23, CONTACT], [32, 40, FINANCE], [61, 66...",,,/guidance/living-in-panama,True,"[O, O, CONTACT, O, O, FINANCE, O, O, O, O, O, ...",line_by_line_NER_data_sampled_12062020_more_en...,1,0,0,1,1,1,0,0,1,0,0,0,0,O,"[(For, O), (further, O), (information, CONTACT..."
4,It works on behalf of the Secretary of State f...,"[It, works, on, behalf, of, the, Secretary, of...","[[3, 8, MISC], [26, 44, PERSON]]",,,/guidance/gosport-oil-fuel-depot-redevelopment,True,"[O, MISC, O, O, O, O, PERSON, PERSON, PERSON, ...",line_by_line_NER_data_sampled_09062020_more_en...,0,0,0,0,0,0,1,0,1,0,1,0,0,O,"[(It, O), (works, MISC), (on, O), (behalf, O),..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25995,From 24 March 2020 the British Embassy Yerevan...,"[From, 24, March, 2020, the, British, Embassy,...","[[8, 13, DATE], [23, 30, STATE], [77, 85, MISC...",,,/guidance/notarial-and-documentary-services-gu...,True,"[O, O, DATE, O, O, STATE, O, O, O, O, O, MISC,...",line_by_line_NER_data_sampled_09062020_more_en...,0,1,1,0,0,0,1,0,1,0,0,0,1,STATE,"[(From, O), (24, O), (March, DATE), (2020, O),..."
25996,American casualties have been mounting amid a ...,"[American, casualties, have, been, mounting, a...","[[0, 8, STATE], [79, 90, LOCATION], [135, 144,...",,,Sentence: 18724,False,"[STATE, O, O, O, O, O, O, O, O, O, O, O, LOCAT...",line_by_line_NER_data_sampled_09062020_more_en...,0,1,0,0,0,1,0,0,1,0,0,0,1,STATE,"[(American, STATE), (casualties, O), (have, O)..."
25997,It also says they gathered information on a Sy...,"[It, also, says, they, gathered, information, ...","[[27, 38, CONTACT], [44, 50, STATE], [73, 79, ...",,,Sentence: 9675,False,"[O, O, O, O, O, CONTACT, O, O, STATE, O, O, O,...",line_by_line_NER_data_sampled_12062020_more_en...,1,1,0,0,0,1,0,0,1,0,0,0,1,STATE,"[(It, O), (also, O), (says, O), (they, O), (ga..."
25998,An amendment to the 2013 WEEE Regulations on 1...,"[An, amendment, to, the, 2013, WEEE, Regulatio...","[[20, 24, DATE], [45, 55, DATE], [62, 72, STATE]]",,,/government/publications/weee-submitting-a-pro...,True,"[O, O, O, O, DATE, O, O, O, DATE, DATE, DATE, ...",line_by_line_NER_data_sampled_12062020_more_en...,0,1,0,0,0,0,0,0,1,0,0,0,1,STATE,"[(An, O), (amendment, O), (to, O), (the, O), (..."


In [170]:
main_df['zip_tok_ent'][:5]

0    [(You, O), (don, O), (’, O), (t, O), (have, O)...
1    [(Syria, LOCATION), (has, O), (repeatedly, O),...
2    [(Initial, O), (evidence, FORM), (suggests, O)...
3    [(For, O), (further, O), (information, CONTACT...
4    [(It, O), (works, MISC), (on, O), (behalf, O),...
Name: zip_tok_ent, dtype: object

In [171]:
main_df.to_excel(f"{DATA_DIR}/NER_data_combined_BREAKDOWN.xlsx", index=None)