In [None]:
import spacy
import pandas as pd
import os
import zipfile

def ner_tags_and_entities(texts):
    """Extract named entities from ``texts`` with the module-level ``nlp``.

    Args:
        texts: The text to analyse; it is passed straight to ``nlp``.

    Returns:
        A ``(labels, surface_forms)`` tuple of two strings. Each string is the
        ``', '``-joined sequence of ``label_`` / ``text`` values taken from
        ``doc.ents`` in order, so the i-th label belongs to the i-th surface
        form. Both strings are empty when no entity is found.
    """
    labels = []
    surface_forms = []
    for span in nlp(texts).ents:
        labels.append(span.label_)
        surface_forms.append(span.text)
    return ', '.join(labels), ', '.join(surface_forms)

# Annotate every CSV inside the input archive with two extra columns
# ('NER Tags' and 'Entities') and store the result, under the same member
# name, in the output archive.
nlp = spacy.load('en_core_web_sm')

input_zip_file = 'DiscoverNikkei.zip'
output_zip_file = 'Output_DiscoverNikkei.zip'
temp_dir = 'temp_dir'
# exist_ok=True: re-running this cell (or any other cell that uses the same
# scratch directory) must not fail with FileExistsError.
os.makedirs(temp_dir, exist_ok=True)

ner_tags_list = []
entities_list = []

with zipfile.ZipFile(input_zip_file, 'r') as zip_file, \
        zipfile.ZipFile(output_zip_file, 'w') as output_zip:
    for filename in zip_file.namelist():
        if not filename.endswith('.csv'):
            continue

        # NOTE(review): latin1 never fails to decode, but it will mis-decode
        # non-ASCII characters if the source CSVs are really UTF-8 -- confirm.
        with zip_file.open(filename) as csv_file:
            df = pd.read_csv(csv_file, encoding='latin1')
        transcripts = df['Transcript (Narrator Only)']

        file_ner_tags_list = []
        file_entities_list = []
        for transcript in transcripts:
            if pd.notna(transcript):
                ner_tags, entities = ner_tags_and_entities(transcript)
            else:
                # Keep the new columns aligned with the rows of df.
                ner_tags, entities = '', ''
            file_ner_tags_list.append(ner_tags)
            file_entities_list.append(entities)

        df['NER Tags'] = file_ner_tags_list
        df['Entities'] = file_entities_list

        # Archive members may live in sub-folders ('dir/a.csv'); only the base
        # name is used for the scratch file so its parent directory always
        # exists. The file is added to the archive right away, so a later
        # member with the same base name can safely overwrite it.
        output_csv_file = os.path.join(
            temp_dir, f'output_{os.path.basename(filename)}'
        )
        df.to_csv(output_csv_file, index=False, encoding='utf-8')

        output_zip.write(output_csv_file, arcname=filename)


In [None]:
# Create a separate entity column for each NER tag of interest
import pandas as pd
import zipfile
from io import BytesIO

def process_csv(zip_file, csv_filename):
    """Split one NER-annotated CSV into PERSON / ORG / EVENT entity columns.

    Args:
        zip_file: An open ``zipfile.ZipFile`` to read from.
        csv_filename: Name of the CSV member inside ``zip_file``. The CSV must
            provide the columns ``Document ID``,
            ``Transcript (Narrator Only)``, ``NER Tags`` and ``Entities``;
            the last two hold ``', '``-joined lists that are matched up
            position by position.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Document ID``,
        ``Transcript (Narrator)``, ``Person Entities``, ``Org Entities`` and
        ``Event Entities``. It has one row for every input row whose tags
        contain at least one of PERSON, ORG or EVENT; a tag with no entity in
        that row yields an empty string. The frame has zero rows when the
        input has no rows or no row matches.
    """
    with zip_file.open(csv_filename) as file:
        df = pd.read_csv(file)

    wanted_tags = ('PERSON', 'ORG', 'EVENT')
    transcripts = []
    entities_by_tag = {tag: [] for tag in wanted_tags}

    # The Document ID of the first row is used for every output row.
    # .iloc is positional, and a header-only file simply yields no ID instead
    # of raising KeyError.
    document_id = df['Document ID'].iloc[0] if len(df) else None

    for _, row in df.iterrows():
        raw_tags = row['NER Tags']
        if pd.isna(raw_tags):
            continue

        # Splitting the string into a list of tags
        tags = raw_tags.split(', ')
        if not any(tag in wanted_tags for tag in tags):
            continue

        raw_entities = row['Entities']
        # A missing Entities cell is treated as "no entities" rather than
        # crashing on NaN.split.
        entities = raw_entities.split(', ') if pd.notna(raw_entities) else []

        # NOTE: an entity whose own text contains ', ' splits into several
        # pieces here and shifts the tag/entity pairing for the rest of the
        # row; that can only be fixed where the columns are produced.
        row_groups = {tag: [] for tag in wanted_tags}
        for tag, entity in zip(tags, entities):
            if tag in row_groups:
                row_groups[tag].append(entity)

        transcripts.append(row['Transcript (Narrator Only)'])
        for tag in wanted_tags:
            entities_by_tag[tag].append(', '.join(row_groups[tag]))

    result_df = pd.DataFrame({
        'Document ID': [document_id] * len(transcripts),
        'Transcript (Narrator)': transcripts,
        'Person Entities': entities_by_tag['PERSON'],
        'Org Entities': entities_by_tag['ORG'],
        'Event Entities': entities_by_tag['EVENT'],
    })

    return result_df

# Run process_csv over every CSV of the annotated archive and store each
# result, under the same member name, in a new archive.
input_zip_path = 'Output_DiscoverNikkei.zip'
output_zip_path = 'DiscoverNikkei_For_EntityList.zip'

with zipfile.ZipFile(input_zip_path, 'r') as input_zip, \
        zipfile.ZipFile(output_zip_path, 'w') as output_zip:
    for csv_filename in input_zip.namelist():
        # Directory entries and non-CSV members cannot be parsed by
        # process_csv; skip them instead of crashing halfway through.
        if not csv_filename.endswith('.csv'):
            continue

        result_df = process_csv(input_zip, csv_filename)

        # Serialise in memory so no scratch file is needed.
        csv_bytes = BytesIO()
        result_df.to_csv(csv_bytes, index=False)

        output_zip.writestr(csv_filename, csv_bytes.getvalue())

'4-33
'4-15
'4-41
'4-24
'4-23
'4-26
'4-16
'4-30
'4-10
'4-28
'4-14
'4-37
'4-34
'4-18
'4-22
'4-20
'4-1
'4-13
'4-35
'4-32
'4-38
'4-17
'4-12
'4-40
'4-11
'4-9
'4-5
'4-25
'4-6
'4-36
'4-7
'4-4
'4-29
'4-8
'4-39
'4-21
'4-31
'4-27
'4-3
'4-19
'4-2


In [None]:
# Create the combined entity list for each tag ('PERSON', 'ORG', 'EVENT')
import pandas as pd
import zipfile
import os
import tempfile

# Gather, from every CSV in the archive, the rows that have a value in
# 'Person Entities' and write them to a single CSV file.
input_zip_file = 'DiscoverNikkei_For_EntityList.zip'
output_csv_file = 'DiscoverNikkei_Person_Entities_Output.csv'

filtered_frames = []
with zipfile.ZipFile(input_zip_file, 'r') as input_zip:
    for file_info in input_zip.infolist():
        if file_info.is_dir():
            continue

        # Read the member straight from the archive; extracting it would
        # litter the working directory with one file per member.
        with input_zip.open(file_info) as csv_file:
            df = pd.read_csv(csv_file)

        filtered_df = df[df['Person Entities'].notna()]
        filtered_frames.append(filtered_df)

# DataFrame.append is deprecated and was removed in pandas 2.0; a single
# concat also avoids re-copying the accumulated rows on every iteration.
# pd.concat rejects an empty list, so fall back to an empty frame.
if filtered_frames:
    collected_df = pd.concat(filtered_frames, ignore_index=True)
else:
    collected_df = pd.DataFrame()

collected_df.to_csv(output_csv_file, index=False)

#print("Processing complete. Filtered rows are saved in", output_csv_file)


  collected_df = collected_df.append(filtered_df, ignore_index=True)
  collected_df = collected_df.append(filtered_df, ignore_index=True)
  collected_df = collected_df.append(filtered_df, ignore_index=True)
  collected_df = collected_df.append(filtered_df, ignore_index=True)
  collected_df = collected_df.append(filtered_df, ignore_index=True)
  collected_df = collected_df.append(filtered_df, ignore_index=True)
  collected_df = collected_df.append(filtered_df, ignore_index=True)
  collected_df = collected_df.append(filtered_df, ignore_index=True)
  collected_df = collected_df.append(filtered_df, ignore_index=True)
  collected_df = collected_df.append(filtered_df, ignore_index=True)
  collected_df = collected_df.append(filtered_df, ignore_index=True)
  collected_df = collected_df.append(filtered_df, ignore_index=True)
  collected_df = collected_df.append(filtered_df, ignore_index=True)
  collected_df = collected_df.append(filtered_df, ignore_index=True)
  collected_df = collected_df.appe

UnicodeEncodeError: ignored

In [None]:
#For merging
import pandas as pd

# Per-collection event-entity CSVs, concatenated in this order.
files = [
    'Densho_Event_Entities_Output.csv',
    'JASC_Event_Entities_Output.csv',
    'JAMSJ_Event_Entities_Output.csv',
    'DiscoverNikkei_Event_Entities_Output.csv',
]

# NOTE(review): the inputs are decoded as latin1; if they were written as
# UTF-8, non-ASCII characters will come out garbled -- confirm the encoding.
dfs = [
    pd.read_csv(csv_name, encoding='latin1')
    for csv_name in files
]

# Stack all rows and renumber the index from 0.
combined_df = pd.concat(dfs, ignore_index=True)

# Missing values are written out as the literal text 'NaN'.
combined_df.to_csv(
    'All_Event_Entities_Output.csv',
    index=False,
    na_rep='NaN',
)


In [None]:
!pip install transformers



In [None]:
# Tried BERT (eventually we didn't use this)
import pandas as pd
import os
import zipfile
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# The tokenizer and the token-classification model are loaded from the same
# pretrained checkpoint name.
checkpoint = "autoevaluate/entity-extraction"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(checkpoint)

# "ner" pipeline built from the two objects above.
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
)

def ner_tags_and_entities(texts):
    """Tag ``texts`` with the module-level ``ner_pipeline``.

    Args:
        texts: The text to analyse; it is passed straight to ``ner_pipeline``.

    Returns:
        A ``(tags, words)`` tuple of two strings: the ``', '``-joined
        ``'entity'`` values and the ``', '``-joined ``'word'`` values of the
        items returned by the pipeline, in the order they were returned.
    """
    predictions = ner_pipeline(texts)

    tag_parts = []
    word_parts = []
    for item in predictions:
        tag_parts.append(item['entity'])
        word_parts.append(item['word'])

    return ', '.join(tag_parts), ', '.join(word_parts)

# Annotate every CSV inside the input archive with two extra columns
# ('NER Tags' and 'Entities') and store the result, under the same member
# name, in the output archive.
input_zip_file = 'JASC_Transcript.zip'
output_zip_file = 'Output_JASC.zip'
temp_dir = 'temp_dir'
# exist_ok=True: the scratch directory may already exist from an earlier run
# of this or another cell; a plain makedirs would raise FileExistsError.
os.makedirs(temp_dir, exist_ok=True)

ner_tags_list = []
entities_list = []

with zipfile.ZipFile(input_zip_file, 'r') as zip_file, \
        zipfile.ZipFile(output_zip_file, 'w') as output_zip:
    for filename in zip_file.namelist():
        if not filename.endswith('.csv'):
            continue

        # NOTE(review): latin1 never fails to decode, but it will mis-decode
        # non-ASCII characters if the source CSVs are really UTF-8 -- confirm.
        with zip_file.open(filename) as csv_file:
            df = pd.read_csv(csv_file, encoding='latin1')
        transcripts = df['Transcript (Narrator)']

        file_ner_tags_list = []
        file_entities_list = []
        for transcript in transcripts:
            if pd.notna(transcript):
                ner_tags, entities = ner_tags_and_entities(transcript)
            else:
                # Keep the new columns aligned with the rows of df.
                ner_tags, entities = '', ''
            file_ner_tags_list.append(ner_tags)
            file_entities_list.append(entities)

        df['NER Tags'] = file_ner_tags_list
        df['Entities'] = file_entities_list

        # Archive members may live in sub-folders ('dir/a.csv'); only the base
        # name is used for the scratch file so its parent directory always
        # exists. The file is added to the archive right away, so a later
        # member with the same base name can safely overwrite it.
        output_csv_file = os.path.join(
            temp_dir, f'output_{os.path.basename(filename)}'
        )
        df.to_csv(output_csv_file, index=False, encoding='utf-8')

        output_zip.write(output_csv_file, arcname=filename)
