In [1]:
# Mount Google Drive at /content/drive (remount forced); every data path used in
# this notebook lives under that mount point.
from google.colab import drive

drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [2]:
# %pip installs into the running kernel's environment; the version is pinned to the
# one this notebook was last executed with so re-runs resolve the same package.
%pip install -q Levenshtein==0.24.0

Collecting Levenshtein
  Downloading Levenshtein-0.24.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (177 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.4/177.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rapidfuzz<4.0.0,>=3.1.0 (from Levenshtein)
  Downloading rapidfuzz-3.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein
Successfully installed Levenshtein-0.24.0 rapidfuzz-3.6.1


In [22]:
# Explicit %pip magic (rather than relying on automagic) so the install targets
# the running kernel's environment.
%pip install -q wikipedia-api



In [23]:
import spacy
import os
import csv
from collections import Counter
import requests
from Levenshtein import ratio
import requests
import time
import wikipediaapi

In [4]:
# Load spaCy's small English pipeline; extract_entities() reads the named
# entities (doc.ents) that this pipeline produces.
nlp = spacy.load("en_core_web_sm")

In [5]:
def read_text_file(file_path):
    """Return the entire contents of the text file at ``file_path``.

    The file is opened read-only and decoded as UTF-8; it is closed again
    before the function returns.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

In [6]:
def extract_entities(text):
    """Return the surface text of every ORG, GPE or PERSON entity found in ``text``.

    The text is run through the module-level ``nlp`` pipeline. Entities are
    returned in the order the pipeline reports them, and repeated mentions are
    kept (one list item per mention).
    """
    wanted_labels = ("ORG", "GPE", "PERSON")
    entities = []
    for span in nlp(text).ents:
        if span.label_ in wanted_labels:
            entities.append(span.text)
    return entities

In [42]:
# Throttling state shared by all calls: how many requests were sent in the current
# window, and when that window started.
REQUEST_COUNTER = 0
START_TIME = time.time()

# Lookup results keyed by the whitespace-normalised query title, so a repeated
# mention costs no extra HTTP request. A value of None records "no page found".
_CANONICAL_NAME_CACHE = {}


def get_wikipedia_canonical_name(entity, max_requests=5000, time_window=3600, timeout=10):
    """Resolve ``entity`` to the title of its English Wikipedia page.

    The MediaWiki query API is asked for the title with redirects followed, so
    aliases collapse onto the title of the page they redirect to.

    Args:
        entity: Entity surface text to look up.
        max_requests: Maximum number of HTTP requests sent per time window.
        time_window: Length of the throttling window in seconds (default: one hour).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The page title reported by the API, or ``entity`` unchanged when it is
        blank, not a string, or no existing page matches it.

    Raises:
        Whatever ``requests`` raises on connection failure, timeout, an HTTP
        error status (via ``raise_for_status``) or a non-JSON response body.
    """
    global REQUEST_COUNTER, START_TIME

    # Collapse whitespace runs (e.g. line breaks carried over from the article text)
    # before querying; a blank or non-string entity cannot name a page.
    title = ' '.join(entity.split()) if isinstance(entity, str) else ''
    if not title:
        return entity

    if title in _CANONICAL_NAME_CACHE:
        cached_title = _CANONICAL_NAME_CACHE[title]
        return entity if cached_title is None else cached_title

    # Start a fresh window once the current one has fully elapsed.
    current_time = time.time()
    elapsed_time = current_time - START_TIME
    if elapsed_time > time_window:
        REQUEST_COUNTER = 0
        START_TIME = current_time

    # Window quota used up: sleep until the window ends, then start a new one.
    if REQUEST_COUNTER >= max_requests:
        remaining_time = max(0.0, time_window - elapsed_time)
        print(f"Rate limit reached. Waiting for {remaining_time} seconds before making more requests.")
        time.sleep(remaining_time + 1)  # +1 s buffer so the window has definitely ended
        REQUEST_COUNTER = 0
        START_TIME = time.time()

    wikipedia_api_url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'query',
        'format': 'json',
        'titles': title,
        'redirects': 1,
    }

    # Count the attempt before sending so failed requests still use up quota.
    REQUEST_COUNTER += 1
    # NOTE(review): Wikimedia asks API clients to send a descriptive User-Agent;
    # consider passing headers={'User-Agent': ...} with project contact details.
    response = requests.get(wikipedia_api_url, params=params, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    pages = data.get('query', {}).get('pages', {})
    canonical_title = None
    for page_id, page_info in pages.items():
        # Pages that do not exist are listed under negative placeholder ids
        # ("-1", "-2", ...) and flagged 'missing' or 'invalid'; skip all of them.
        if str(page_id).startswith('-') or 'missing' in page_info or 'invalid' in page_info:
            continue
        canonical_title = page_info.get('title')
        break

    _CANONICAL_NAME_CACHE[title] = canonical_title
    return entity if canonical_title is None else canonical_title

In [43]:
def merge_similar_entities(entities, threshold=0.85):
    """Fold near-duplicate entity names into a single Counter entry.

    Names are visited in the mapping's iteration order. Each name is compared,
    case-insensitively, against the names kept so far; its count is added to the
    first kept name whose similarity ``ratio`` is strictly greater than
    ``threshold``. A name with no such match becomes a new kept name.

    Args:
        entities: Mapping of entity name to count.
        threshold: Similarity ratio that must be exceeded for two names to merge.

    Returns:
        A ``Counter`` keyed by the kept names.
    """
    merged = Counter()
    for name, occurrences in entities.items():
        match = None
        for kept in merged:
            if ratio(kept.lower(), name.lower()) > threshold:
                match = kept
                break
        if not match:
            merged[name] = occurrences
            continue
        merged[match] += occurrences
    return merged

In [44]:
def process_entities(entities):
    """Count entity mentions under their canonical Wikipedia names.

    Args:
        entities: Iterable of entity strings, one item per mention.

    Returns:
        A ``Counter`` mapping each canonical name (as returned by
        ``get_wikipedia_canonical_name``) to the total number of mentions that
        resolved to it. Empty input yields an empty ``Counter``.
    """
    canonical_entities = Counter()
    # Tally identical mentions first so each distinct string is looked up once,
    # not once per occurrence; every lookup is a network round trip.
    for entity, occurrences in Counter(entities).items():
        canonical_entities[get_wikipedia_canonical_name(entity)] += occurrences
    return canonical_entities

In [45]:
# def remove_punctuation(text):
#     return ''.join([char for char in text if char not in string.punctuation])

## **English and Hindi Entities**

In [46]:
# Pair every file in Hindi_Articles with the same-named file in English_Articles and
# write the entities the two texts share to all_common_entities_eng_hi.csv.
folder1_path = "/content/drive/MyDrive/thesis-data/Articles_Translated_SD/Hindi_Articles"
folder2_path = "/content/drive/MyDrive/thesis-data/Articles_Translated_SD/English_Articles"

output_csv = "/content/drive/MyDrive/thesis-data/all_common_entities_eng_hi.csv"


with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["File Name", "Common Entity", "Count"])

    # os.listdir() makes no ordering promise; sorting keeps the CSV rows in the
    # same order on every run.
    for file_name in sorted(os.listdir(folder1_path)):
        file1 = os.path.join(folder1_path, file_name)
        file2 = os.path.join(folder2_path, file_name)

        # Skip names that are not regular files in both folders.
        if not (os.path.isfile(file1) and os.path.isfile(file2)):
            continue

        text1 = read_text_file(file1)
        text2 = read_text_file(file2)

        # Extract entities, then count them under their canonical Wikipedia titles.
        entities1 = process_entities(extract_entities(text1))
        entities2 = process_entities(extract_entities(text2))

        # Fold near-duplicate spellings together (Levenshtein similarity ratio).
        entities1 = merge_similar_entities(entities1)
        entities2 = merge_similar_entities(entities2)

        # Counter intersection keeps keys present in both, with the smaller count.
        common_entities = entities1 & entities2

        # Keep only entities whose shared count exceeds one.
        filtered_common_entities = {entity: count for entity, count in common_entities.items() if count > 1}

        for entity, count in filtered_common_entities.items():
            writer.writerow([file_name, entity, count])

## **English and Chinese Entities**

In [47]:
# Pair every file in Chinese_Articles with the same-named file in English_Articles and
# write the entities the two texts share to all_common_entities_eng_zh.csv.
folder1_path = "/content/drive/MyDrive/thesis-data/Articles_Translated_SD/Chinese_Articles"
folder2_path = "/content/drive/MyDrive/thesis-data/Articles_Translated_SD/English_Articles"

output_csv = "/content/drive/MyDrive/thesis-data/all_common_entities_eng_zh.csv"


with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["File Name", "Common Entity", "Count"])

    # os.listdir() makes no ordering promise; sorting keeps the CSV rows in the
    # same order on every run.
    for file_name in sorted(os.listdir(folder1_path)):
        file1 = os.path.join(folder1_path, file_name)
        file2 = os.path.join(folder2_path, file_name)

        # Skip names that are not regular files in both folders.
        if not (os.path.isfile(file1) and os.path.isfile(file2)):
            continue

        text1 = read_text_file(file1)
        text2 = read_text_file(file2)

        # Extract entities, then count them under their canonical Wikipedia titles.
        entities1 = process_entities(extract_entities(text1))
        entities2 = process_entities(extract_entities(text2))

        # Fold near-duplicate spellings together (Levenshtein similarity ratio).
        entities1 = merge_similar_entities(entities1)
        entities2 = merge_similar_entities(entities2)

        # Counter intersection keeps keys present in both, with the smaller count.
        common_entities = entities1 & entities2

        # Keep only entities whose shared count exceeds one.
        filtered_common_entities = {entity: count for entity, count in common_entities.items() if count > 1}

        for entity, count in filtered_common_entities.items():
            writer.writerow([file_name, entity, count])

## **English and Afrikaans Entities**

In [48]:
# Pair every file in Afrikaans_Articles with the same-named file in English_Articles and
# write the entities the two texts share to all_common_entities_eng_af.csv.
folder1_path = "/content/drive/MyDrive/thesis-data/Articles_Translated_SD/Afrikaans_Articles"
folder2_path = "/content/drive/MyDrive/thesis-data/Articles_Translated_SD/English_Articles"

output_csv = "/content/drive/MyDrive/thesis-data/all_common_entities_eng_af.csv"


with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["File Name", "Common Entity", "Count"])

    # os.listdir() makes no ordering promise; sorting keeps the CSV rows in the
    # same order on every run.
    for file_name in sorted(os.listdir(folder1_path)):
        file1 = os.path.join(folder1_path, file_name)
        file2 = os.path.join(folder2_path, file_name)

        # Skip names that are not regular files in both folders.
        if not (os.path.isfile(file1) and os.path.isfile(file2)):
            continue

        text1 = read_text_file(file1)
        text2 = read_text_file(file2)

        # Extract entities, then count them under their canonical Wikipedia titles.
        entities1 = process_entities(extract_entities(text1))
        entities2 = process_entities(extract_entities(text2))

        # Fold near-duplicate spellings together (Levenshtein similarity ratio).
        entities1 = merge_similar_entities(entities1)
        entities2 = merge_similar_entities(entities2)

        # Counter intersection keeps keys present in both, with the smaller count.
        common_entities = entities1 & entities2

        # Keep only entities whose shared count exceeds one.
        filtered_common_entities = {entity: count for entity, count in common_entities.items() if count > 1}

        for entity, count in filtered_common_entities.items():
            writer.writerow([file_name, entity, count])

## **Chinese and Afrikaans Entities**

In [49]:
# Pair every file in Chinese_Articles with the same-named file in Afrikaans_Articles and
# write the entities the two texts share to all_common_entities_zh_af.csv.
folder1_path = "/content/drive/MyDrive/thesis-data/Articles_Translated_SD/Chinese_Articles"
folder2_path = "/content/drive/MyDrive/thesis-data/Articles_Translated_SD/Afrikaans_Articles"

output_csv = "/content/drive/MyDrive/thesis-data/all_common_entities_zh_af.csv"


with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["File Name", "Common Entity", "Count"])

    # os.listdir() makes no ordering promise; sorting keeps the CSV rows in the
    # same order on every run.
    for file_name in sorted(os.listdir(folder1_path)):
        file1 = os.path.join(folder1_path, file_name)
        file2 = os.path.join(folder2_path, file_name)

        # Skip names that are not regular files in both folders.
        if not (os.path.isfile(file1) and os.path.isfile(file2)):
            continue

        text1 = read_text_file(file1)
        text2 = read_text_file(file2)

        # Extract entities, then count them under their canonical Wikipedia titles.
        entities1 = process_entities(extract_entities(text1))
        entities2 = process_entities(extract_entities(text2))

        # Fold near-duplicate spellings together (Levenshtein similarity ratio).
        entities1 = merge_similar_entities(entities1)
        entities2 = merge_similar_entities(entities2)

        # Counter intersection keeps keys present in both, with the smaller count.
        common_entities = entities1 & entities2

        # Keep only entities whose shared count exceeds one.
        filtered_common_entities = {entity: count for entity, count in common_entities.items() if count > 1}

        for entity, count in filtered_common_entities.items():
            writer.writerow([file_name, entity, count])

## **Chinese and Hindi Entities**

In [50]:
# Pair every file in Hindi_Articles with the same-named file in Chinese_Articles and
# write the entities the two texts share to all_common_entities_zh_hi.csv.
folder1_path = "/content/drive/MyDrive/thesis-data/Articles_Translated_SD/Hindi_Articles"
folder2_path = "/content/drive/MyDrive/thesis-data/Articles_Translated_SD/Chinese_Articles"

output_csv = "/content/drive/MyDrive/thesis-data/all_common_entities_zh_hi.csv"


with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["File Name", "Common Entity", "Count"])

    # os.listdir() makes no ordering promise; sorting keeps the CSV rows in the
    # same order on every run.
    for file_name in sorted(os.listdir(folder1_path)):
        file1 = os.path.join(folder1_path, file_name)
        file2 = os.path.join(folder2_path, file_name)

        # Skip names that are not regular files in both folders.
        if not (os.path.isfile(file1) and os.path.isfile(file2)):
            continue

        text1 = read_text_file(file1)
        text2 = read_text_file(file2)

        # Extract entities, then count them under their canonical Wikipedia titles.
        entities1 = process_entities(extract_entities(text1))
        entities2 = process_entities(extract_entities(text2))

        # Fold near-duplicate spellings together (Levenshtein similarity ratio).
        entities1 = merge_similar_entities(entities1)
        entities2 = merge_similar_entities(entities2)

        # Counter intersection keeps keys present in both, with the smaller count.
        common_entities = entities1 & entities2

        # Keep only entities whose shared count exceeds one.
        filtered_common_entities = {entity: count for entity, count in common_entities.items() if count > 1}

        for entity, count in filtered_common_entities.items():
            writer.writerow([file_name, entity, count])

## **Afrikaans and Hindi Entities**

In [51]:
# Pair every file in Hindi_Articles with the same-named file in Afrikaans_Articles and
# write the entities the two texts share to all_common_entities_af_hi.csv.
folder1_path = "/content/drive/MyDrive/thesis-data/Articles_Translated_SD/Hindi_Articles"
folder2_path = "/content/drive/MyDrive/thesis-data/Articles_Translated_SD/Afrikaans_Articles"

output_csv = "/content/drive/MyDrive/thesis-data/all_common_entities_af_hi.csv"


with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["File Name", "Common Entity", "Count"])

    # os.listdir() makes no ordering promise; sorting keeps the CSV rows in the
    # same order on every run.
    for file_name in sorted(os.listdir(folder1_path)):
        file1 = os.path.join(folder1_path, file_name)
        file2 = os.path.join(folder2_path, file_name)

        # Skip names that are not regular files in both folders.
        if not (os.path.isfile(file1) and os.path.isfile(file2)):
            continue

        text1 = read_text_file(file1)
        text2 = read_text_file(file2)

        # Extract entities, then count them under their canonical Wikipedia titles.
        entities1 = process_entities(extract_entities(text1))
        entities2 = process_entities(extract_entities(text2))

        # Fold near-duplicate spellings together (Levenshtein similarity ratio).
        entities1 = merge_similar_entities(entities1)
        entities2 = merge_similar_entities(entities2)

        # Counter intersection keeps keys present in both, with the smaller count.
        common_entities = entities1 & entities2

        # Keep only entities whose shared count exceeds one.
        filtered_common_entities = {entity: count for entity, count in common_entities.items() if count > 1}

        for entity, count in filtered_common_entities.items():
            writer.writerow([file_name, entity, count])