# Step 1: Load the Excel File

In [2]:
import pandas as pd

def load_entity_data(file_path):
    """Read the PER, LOC and ORG entity lists from an Excel workbook.

    The workbook must contain sheets named ``PER``, ``LOC`` and ``ORG``,
    each with a column headed ``Entity``; that column is returned as a
    plain Python list.

    Args:
        file_path: Location of the workbook, passed to ``pandas.ExcelFile``.

    Returns:
        A tuple ``(per_data, loc_data, org_data)``.  Note the order: the
        LOC list comes *before* the ORG list.
    """
    # The context manager closes the workbook handle even if a sheet or
    # the 'Entity' column is missing and parsing raises.
    with pd.ExcelFile(file_path) as xls:
        # Blank cells are read as NaN; they can never equal a text phrase,
        # so they are dropped instead of being carried around in the lists.
        per_data, loc_data, org_data = (
            xls.parse(sheet_name)['Entity'].dropna().tolist()
            for sheet_name in ('PER', 'LOC', 'ORG')
        )

    return per_data, loc_data, org_data

# Example usage: read the three entity lists from the workbook.
# The unpacking order (PER, LOC, ORG) follows load_entity_data's return order.
file_path = 'ckner_ec.xlsx'  # Replace with your correct file path
per_data, loc_data, org_data = load_entity_data(file_path)

# Step 2: Define the Rule-Based Functions

In [3]:
def is_person(phrase, per_data):
    """Report whether ``phrase`` is an entry of the person list.

    Args:
        phrase: Candidate phrase to look up.
        per_data: Container of known person names; tested with ``in``.

    Returns:
        ``True`` if ``phrase`` is found in ``per_data``, else ``False``.
    """
    if phrase in per_data:
        return True
    return False

def is_organization(phrase, org_data):
    """Report whether ``phrase`` is an entry of the organization list.

    Args:
        phrase: Candidate phrase to look up.
        org_data: Container of known organization names; tested with ``in``.

    Returns:
        ``True`` if ``phrase`` is found in ``org_data``, else ``False``.
    """
    if phrase in org_data:
        return True
    return False

def is_location(phrase, loc_data):
    """Report whether ``phrase`` is an entry of the location list.

    Args:
        phrase: Candidate phrase to look up.
        loc_data: Container of known location names; tested with ``in``.

    Returns:
        ``True`` if ``phrase`` is found in ``loc_data``, else ``False``.
    """
    if phrase in loc_data:
        return True
    return False

def apply_rules_to_text(text, per_data, org_data, loc_data):
    """Tag the whitespace-separated tokens of ``text`` with BIO labels.

    The text is scanned left to right.  At each position the longest phrase
    (consecutive tokens joined by a single space) that exactly equals an
    entry of one of the entity lists is labelled ``B-<TYPE>`` on its first
    token and ``I-<TYPE>`` on the rest.  Scanning then resumes *after* the
    matched phrase, so entities never overlap and a shorter entry nested
    inside a longer match cannot overwrite its labels.

    When the same phrase occurs in several lists the priority is
    PER, then ORG, then LOC.

    Args:
        text: Input string; tokenized with ``str.split()``.
        per_data: Iterable of person names.
        org_data: Iterable of organization names.
        loc_data: Iterable of location names.

    Returns:
        A list of ``(token, label)`` tuples, one per token, where ``label``
        is ``'O'`` or ``'B-'``/``'I-'`` followed by ``PER``, ``ORG`` or
        ``LOC``.
    """
    tokens = text.split()
    labels = ['O'] * len(tokens)

    # Single phrase -> type lookup table.  It is filled from lowest to
    # highest priority so that PER overwrites ORG, which overwrites LOC.
    # Non-string entries (e.g. NaN from blank spreadsheet cells) can never
    # equal a phrase and are skipped.
    lexicon = {}
    for tag, entries in (('LOC', loc_data), ('ORG', org_data), ('PER', per_data)):
        for entry in entries:
            if isinstance(entry, str):
                lexicon[entry] = tag

    # A phrase of k tokens can only equal an entry that itself splits into
    # k tokens, so no candidate longer than the longest entry needs testing.
    max_span = max((len(entry.split()) for entry in lexicon), default=0)

    start = 0
    while start < len(tokens):
        longest_end = min(len(tokens), start + max_span)
        # Try the longest candidate first so multi-word entities win.
        for end in range(longest_end, start, -1):
            tag = lexicon.get(' '.join(tokens[start:end]))
            if tag is not None:
                labels[start:end] = ['B-' + tag] + ['I-' + tag] * (end - start - 1)
                start = end  # continue after the entity: no overlapping matches
                break
        else:
            start += 1

    return list(zip(tokens, labels))

# Step 3: Apply the Rules to a Text

In [7]:
# Sample text that includes multi-word entities
# Tag it with the loaded entity lists and print the (token, label) pairs.
sample_text = "مەهدی لە نەتەوە یەکگرتووەکان کار دەکات و لە سلێمانی و کۆمپانیای قەیوان گرووپ و کوردستان دەژی"
annotated_text = apply_rules_to_text(sample_text, per_data, org_data, loc_data)
print(annotated_text)

[('مەهدی', 'O'), ('لە', 'O'), ('نەتەوە', 'B-ORG'), ('یەکگرتووەکان', 'I-ORG'), ('کار', 'O'), ('دەکات', 'O'), ('و', 'O'), ('لە', 'O'), ('سلێمانی', 'B-LOC'), ('و', 'O'), ('کۆمپانیای', 'B-ORG'), ('قەیوان', 'B-ORG'), ('گرووپ', 'I-ORG'), ('و', 'O'), ('کوردستان', 'B-LOC'), ('دەژی', 'O')]


# Step 4: Save the Rule-Based Model

In [8]:
import pickle

def save_rule_based_model(per_data, org_data, loc_data, filename='rule_based_ner.pkl'):
    """Pickle the three entity lists into one file.

    The lists are stored in a dict under the keys ``'per_data'``,
    ``'org_data'`` and ``'loc_data'``.

    Args:
        per_data: Person names to store.
        org_data: Organization names to store.
        loc_data: Location names to store.
        filename: Destination path; opened in binary write mode.
    """
    keys = ('per_data', 'org_data', 'loc_data')
    values = (per_data, org_data, loc_data)
    payload = dict(zip(keys, values))

    with open(filename, 'wb') as out_file:
        pickle.dump(payload, out_file)

# Save the model: pickle the three lists to the default file 'rule_based_ner.pkl'
save_rule_based_model(per_data, org_data, loc_data)

# Step 5: Load and Use the Model

In [9]:
def load_rule_based_model(filename='rule_based_ner.pkl'):
    """Read back the entity lists stored in a pickle file.

    The file must unpickle to a mapping with the keys ``'per_data'``,
    ``'org_data'`` and ``'loc_data'``.

    NOTE(review): ``pickle.load`` can execute arbitrary code, so only open
    files from a trusted source.

    Args:
        filename: Path of the pickle file; opened in binary read mode.

    Returns:
        A tuple ``(per_data, org_data, loc_data)``.
    """
    with open(filename, 'rb') as in_file:
        stored = pickle.load(in_file)

    per_entries = stored['per_data']
    org_entries = stored['org_data']
    loc_entries = stored['loc_data']
    return per_entries, org_entries, loc_entries

# Load the model back from the default pickle file.
# The returned order is PER, ORG, LOC.
per_data, org_data, loc_data = load_rule_based_model()

# Full Implementation

In [12]:
import pandas as pd
import pickle

# Step 1: Load the entity data from the Excel file
def load_entity_data(file_path):
    """Read the PER, LOC and ORG entity lists from an Excel workbook.

    The workbook must contain sheets named ``PER``, ``LOC`` and ``ORG``,
    each with a column headed ``Entity``; that column is returned as a
    plain Python list.

    Args:
        file_path: Location of the workbook, passed to ``pandas.ExcelFile``.

    Returns:
        A tuple ``(per_data, loc_data, org_data)``.  Note the order: the
        LOC list comes *before* the ORG list.
    """
    # The context manager closes the workbook handle even if a sheet or
    # the 'Entity' column is missing and parsing raises.
    with pd.ExcelFile(file_path) as xls:
        # Blank cells are read as NaN; they can never equal a text phrase,
        # so they are dropped instead of being carried around in the lists.
        per_data, loc_data, org_data = (
            xls.parse(sheet_name)['Entity'].dropna().tolist()
            for sheet_name in ('PER', 'LOC', 'ORG')
        )
    return per_data, loc_data, org_data

# Step 2: Implement the rule-based functions
def is_person(phrase, per_data):
    """Report whether ``phrase`` is an entry of the person list.

    Args:
        phrase: Candidate phrase to look up.
        per_data: Container of known person names; tested with ``in``.

    Returns:
        ``True`` if ``phrase`` is found in ``per_data``, else ``False``.
    """
    if phrase in per_data:
        return True
    return False

def is_organization(phrase, org_data):
    """Report whether ``phrase`` is an entry of the organization list.

    Args:
        phrase: Candidate phrase to look up.
        org_data: Container of known organization names; tested with ``in``.

    Returns:
        ``True`` if ``phrase`` is found in ``org_data``, else ``False``.
    """
    if phrase in org_data:
        return True
    return False

def is_location(phrase, loc_data):
    """Report whether ``phrase`` is an entry of the location list.

    Args:
        phrase: Candidate phrase to look up.
        loc_data: Container of known location names; tested with ``in``.

    Returns:
        ``True`` if ``phrase`` is found in ``loc_data``, else ``False``.
    """
    if phrase in loc_data:
        return True
    return False

# Step 3: Apply the rules to a given text
def apply_rules_to_text(text, per_data, org_data, loc_data):
    """Tag the whitespace-separated tokens of ``text`` with BIO labels.

    The text is scanned left to right.  At each position the longest phrase
    (consecutive tokens joined by a single space) that exactly equals an
    entry of one of the entity lists is labelled ``B-<TYPE>`` on its first
    token and ``I-<TYPE>`` on the rest.  Scanning then resumes *after* the
    matched phrase, so entities never overlap and a shorter entry nested
    inside a longer match cannot overwrite its labels.

    When the same phrase occurs in several lists the priority is
    PER, then ORG, then LOC.

    Args:
        text: Input string; tokenized with ``str.split()``.
        per_data: Iterable of person names.
        org_data: Iterable of organization names.
        loc_data: Iterable of location names.

    Returns:
        A list of ``(token, label)`` tuples, one per token, where ``label``
        is ``'O'`` or ``'B-'``/``'I-'`` followed by ``PER``, ``ORG`` or
        ``LOC``.
    """
    tokens = text.split()
    labels = ['O'] * len(tokens)

    # Single phrase -> type lookup table.  It is filled from lowest to
    # highest priority so that PER overwrites ORG, which overwrites LOC.
    # Non-string entries (e.g. NaN from blank spreadsheet cells) can never
    # equal a phrase and are skipped.
    lexicon = {}
    for tag, entries in (('LOC', loc_data), ('ORG', org_data), ('PER', per_data)):
        for entry in entries:
            if isinstance(entry, str):
                lexicon[entry] = tag

    # A phrase of k tokens can only equal an entry that itself splits into
    # k tokens, so no candidate longer than the longest entry needs testing.
    max_span = max((len(entry.split()) for entry in lexicon), default=0)

    start = 0
    while start < len(tokens):
        longest_end = min(len(tokens), start + max_span)
        # Try the longest candidate first so multi-word entities win.
        for end in range(longest_end, start, -1):
            tag = lexicon.get(' '.join(tokens[start:end]))
            if tag is not None:
                labels[start:end] = ['B-' + tag] + ['I-' + tag] * (end - start - 1)
                start = end  # continue after the entity: no overlapping matches
                break
        else:
            start += 1

    return list(zip(tokens, labels))

# Step 4: Save the rule-based model
def save_rule_based_model(per_data, org_data, loc_data, filename='rule_based_ner.pkl'):
    """Pickle the three entity lists into one file.

    The lists are stored in a dict under the keys ``'per_data'``,
    ``'org_data'`` and ``'loc_data'``.

    Args:
        per_data: Person names to store.
        org_data: Organization names to store.
        loc_data: Location names to store.
        filename: Destination path; opened in binary write mode.
    """
    keys = ('per_data', 'org_data', 'loc_data')
    values = (per_data, org_data, loc_data)
    payload = dict(zip(keys, values))

    with open(filename, 'wb') as out_file:
        pickle.dump(payload, out_file)

# Step 5: Load the rule-based model
def load_rule_based_model(filename='rule_based_ner.pkl'):
    """Read back the entity lists stored in a pickle file.

    The file must unpickle to a mapping with the keys ``'per_data'``,
    ``'org_data'`` and ``'loc_data'``.

    NOTE(review): ``pickle.load`` can execute arbitrary code, so only open
    files from a trusted source.

    Args:
        filename: Path of the pickle file; opened in binary read mode.

    Returns:
        A tuple ``(per_data, org_data, loc_data)``.
    """
    with open(filename, 'rb') as in_file:
        stored = pickle.load(in_file)

    per_entries = stored['per_data']
    org_entries = stored['org_data']
    loc_entries = stored['loc_data']
    return per_entries, org_entries, loc_entries

# Example usage: read the three entity lists from the workbook.
# The unpacking order (PER, LOC, ORG) follows load_entity_data's return order.
file_path = 'ckner_ec.xlsx'  # Replace with your correct file path
per_data, loc_data, org_data = load_entity_data(file_path)

# Test with a sample Kurdish Sorani text:
# tag it with the loaded lists and print the (token, label) pairs.
sample_text = "ئاگا لە نەتەوە یەکگرتووەکان کار دەکات و لە کوردستان دەژی"
annotated_text = apply_rules_to_text(sample_text, per_data, org_data, loc_data)
print(annotated_text)

# Save the model: pickle the three lists to the default file 'rule_based_ner.pkl'
save_rule_based_model(per_data, org_data, loc_data)

[('ئاگا', 'B-PER'), ('لە', 'O'), ('نەتەوە', 'B-ORG'), ('یەکگرتووەکان', 'I-ORG'), ('کار', 'O'), ('دەکات', 'O'), ('و', 'O'), ('لە', 'O'), ('کوردستان', 'B-LOC'), ('دەژی', 'O')]
