In [7]:
import pandas as pd
import numpy as np
from transformers import pipeline
import pycountry
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from tqdm import tqdm

In [5]:
df=pd.read_csv('Arab-Spring-Paper/Once more/final.csv')
df['Text']=df['Title']+' '+df['Abstract']
df['union_annotation'] = df['union_annotation'].apply(lambda x: eval(x) if isinstance(x, str) else x)
df['intersection_annotation'] = df['intersection_annotation'].apply(lambda x: eval(x) if isinstance(x, str) else x)

In [8]:
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("dslim/distilbert-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/distilbert-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)

# Function to extract locations and filter by country names
def extract_and_filter_locations(text):
    ner_results = nlp(text)
    locations = [
        entity["word"]
        for entity in ner_results
         if ( entity["entity"] == "B-LOC" or entity["entity"] == "I-LOC")
    ]

    final=coco.convert(list(set(locations)), to='ISO3') 
    if type(final)==str:
        final=[final]

    final=[x.lower() for x in final if x!='not found']
   
    return final

# Apply the function to the DataFrame with tqdm
tqdm.pandas()  # Initialize tqdm for pandas
import logging
import country_converter as coco
coco_logger = coco.logging.getLogger()
coco_logger.setLevel(logging.CRITICAL)
df["locations"] = df["Text"].progress_apply(extract_and_filter_locations)
# Display the result


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [21:20<00:00,  1.28s/it]


In [None]:
print('overall accuracy (union)', sum(df['locations']==df['union_annotation'])/1000)
print('overall accuracy (intersection)', sum(df['locations']==df['intersection_annotation'])/1000)

overall accuracy (union) 0.728
overall accuracy (intersection) 0.716


In [None]:
df.to_csv('Transformer/transformer1.csv', index=False)

In [None]:
sample_accuracies = df.groupby("SampleGroup").apply(
    lambda x: pd.Series({
        "accuracy_union": (x.apply(lambda row: set(row['locations']) == set(row['union_annotation']), axis=1).mean()),
        "accuracy_intersection": (x.apply(lambda row: set(row['locations']) == set(row['intersection_annotation']), axis=1).mean())
    })
).reset_index()

sample_accuracies = sample_accuracies.sort_values(by='accuracy_union').reset_index(drop=True)
sample_accuracies

  sample_accuracies = df.groupby("SampleGroup").apply(


Unnamed: 0,SampleGroup,accuracy_union,accuracy_intersection
0,with_mention_arab,0.6,0.58
1,with_mention,0.617857,0.567857
2,field_20,0.880769,0.884615


In [12]:
sample_accuracies = df.groupby("SampleGroup").apply(
    lambda x: pd.Series({
        "jaccard_union": (
            x.apply(
                lambda row: len(set(row['locations']).intersection(set(row['union_annotation']))) /
                            len(set(row['locations']).union(set(row['union_annotation'])))
                if len(set(row['locations']).union(set(row['union_annotation']))) > 0 else 1,
                axis=1
            ).mean()
        ),
        "jaccard_intersection": (
            x.apply(
                lambda row: len(set(row['locations']).intersection(set(row['intersection_annotation']))) /
                            len(set(row['locations']).union(set(row['intersection_annotation'])))
                if len(set(row['locations']).union(set(row['intersection_annotation']))) > 0 else 1,
                axis=1
            ).mean()
        )
    })
).reset_index()

sample_accuracies = sample_accuracies.sort_values(by='jaccard_union').reset_index(drop=True)
sample_accuracies

  sample_accuracies = df.groupby("SampleGroup").apply(


Unnamed: 0,SampleGroup,jaccard_union,jaccard_intersection
0,with_mention,0.707738,0.660774
1,with_mention_arab,0.774679,0.724198
2,field_20,0.897788,0.898109


In [13]:
sample_recalls = df.groupby("SampleGroup").apply(
    lambda x: pd.Series({
        "recall_union": (
            x.apply(
                lambda row: len(set(row['locations']) & set(row['union_annotation'])) / len(set(row['union_annotation'])) 
                if len(set(row['union_annotation'])) > 0 else 1,
                axis=1
            ).mean()
        ),
        "recall_intersection": (
            x.apply(
                lambda row: len(set(row['locations']) & set(row['intersection_annotation'])) / len(set(row['intersection_annotation'])) 
                if len(set(row['intersection_annotation'])) > 0 else 1,
                axis=1
            ).mean()
        )
    })
).reset_index()

sample_recalls = sample_recalls.sort_values(by='recall_union').reset_index(drop=True)
sample_recalls


  sample_recalls = df.groupby("SampleGroup").apply(


Unnamed: 0,SampleGroup,recall_union,recall_intersection
0,with_mention,0.787202,0.844583
1,with_mention_arab,0.925125,0.953154
2,field_20,0.943269,0.966346


In [16]:
field_20 = df.query("SampleGroup == 'field_20'")

# Compute TP, TN, FP, FN for each row
metrics = field_20.apply(lambda row: pd.Series({
    "TP": int(set(row['locations']) == set(row['union_annotation']) and len(row['union_annotation']) > 0),
    "TN": int(len(row['union_annotation']) == 0 and len(row['locations']) == 0),
    "FP": int(len(row['union_annotation']) == 0 and len(row['locations']) > 0),
    "FN": int(len(row['union_annotation']) > 0 and len(row['locations']) == 0)
}), axis=1)

# Summarize metrics
total_metrics = metrics.sum()

# Calculate precision, recall, and F1 score
precision = total_metrics["TP"] / (total_metrics["TP"] + total_metrics["FP"]) if (total_metrics["TP"] + total_metrics["FP"]) > 0 else 0
recall = total_metrics["TP"] / (total_metrics["TP"] + total_metrics["FN"]) if (total_metrics["TP"] + total_metrics["FN"]) > 0 else 0
f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

# Output results
{
    "Precision": precision,
    "Recall": recall,
    "F1 Score": f1_score
}

{'Precision': 0.746268656716418,
 'Recall': 0.6578947368421053,
 'F1 Score': 0.6993006993006994}

In [17]:
sample_recalls = df.groupby("SampleGroup").apply(
    lambda x: pd.Series({
        "recall_union": (
            x.apply(
                lambda row: pd.Series({
                    "TP": int(set(row['locations']) == set(row['union_annotation']) and len(row['union_annotation']) > 0),
                    "FN": int(len(row['union_annotation']) > 0 and len(row['locations']) == 0)
                }),
                axis=1
            ).sum(axis=0)
        )
    }).apply(
        lambda metrics: metrics["TP"] / (metrics["TP"] + metrics["FN"]) if (metrics["TP"] + metrics["FN"]) > 0 else 0
    )
).reset_index()

sample_recalls = sample_recalls.sort_values(by='recall_union').reset_index(drop=True)
sample_recalls


  sample_recalls = df.groupby("SampleGroup").apply(


Unnamed: 0,SampleGroup,recall_union
0,field_20,0.657895
1,with_mention,0.791667
2,with_mention_arab,0.939655
