# In [5]:
import pandas as pd
from hashlib import sha256
from difflib import SequenceMatcher

def compute_hash(text):
    """Return the hexadecimal SHA-256 digest of ``text``.

    Args:
        text: String to hash; it is encoded as UTF-8 before hashing.

    Returns:
        The digest as a 64-character lowercase hexadecimal string.
    """
    hasher = sha256()
    hasher.update(text.encode('utf-8'))
    return hasher.hexdigest()

def is_token_set_match(a, b, threshold=0.9):
    """Decide whether two strings match, using a two-stage similarity check.

    Stage 1 compares the sets of whitespace-separated tokens: the number of
    shared tokens divided by the size of the larger token set must reach
    ``threshold``. Stage 2 then requires the character-level
    ``SequenceMatcher`` ratio of the full strings to reach ``threshold`` too.

    Args:
        a: First string.
        b: Second string.
        threshold: Minimum score, applied to both stages.

    Returns:
        True if both stages reach ``threshold``, otherwise False.
    """
    tokens_a = set(a.split())
    tokens_b = set(b.split())
    larger_size = max(len(tokens_a), len(tokens_b))

    # When neither string contains a token (both empty or whitespace-only)
    # the token overlap is undefined; dividing would raise ZeroDivisionError.
    # Skip stage 1 in that case and let the character-level ratio decide.
    if larger_size:
        overlap = len(tokens_a & tokens_b) / larger_size
        if overlap < threshold:
            return False

    return SequenceMatcher(None, a, b).ratio() >= threshold

# Load the TA1_TA2_Annotations dataset
ta_annotations = pd.read_csv('data/TA1_TA2_Annotations.csv')

# Keep only the rows whose 'Majority' label is 'Humour/Sarcasm'.
# Rows with a missing COUNTER_NARRATIVE are dropped as well: the .str
# accessors below leave NaN untouched, and compute_hash() would then fail
# because a float NaN has no .encode(). Such rows cannot match any text.
ta_humour_sarcasm = (
    ta_annotations[ta_annotations['Majority'] == 'Humour/Sarcasm']
    .dropna(subset=['COUNTER_NARRATIVE'])
    .copy()
)

# Normalize the text for comparison (lower-case, surrounding whitespace removed)
ta_humour_sarcasm['COUNTER_NARRATIVE'] = (
    ta_humour_sarcasm['COUNTER_NARRATIVE'].str.lower().str.strip()
)

# Hash the normalized counter narratives so exact matches can be found by
# comparing digests
ta_humour_sarcasm['CN_HASH'] = ta_humour_sarcasm['COUNTER_NARRATIVE'].apply(compute_hash)

# Load the train/test/dev splits of iconan and the train/test/val splits of
# iconanv2
intentconan_train = pd.read_csv('data/iconan/iconan_train.csv')
intentconan_test = pd.read_csv('data/iconan/iconan_test.csv')
intentconan_dev = pd.read_csv('data/iconan/iconan_dev.csv')
intentconanv2_train = pd.read_csv('data/iconanv2/train.csv')
intentconanv2_test = pd.read_csv('data/iconanv2/test.csv')
intentconanv2_val = pd.read_csv('data/iconanv2/val.csv')

# Stack the splits of each corpus into a single frame. Rows whose text column
# is missing are dropped: the .str accessors below would leave NaN in place
# and compute_hash() would then fail, since a float NaN has no .encode().
iconan = pd.concat(
    [intentconan_train, intentconan_test, intentconan_dev]
).dropna(subset=['counterSpeech'])
intentconanv2 = pd.concat(
    [intentconanv2_train, intentconanv2_test, intentconanv2_val]
).dropna(subset=['counterspeech'])

# Normalize the text (lower-case, surrounding whitespace removed). Note the
# column is spelled 'counterSpeech' in iconan and 'counterspeech' in iconanv2.
iconan['counterSpeech'] = iconan['counterSpeech'].str.lower().str.strip()
intentconanv2['counterspeech'] = intentconanv2['counterspeech'].str.lower().str.strip()

# Compute hashes for fast comparison
iconan['CS_HASH'] = iconan['counterSpeech'].apply(compute_hash)
intentconanv2['CS_HASH'] = intentconanv2['counterspeech'].apply(compute_hash)

def _collect_hash_matches(annotations, candidates, source, text_col, strategy_col):
    """Collect match records between annotated rows and one candidate corpus.

    For every row of ``annotations``, the rows of ``candidates`` whose
    'CS_HASH' equals the row's 'CN_HASH' are looked up and each pair is
    confirmed with ``is_token_set_match``.

    Args:
        annotations: Frame with 'CN_HASH', 'HATE_SPEECH' and
            'COUNTER_NARRATIVE' columns.
        candidates: Frame with a 'CS_HASH' column plus ``text_col`` and
            ``strategy_col``.
        source: Label stored under 'source' in every record.
        text_col: Name of the candidate column holding the counter speech.
        strategy_col: Name of the candidate column copied into 'strategy'.

    Returns:
        A list of dicts, one per confirmed pair, ordered by annotation row
        and then by candidate row.
    """
    found = []
    for _, row in annotations.iterrows():
        same_hash = candidates[candidates['CS_HASH'] == row['CN_HASH']]
        for _, candidate in same_hash.iterrows():
            # Candidates were selected by exact hash equality, so the two
            # texts are already identical after normalization (barring a
            # hash collision); this check is kept as a final guard.
            if is_token_set_match(row['COUNTER_NARRATIVE'], candidate[text_col]):
                found.append({
                    'source': source,
                    'HATE_SPEECH': row['HATE_SPEECH'],
                    'COUNTER_NARRATIVE': row['COUNTER_NARRATIVE'],
                    'Matched_counterSpeech': candidate[text_col],
                    'strategy': candidate[strategy_col],
                })
    return found


# Prepare to store matches
matches = []

# Check for matches in ICONAN (assuming cnType is the strategy column)
matches.extend(
    _collect_hash_matches(ta_humour_sarcasm, iconan, 'ICONAN', 'counterSpeech', 'cnType')
)

# Check for matches in intentconanv2 (assuming csType is the strategy column)
matches.extend(
    _collect_hash_matches(
        ta_humour_sarcasm, intentconanv2, 'intentconanv2', 'counterspeech', 'csType'
    )
)

# Convert matches to a DataFrame. The columns are listed explicitly so that
# an empty result still produces a frame (and a CSV header) with the expected
# columns instead of a column-less frame.
matches_df = pd.DataFrame(
    matches,
    columns=[
        'source',
        'HATE_SPEECH',
        'COUNTER_NARRATIVE',
        'Matched_counterSpeech',
        'strategy',
    ],
)

# Save matches to a CSV file (without the row index)
matches_df.to_csv('data/humour_sarcasm_matches.csv', index=False)


