In [1]:
import json
import nltk
import csv
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import wordnet as wn
from scipy.stats import ttest_ind

# Fetch the NLTK resources used by the tokenizer, the POS tagger, and the
# WordNet lookups further down in this script.
for _nltk_resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(_nltk_resource)

# Read the ontology JSON into memory; the lookup helpers iterate it as a
# mapping whose keys are English words.
with open('E:/updatetask1/hiontoly/surgury/combinedontology1.json', 'r', encoding='utf-8') as file:
    json_data = json.loads(file.read())

# Function to map Arabic words to English
def find_english_equivalent(arabic_word, json_data):
    """Return the first key of *json_data* whose synonyms contain *arabic_word*.

    Entries are scanned in the iteration order of ``json_data``. An entry is
    considered only when it has a ``"semantic_relations"`` member that itself
    has a ``"synonyms"`` member. Returns ``None`` when no entry matches.
    """
    matching_keys = (
        key
        for key, entry in json_data.items()
        if "semantic_relations" in entry
        and "synonyms" in entry["semantic_relations"]
        and arabic_word in entry["semantic_relations"]["synonyms"]
    )
    # The generator is lazy, so scanning stops at the first match.
    return next(matching_keys, None)

def tokenize_arabic(phrase):
    """Split *phrase* into tokens by delegating to NLTK's ``word_tokenize``.

    NOTE(review): ``word_tokenize`` is called with its default arguments, so
    nothing Arabic-specific is configured here -- confirm that Arabic
    punctuation gets separated from words the way the ontology lookup expects.
    """
    tokens = word_tokenize(phrase)
    return tokens

def find_english_equivalents(arabic_words, json_data):
    """Look up every word with ``find_english_equivalent`` and keep the hits.

    The result preserves the order of *arabic_words*; words whose lookup
    returns a falsy value (e.g. ``None``) are dropped.
    """
    lookups = (find_english_equivalent(token, json_data) for token in arabic_words)
    return [match for match in lookups if match]

# Function to convert tagged words to synsets
def tagged_to_synset(word, tag):
    """Return the first WordNet synset for *word* under the POS implied by *tag*.

    Args:
        word: Token to look up in WordNet.
        tag: POS tag string. Only its first character is used
            (case-insensitively): ``N`` noun, ``V`` verb, ``R`` adverb,
            ``J`` adjective.

    Returns:
        The first element of ``wn.synsets(word, pos)``, or ``None`` when the
        tag is empty, does not map to one of the four POS classes above, or
        WordNet has no synset for the word under that POS.
    """
    # Slice instead of index so an empty tag maps to None rather than raising
    # IndexError.
    wn_tag = {'N': wn.NOUN, 'V': wn.VERB, 'R': wn.ADV, 'J': wn.ADJ}.get(tag[:1].upper())
    if not wn_tag:
        return None
    # Query WordNet once and reuse the result instead of looking it up twice.
    synsets = wn.synsets(word, wn_tag)
    return synsets[0] if synsets else None

# Function to calculate sentence similarity
def sentence_similarity(sentence1, sentence2):
    """Score how similar *sentence1* is to *sentence2* using WordNet paths.

    Both sentences are tokenized and POS-tagged, and each tagged token is
    mapped to a synset with ``tagged_to_synset`` (tokens without a synset are
    ignored). For every synset of the first sentence the highest
    ``path_similarity`` against the synsets of the second sentence is taken,
    and those best scores are averaged.

    The measure is directional: only synsets of *sentence1* are iterated, so
    swapping the arguments can change the result.

    Args:
        sentence1: First sentence as a string.
        sentence2: Second sentence as a string.

    Returns:
        The mean of the per-synset best scores, or ``0`` when no synset of
        *sentence1* has a comparable synset in *sentence2* (including when
        either sentence yields no synsets at all).
    """
    tagged1 = pos_tag(word_tokenize(sentence1))
    tagged2 = pos_tag(word_tokenize(sentence2))

    synsets1 = [ss for ss in (tagged_to_synset(*pair) for pair in tagged1) if ss]
    synsets2 = [ss for ss in (tagged_to_synset(*pair) for pair in tagged2) if ss]

    score, count = 0.0, 0

    for synset in synsets1:
        raw_scores = [synset.path_similarity(other) for other in synsets2]
        # path_similarity can return None when no path connects two synsets;
        # max() over a mix of None and floats raises TypeError, so drop the
        # None entries before picking the best score.
        comparable = [value for value in raw_scores if value is not None]
        if comparable:
            score += max(comparable)
            count += 1

    if count == 0:
        return 0

    return score / count

# Accumulators for per-pair similarity scores, split by the dataset label:
# scores of pairs labelled 1 go to the first list, all other scores to the
# second. Both are appended to by evaluate_similarity_from_csv.
similarity_scores_similar, similarity_scores_not_similar = [], []

# Function to evaluate similarity from a CSV file and store scores
def evaluate_similarity_from_csv(csv_file_path, threshold=0.5, max_rows=15717):
    """Evaluate the similarity pipeline against a labelled CSV of phrase pairs.

    Each row must provide the columns ``question1``, ``question2`` and
    ``label``. Both phrases are tokenized, mapped word-by-word to English via
    the ontology in ``json_data``, and compared with ``sentence_similarity``.
    A pair is predicted as similar (1) when its score is ``>= threshold``.

    Side effect: every score is appended to the module-level list
    ``similarity_scores_similar`` when the row's label is 1, and to
    ``similarity_scores_not_similar`` otherwise.

    Args:
        csv_file_path: Path of the UTF-8 encoded CSV file to read.
        threshold: Minimum similarity score for a positive prediction.
        max_rows: Stop once this many rows have been counted in the confusion
            matrix. The check runs after a row is processed, so at least one
            row is always read. Pass ``None`` to process the whole file.
            Rows whose label is neither 0 nor 1 are not counted.

    Returns:
        A tuple ``(precision, recall, f1_score, accuracy)``; each metric is
        ``0`` when its denominator would be zero.

    Raises:
        KeyError: If a required column is missing from a row.
        ValueError: If a ``label`` value cannot be converted to ``int``.
    """
    TP = FP = FN = TN = 0
    # newline='' lets the csv module do its own newline handling, which is
    # required for quoted fields that contain line breaks.
    with open(csv_file_path, encoding='utf-8', newline='') as file:
        reader = csv.DictReader(file)
        for row in reader:
            arabic_phrase1 = row['question1']
            arabic_phrase2 = row['question2']
            actual_label = int(row['label'])

            arabic_words1 = tokenize_arabic(arabic_phrase1)
            arabic_words2 = tokenize_arabic(arabic_phrase2)

            english_sentence1 = " ".join(find_english_equivalents(arabic_words1, json_data))
            english_sentence2 = " ".join(find_english_equivalents(arabic_words2, json_data))

            similarity_score = sentence_similarity(english_sentence1, english_sentence2)
            predicted_label = 1 if similarity_score >= threshold else 0

            # Update the confusion matrix; labels other than 0/1 fall through
            # without being counted.
            if predicted_label == actual_label == 1:
                TP += 1
            elif predicted_label == 1 and actual_label == 0:
                FP += 1
            elif predicted_label == 0 and actual_label == 1:
                FN += 1
            elif predicted_label == actual_label == 0:
                TN += 1

            # Keep the raw score, grouped by the gold label.
            if actual_label == 1:
                similarity_scores_similar.append(similarity_score)
            else:
                similarity_scores_not_similar.append(similarity_score)

            # Optional cap on the number of evaluated rows.
            if max_rows is not None and TP + FP + FN + TN >= max_rows:
                break

    total = TP + FP + FN + TN
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    accuracy = (TP + TN) / total if total > 0 else 0

    return precision, recall, f1_score, accuracy

# Location of the labelled phrase-pair dataset (CSV).
csv_file_path = 'E:/coding/constructwordnet2/constrcutwordnet2/AREP 12 REALTION/new work bit wordnet and dataset/dataset/up/combined.csv'

# Run the evaluation with the default threshold and report the four metrics,
# one per line.
precision, recall, f1_score, accuracy = evaluate_similarity_from_csv(csv_file_path)
print(
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-Score: {f1_score}",
    f"Accuracy: {accuracy}",
    sep="\n",
)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Orbit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Orbit\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Orbit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Precision: 0.5757296466973887
Recall: 0.7902727017149284
F1-Score: 0.6661532081284436
Accuracy: 0.6411513723492326
