In [None]:
import re
import csv
import nltk
import statistics

# Request every NLTK resource this notebook asks for, in a fixed order.
# nltk.download is called once per resource name, exactly as many times as before.
for nltk_resource in (
    'wordnet',
    'omw-1.4',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(nltk_resource)

from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords, wordnet
from nltk.translate.bleu_score import sentence_bleu

In [None]:
# Shared NLP helpers, built once so that every call to text_cleaning reuses them.
# NOTE(review): stop_words is not referenced by any cell visible here — confirm
# whether stop-word filtering was intended.
stop_words = {*stopwords.words('english')}
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


def text_cleaning(text: str) -> str:
    """Normalise a text into a single space-separated string of stems.

    The steps run in this order:
      1. coerce the input to ``str`` and drop every non-ASCII character;
      2. replace each character that is neither a word character nor
         whitespace with a space;
      3. collapse whitespace runs, trim both ends and lower-case;
      4. tokenise with ``nltk.word_tokenize`` and tag with ``nltk.pos_tag``;
      5. lemmatise each token, choosing the WordNet part of speech from the
         first letter of its tag (noun when the letter is not N/V/R/J);
      6. stem each lemma with the module-level ``stemmer``.

    Args:
        text: Value to clean; anything that is not a string is passed
            through ``str()`` first.

    Returns:
        The stems joined by single spaces (an empty string when no token
        is produced).
    """
    ascii_only = str(text).encode("ascii", "ignore").decode()
    without_punctuation = re.sub(r'[^\w\s]', ' ', ascii_only)
    normalised = re.sub(r'\s+', ' ', without_punctuation).strip().lower()

    # First letter of the tagger's tag -> WordNet part-of-speech constant.
    tag_letter_to_wordnet = {
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
        'J': wordnet.ADJ,
    }

    stems = []
    for token, tag in nltk.pos_tag(nltk.word_tokenize(normalised)):
        wordnet_pos = tag_letter_to_wordnet.get(tag[0], wordnet.NOUN)
        lemma = lemmatizer.lemmatize(token, wordnet_pos)
        stems.append(stemmer.stem(lemma))

    return ' '.join(stems)

In [None]:
# Cleaned texts, one entry per CSV data row. Index i refers to the same row in
# all four lists, so they can be compared position by position later on.
column_manual = []
column_llama = []
column_qwen = []
column_qwen_coder = []


# newline='' is what the csv module documentation asks for, so that newlines
# embedded in quoted fields are handed to the reader untouched.
with open('input-v2.csv', 'r', encoding='utf-8', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    # Skip the first row (presumably the header); the default keeps an empty
    # file from raising StopIteration.
    next(csv_reader, None)
    for row in csv_reader:
        if not row:
            # csv.reader yields an empty list for a blank line; skipping the
            # whole row keeps the four lists aligned.
            continue
        # Column 0 is not read. Columns 1-4 feed the manual, Llama, Qwen and
        # Qwen-Coder lists respectively.
        column_manual.append(text_cleaning(row[1]))
        column_llama.append(text_cleaning(row[2]))
        column_qwen.append(text_cleaning(row[3]))
        column_qwen_coder.append(text_cleaning(row[4]))

In [None]:
def compute_word_stats(columns):
    """Print and return the mean and median word count of each text column.

    A "word" is any whitespace-separated chunk of a sentence.

    Args:
        columns: Mapping from a label to a sequence of strings.

    Returns:
        A dict mapping each label to ``{"average": ..., "median": ...}``.
    """
    summary = {}
    for label, sentences in columns.items():
        lengths = []
        for sentence in sentences:
            lengths.append(len(sentence.split()))

        mean_words = sum(lengths) / len(sentences)
        median_words = statistics.median(lengths)

        print(f"{label} - Average Words: {mean_words}, Median Words: {median_words}")
        summary[label] = {"average": mean_words, "median": median_words}
    return summary

# Word-count summary for the manual texts and for each model's texts.
results_word_stats = compute_word_stats(
    {
        "Manual": column_manual,
        "Llama": column_llama,
        "Qwen": column_qwen,
        "Qwen-Coder": column_qwen_coder,
    }
)

In [None]:
from nltk.translate.bleu_score import sentence_bleu

def calculate_avg_median(column_manual, column_model, weights):
    """Score each model text against its reference with sentence-level BLEU.

    ``sentence_bleu`` works on token sequences. If it is handed raw strings it
    iterates over them character by character, so the n-grams it counts are
    character n-grams rather than word n-grams. String items are therefore
    split on whitespace before scoring; items that are already token
    sequences are passed through unchanged.

    Args:
        column_manual: Reference texts, one per example (strings or token
            sequences).
        column_model: Candidate texts, aligned index by index with
            ``column_manual``.
        weights: n-gram weights forwarded to ``sentence_bleu``.

    Returns:
        A tuple ``(average, median, scores)`` where ``scores`` holds one BLEU
        score per example, in input order.

    Raises:
        ValueError: If the two columns differ in length or are empty.
    """
    if len(column_manual) != len(column_model):
        raise ValueError(
            "column_manual and column_model must have the same length "
            f"(got {len(column_manual)} and {len(column_model)})"
        )
    if len(column_manual) == 0:
        raise ValueError("cannot compute BLEU statistics for empty columns")

    def _as_tokens(item):
        # Whitespace split matches how the cleaned texts are joined and how
        # word counts are computed elsewhere in this notebook.
        return item.split() if isinstance(item, str) else item

    scores = [
        sentence_bleu([_as_tokens(reference)], _as_tokens(candidate), weights=weights)
        for reference, candidate in zip(column_manual, column_model)
    ]
    average = sum(scores) / len(scores)
    median = statistics.median(scores)
    return average, median, scores

def compute_bleu_scores(column_manual, column_llama, column_qwen, column_qwen_coder):
    """Compute and print BLEU-1 to BLEU-4 statistics for each model column.

    Every model column is scored against ``column_manual`` through
    ``calculate_avg_median``, once per BLEU variant.

    Args:
        column_manual: Reference texts.
        column_llama: Llama texts, aligned with ``column_manual``.
        column_qwen: Qwen texts, aligned with ``column_manual``.
        column_qwen_coder: Qwen-Coder texts, aligned with ``column_manual``.

    Returns:
        A nested dict ``results[model_name][bleu_name]`` holding
        ``{"average": ..., "median": ..., "scores": [...]}``.
    """
    models = {
        "Llama": column_llama,
        "Qwen": column_qwen,
        "Qwen-Coder": column_qwen_coder
    }
    weights_list = {
        "BLEU-1": (1.0, 0, 0, 0),
        "BLEU-2": (0.5, 0.5),
        # Exact thirds: the previous (0.33, 0.33, 0.33) summed to 0.99, so the
        # score was not a proper weighted geometric mean of the precisions.
        "BLEU-3": (1 / 3, 1 / 3, 1 / 3),
        "BLEU-4": (0.25, 0.25, 0.25, 0.25)
    }

    results = {}
    for model_name, column_model in models.items():
        results[model_name] = {}
        for bleu_name, weights in weights_list.items():
            avg, median, scores = calculate_avg_median(column_manual, column_model, weights)
            results[model_name][bleu_name] = {"average": avg, "median": median, "scores": scores}
            print(f"{model_name} {bleu_name} - Average: {avg}, Median: {median}")

    return results

# BLEU statistics for every model, scored against the manual column.
results = compute_bleu_scores(
    column_manual=column_manual,
    column_llama=column_llama,
    column_qwen=column_qwen,
    column_qwen_coder=column_qwen_coder,
)

In [None]:
def calculate_bleu_stats(bleu, llama_bleu_scores, qwen_bleu_scores, qwen_coder_bleu_scores, column_manual, column_llama, column_qwen, column_qwen_coder, low_threshold=0.3, high_threshold=0.5):
    """Print word-count statistics for examples with uniformly low or high BLEU.

    Two groups are reported, in this order:
      * examples where all three models score strictly below ``low_threshold``;
      * examples where all three models score strictly above ``high_threshold``.

    For each group the number of matching examples is printed, followed by the
    average and median word count (whitespace split) of every text column. If
    a group is empty, "No matching entries found." is printed instead.

    Args:
        bleu: Label of the BLEU variant, used only in the printed headings.
        llama_bleu_scores: Per-example BLEU scores for Llama; its length
            determines how many examples are inspected.
        qwen_bleu_scores: Per-example BLEU scores for Qwen.
        qwen_coder_bleu_scores: Per-example BLEU scores for Qwen-Coder.
        column_manual: Manual texts, aligned with the score lists.
        column_llama: Llama texts, aligned with the score lists.
        column_qwen: Qwen texts, aligned with the score lists.
        column_qwen_coder: Qwen-Coder texts, aligned with the score lists.
        low_threshold: Upper bound (exclusive) of the "low" group.
        high_threshold: Lower bound (exclusive) of the "high" group.

    Returns:
        None. The function only prints.
    """
    # Insertion order fixes the order in which the columns are reported.
    columns = {
        "manual": column_manual,
        "llama": column_llama,
        "qwen": column_qwen,
        "qwen_coder": column_qwen_coder,
    }

    def calculate_filtered_stats(condition):
        # Indices of the examples whose three scores satisfy the condition.
        selected = [
            i for i in range(len(llama_bleu_scores))
            if condition(llama_bleu_scores[i], qwen_bleu_scores[i], qwen_coder_bleu_scores[i])
        ]

        count = len(selected)
        if count == 0:
            print("No matching entries found.")
            return

        print(count)
        for key, column in columns.items():
            word_counts = [len(column[i].split()) for i in selected]
            print("Average ", key, ":", sum(word_counts) / count)
            print("Median ", key, ":", statistics.median(word_counts))

    print(f"## Low BLEU-{bleu} Scores (<{low_threshold})")
    calculate_filtered_stats(
        lambda llama, qwen, qwen_coder: llama < low_threshold and qwen < low_threshold and qwen_coder < low_threshold
    )

    print("-----------------------------------------------------")

    print(f"## High BLEU-{bleu} Scores (>{high_threshold})")
    calculate_filtered_stats(
        lambda llama, qwen, qwen_coder: llama > high_threshold and qwen > high_threshold and qwen_coder > high_threshold
    )


# Report the word-count breakdown for BLEU-2 and then BLEU-4, with a banner
# printed between the two reports.
for report_index, bleu_order in enumerate((2, 4)):
    if report_index > 0:
        print("\n#######################################################\n")

    bleu_key = f"BLEU-{bleu_order}"
    calculate_bleu_stats(
        bleu_order,
        results['Llama'][bleu_key]['scores'],
        results['Qwen'][bleu_key]['scores'],
        results['Qwen-Coder'][bleu_key]['scores'],
        column_manual,
        column_llama,
        column_qwen,
        column_qwen_coder,
    )