In [2]:
import re
import csv
import nltk
import statistics

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')

from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords, wordnet
from nltk.translate.bleu_score import sentence_bleu

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


In [3]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))


def text_cleaning(text: str) -> str:
    text = str(text)

    text = text.encode("ascii", "ignore").decode()

    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip().lower()

    # Tokenization
    words = nltk.word_tokenize(text)

    # POS tagging and Lemmatization
    pos_tags = nltk.pos_tag(words)
    pos_map = {'N': wordnet.NOUN, 'V': wordnet.VERB, 'R': wordnet.ADV, 'J': wordnet.ADJ}
    lemmatized_words = [lemmatizer.lemmatize(word, pos_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tags]

    # Stemming
    stemmed_words = [stemmer.stem(word) for word in lemmatized_words]

    return ' '.join(stemmed_words)

In [4]:
column_manual = []
column_llama = []
column_qwen = []
column_qwen_coder = []


with open('input-v1.csv', 'r', encoding='utf-8') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader)
    idx = 0
    for row in csv_reader:
        column_manual.append(text_cleaning(row[1]))
        column_llama.append(text_cleaning(row[2]))
        column_qwen.append(text_cleaning(row[3]))
        column_qwen_coder.append(text_cleaning(row[4]))

In [5]:
def compute_word_stats(columns):
    results = {}
    for model_name, column in columns.items():
        word_counts = [len(sentence.split()) for sentence in column]
        average = sum(word_counts) / len(column)
        median = statistics.median(word_counts)
        results[model_name] = {"average": average, "median": median}
        print(f"{model_name} - Average Words: {average}, Median Words: {median}")
    return results

results_word_stats = compute_word_stats({"Manual": column_manual, "Llama": column_llama, "Qwen": column_qwen, "Qwen-Coder": column_qwen_coder})

Manual - Average Words: 8.58252427184466, Median Words: 5.5
Llama - Average Words: 27.87864077669903, Median Words: 11.5
Qwen - Average Words: 14.266990291262136, Median Words: 2.5
Qwen-Coder - Average Words: 9.37864077669903, Median Words: 0.0


In [6]:
from nltk.translate.bleu_score import sentence_bleu

def calculate_avg_median(column_manual, column_model, weights):
    scores = [sentence_bleu([column_manual[i]], column_model[i], weights=weights) for i in range(len(column_manual))]
    average = sum(scores) / len(scores)
    median = statistics.median(scores)
    return average, median, scores

def compute_bleu_scores(column_manual, column_llama, column_qwen, column_qwen_coder):
    models = {
        "Llama": column_llama,
        "Qwen": column_qwen,
        "Qwen-Coder": column_qwen_coder
    }
    weights_list = {
        "BLEU-1": (1.0, 0, 0, 0),
        "BLEU-2": (0.5, 0.5),
        "BLEU-3": (0.33, 0.33, 0.33),
        "BLEU-4": (0.25, 0.25, 0.25, 0.25)
    }

    results = {}
    for model_name, column_model in models.items():
        results[model_name] = {}
        for bleu_name, weights in weights_list.items():
            avg, median, scores = calculate_avg_median(column_manual, column_model, weights)
            results[model_name][bleu_name] = {"average": avg, "median": median, "scores": scores}
            print(f"{model_name} {bleu_name} - Average: {avg}, Median: {median}")

    return results

results = compute_bleu_scores(column_manual, column_llama, column_qwen, column_qwen_coder)

Llama BLEU-1 - Average: 0.38870794788165225, Median: 0.2058920887123469
Llama BLEU-2 - Average: 0.3736304205514204, Median: 0.17588800418798836


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Llama BLEU-3 - Average: 0.36707843454630623, Median: 0.16743201732252183
Llama BLEU-4 - Average: 0.3553639246232379, Median: 0.1481395615151846
Qwen BLEU-1 - Average: 0.35986402352747426, Median: 0.0
Qwen BLEU-2 - Average: 0.3532078173804647, Median: 0.0
Qwen BLEU-3 - Average: 0.34836831071746893, Median: 0.0
Qwen BLEU-4 - Average: 0.3399081574608633, Median: 0.0
Qwen-Coder BLEU-1 - Average: 0.2918088141848435, Median: 0.0
Qwen-Coder BLEU-2 - Average: 0.2844759601774713, Median: 0.0
Qwen-Coder BLEU-3 - Average: 0.27998196190208696, Median: 0.0
Qwen-Coder BLEU-4 - Average: 0.276757810163761, Median: 0.0


In [7]:
def calculate_bleu_stats(bleu, llama_bleu_scores, qwen_bleu_scores, qwen_coder_bleu_scores, column_manual, column_llama, column_qwen, column_qwen_coder):
    def calculate_filtered_stats(condition):
        filtered_word_counts = {"manual": [], "llama": [], "qwen": [], "qwen_coder": []}

        for i in range(len(llama_bleu_scores)):
            if condition(llama_bleu_scores[i], qwen_bleu_scores[i], qwen_coder_bleu_scores[i]):
                filtered_word_counts["manual"].append(len(column_manual[i].split()))
                filtered_word_counts["llama"].append(len(column_llama[i].split()))
                filtered_word_counts["qwen"].append(len(column_qwen[i].split()))
                filtered_word_counts["qwen_coder"].append(len(column_qwen_coder[i].split()))

        count = len(filtered_word_counts["manual"])
        if count == 0:
            print("No matching entries found.")
            return

        print(count)
        for key in filtered_word_counts:
            print("Average ", key, ":", sum(filtered_word_counts[key]) / count)
            print("Median ", key, ":", statistics.median(filtered_word_counts[key]))

    print(f"## Low BLEU-{bleu} Scores (<0.3)")
    calculate_filtered_stats(lambda l, q, qc: l < 0.3 and q < 0.3 and qc < 0.3)

    print("-----------------------------------------------------")

    print(f"## High BLEU-{bleu} Scores (>0.5)")
    calculate_filtered_stats(lambda l, q, qc: l > 0.5 and q > 0.5 and qc > 0.5)


calculate_bleu_stats(2, results['Llama']['BLEU-2']['scores'], results['Qwen']['BLEU-2']['scores'], results['Qwen-Coder']['BLEU-2']['scores'], column_manual, column_llama, column_qwen, column_qwen_coder)

print("\n#######################################################\n")

calculate_bleu_stats(4, results['Llama']['BLEU-4']['scores'], results['Qwen']['BLEU-4']['scores'], results['Qwen-Coder']['BLEU-4']['scores'], column_manual, column_llama, column_qwen, column_qwen_coder)

## Low BLEU-2 Scores (<0.3)
98
Average  manual : 3.8877551020408165
Median  manual : 0.0
Average  llama : 36.03061224489796
Median  llama : 4.5
Average  qwen : 15.60204081632653
Median  qwen : 0.0
Average  qwen_coder : 10.60204081632653
Median  qwen_coder : 0.0
-----------------------------------------------------
## High BLEU-2 Scores (>0.5)
49
Average  manual : 11.877551020408163
Median  manual : 9
Average  llama : 13.306122448979592
Median  llama : 10
Average  qwen : 12.489795918367347
Median  qwen : 9
Average  qwen_coder : 12.244897959183673
Median  qwen_coder : 10

#######################################################

## Low BLEU-4 Scores (<0.3)
102
Average  manual : 4.147058823529412
Median  manual : 0.0
Average  llama : 35.833333333333336
Median  llama : 5.0
Average  qwen : 15.872549019607844
Median  qwen : 0.0
Average  qwen_coder : 10.264705882352942
Median  qwen_coder : 0.0
-----------------------------------------------------
## High BLEU-4 Scores (>0.5)
46
Average  manual