In [1]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive

Mounted at /content/drive
/content/drive/MyDrive


In [2]:
import re
import csv
import nltk
import statistics

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')

from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords, wordnet
from nltk.translate.bleu_score import sentence_bleu

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


In [3]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))


def text_cleaning(text: str) -> str:
    text = str(text)

    text = text.encode("ascii", "ignore").decode()

    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip().lower()

    # Tokenization
    words = nltk.word_tokenize(text)

    # POS tagging and Lemmatization
    pos_tags = nltk.pos_tag(words)
    pos_map = {'N': wordnet.NOUN, 'V': wordnet.VERB, 'R': wordnet.ADV, 'J': wordnet.ADJ}
    lemmatized_words = [lemmatizer.lemmatize(word, pos_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tags]

    # Stemming
    stemmed_words = [stemmer.stem(word) for word in lemmatized_words]

    return ' '.join(stemmed_words)

In [4]:
column_manual = []
column_llama = []
column_qwen = []
column_qwen_coder = []


with open('input-v2.csv', 'r', encoding='utf-8') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader)
    idx = 0
    for row in csv_reader:
        column_manual.append(text_cleaning(row[1]))
        column_llama.append(text_cleaning(row[2]))
        column_qwen.append(text_cleaning(row[3]))
        column_qwen_coder.append(text_cleaning(row[4]))

In [5]:
def compute_word_stats(columns):
    results = {}
    for model_name, column in columns.items():
        word_counts = [len(sentence.split()) for sentence in column]
        average = sum(word_counts) / len(column)
        median = statistics.median(word_counts)
        results[model_name] = {"average": average, "median": median}
        print(f"{model_name} - Average Words: {average}, Median Words: {median}")
    return results

results_word_stats = compute_word_stats({"Manual": column_manual, "Llama": column_llama, "Qwen": column_qwen, "Qwen-Coder": column_qwen_coder})

Manual - Average Words: 9.562162162162162, Median Words: 6
Llama - Average Words: 21.686486486486487, Median Words: 7
Qwen - Average Words: 26.216216216216218, Median Words: 4
Qwen-Coder - Average Words: 9.172972972972973, Median Words: 2


In [6]:
from nltk.translate.bleu_score import sentence_bleu

def calculate_avg_median(column_manual, column_model, weights):
    scores = [sentence_bleu([column_manual[i]], column_model[i], weights=weights) for i in range(len(column_manual))]
    average = sum(scores) / len(scores)
    median = statistics.median(scores)
    return average, median, scores

def compute_bleu_scores(column_manual, column_llama, column_qwen, column_qwen_coder):
    models = {
        "Llama": column_llama,
        "Qwen": column_qwen,
        "Qwen-Coder": column_qwen_coder
    }
    weights_list = {
        "BLEU-1": (1.0, 0, 0, 0),
        "BLEU-2": (0.5, 0.5),
        "BLEU-3": (0.33, 0.33, 0.33),
        "BLEU-4": (0.25, 0.25, 0.25, 0.25)
    }

    results = {}
    for model_name, column_model in models.items():
        results[model_name] = {}
        for bleu_name, weights in weights_list.items():
            avg, median, scores = calculate_avg_median(column_manual, column_model, weights)
            results[model_name][bleu_name] = {"average": avg, "median": median, "scores": scores}
            print(f"{model_name} {bleu_name} - Average: {avg}, Median: {median}")

    return results

results = compute_bleu_scores(column_manual, column_llama, column_qwen, column_qwen_coder)

Llama BLEU-1 - Average: 0.3761302290713021, Median: 0.2564102564102564
Llama BLEU-2 - Average: 0.3542156929620382, Median: 0.20341905108624314


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Llama BLEU-3 - Average: 0.33926807259834274, Median: 0.15848174614372246
Llama BLEU-4 - Average: 0.3226516964486028, Median: 0.11393021866798646
Qwen BLEU-1 - Average: 0.26502190859506336, Median: 0.03605965440426316
Qwen BLEU-2 - Average: 0.2518334127869017, Median: 0.028541978659718838
Qwen BLEU-3 - Average: 0.24017202412291924, Median: 0.011543196283243247
Qwen BLEU-4 - Average: 0.22734022792602443, Median: 1.186881559607099e-78
Qwen-Coder BLEU-1 - Average: 0.3162565759690838, Median: 0
Qwen-Coder BLEU-2 - Average: 0.30394844185169995, Median: 0
Qwen-Coder BLEU-3 - Average: 0.2946234620390299, Median: 0
Qwen-Coder BLEU-4 - Average: 0.2851266338480871, Median: 0


In [7]:
def calculate_bleu_stats(bleu, llama_bleu_scores, qwen_bleu_scores, qwen_coder_bleu_scores, column_manual, column_llama, column_qwen, column_qwen_coder):
    def calculate_filtered_stats(condition):
        filtered_word_counts = {"manual": [], "llama": [], "qwen": [], "qwen_coder": []}

        for i in range(len(llama_bleu_scores)):
            if condition(llama_bleu_scores[i], qwen_bleu_scores[i], qwen_coder_bleu_scores[i]):
                filtered_word_counts["manual"].append(len(column_manual[i].split()))
                filtered_word_counts["llama"].append(len(column_llama[i].split()))
                filtered_word_counts["qwen"].append(len(column_qwen[i].split()))
                filtered_word_counts["qwen_coder"].append(len(column_qwen_coder[i].split()))

        count = len(filtered_word_counts["manual"])
        if count == 0:
            print("No matching entries found.")
            return

        print(count)
        for key in filtered_word_counts:
            print("Average ", key, ":", sum(filtered_word_counts[key]) / count)
            print("Median ", key, ":", statistics.median(filtered_word_counts[key]))

    print(f"## Low BLEU-{bleu} Scores (<0.3)")
    calculate_filtered_stats(lambda l, q, qc: l < 0.3 and q < 0.3 and qc < 0.3)

    print("-----------------------------------------------------")

    print(f"## High BLEU-{bleu} Scores (>0.5)")
    calculate_filtered_stats(lambda l, q, qc: l > 0.5 and q > 0.5 and qc > 0.5)


calculate_bleu_stats(2, results['Llama']['BLEU-2']['scores'], results['Qwen']['BLEU-2']['scores'], results['Qwen-Coder']['BLEU-2']['scores'], column_manual, column_llama, column_qwen, column_qwen_coder)

print("\n#######################################################\n")

calculate_bleu_stats(4, results['Llama']['BLEU-4']['scores'], results['Qwen']['BLEU-4']['scores'], results['Qwen-Coder']['BLEU-4']['scores'], column_manual, column_llama, column_qwen, column_qwen_coder)

## Low BLEU-2 Scores (<0.3)
93
Average  manual : 7.354838709677419
Median  manual : 3
Average  llama : 24.731182795698924
Median  llama : 2
Average  qwen : 25.473118279569892
Median  qwen : 0
Average  qwen_coder : 11.129032258064516
Median  qwen_coder : 0
-----------------------------------------------------
## High BLEU-2 Scores (>0.5)
26
Average  manual : 9.538461538461538
Median  manual : 7.0
Average  llama : 10.461538461538462
Median  llama : 8.5
Average  qwen : 10.153846153846153
Median  qwen : 7.0
Average  qwen_coder : 9.846153846153847
Median  qwen_coder : 8.0

#######################################################

## Low BLEU-4 Scores (<0.3)
99
Average  manual : 9.777777777777779
Median  manual : 4
Average  llama : 26.454545454545453
Median  llama : 2
Average  qwen : 25.585858585858585
Median  qwen : 0
Average  qwen_coder : 10.707070707070708
Median  qwen_coder : 0
-----------------------------------------------------
## High BLEU-4 Scores (>0.5)
24
Average  manual : 9.875
Me