In [2]:
import re
import csv
import nltk
import statistics

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')

from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords, wordnet
from nltk.translate.bleu_score import sentence_bleu

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


In [3]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))


def text_cleaning(text: str) -> str:
    text = str(text)

    text = text.encode("ascii", "ignore").decode()

    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip().lower()

    # Tokenization
    words = nltk.word_tokenize(text)

    # POS tagging and Lemmatization
    pos_tags = nltk.pos_tag(words)
    pos_map = {'N': wordnet.NOUN, 'V': wordnet.VERB, 'R': wordnet.ADV, 'J': wordnet.ADJ}
    lemmatized_words = [lemmatizer.lemmatize(word, pos_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tags]

    # Stemming
    stemmed_words = [stemmer.stem(word) for word in lemmatized_words]

    return ' '.join(stemmed_words)

In [4]:
column_manual = []
column_llama = []
column_qwen = []
column_qwen_coder = []


with open('input-v2.csv', 'r', encoding='utf-8') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader)
    idx = 0
    for row in csv_reader:
        column_manual.append(text_cleaning(row[1]))
        column_llama.append(text_cleaning(row[2]))
        column_qwen.append(text_cleaning(row[3]))
        column_qwen_coder.append(text_cleaning(row[4]))

In [5]:
word_counts = [len(sentence.split()) for sentence in column_manual]

average = sum(word_counts) / len(column_manual)
median = statistics.median(word_counts)

print("Manual")
print("Average:", average)
print("Median:", median)

Manual
Average: 10.027027027027026
Median: 6


In [6]:
word_counts = [len(sentence.split()) for sentence in column_llama]

average = sum(word_counts) / len(column_llama)
median = statistics.median(word_counts)

print("Llama")
print("Average:", average)
print("Median:", median)

Llama
Average: 22.075675675675676
Median: 7


In [7]:
word_counts = [len(sentence.split()) for sentence in column_qwen]

average = sum(word_counts) / len(column_qwen)
median = statistics.median(word_counts)

print("Qwen")
print("Average:", average)
print("Median:", median)

Qwen
Average: 27.57837837837838
Median: 4


In [8]:
word_counts = [len(sentence.split()) for sentence in column_qwen_coder]

average = sum(word_counts) / len(column_qwen_coder)
median = statistics.median(word_counts)

print("Qwen-Coder")
print("Average:", average)
print("Median:", median)

Qwen-Coder
Average: 10.102702702702702
Median: 2


In [9]:
# Llama-1
llama_scores = []
scores = []
for i in range(len(column_manual)):
    score = sentence_bleu([column_manual[i]], column_llama[i], weights=(1.0, 0, 0, 0))
    scores.append(score)

avg_score = sum(scores) / len(scores)
llama_scores.append(avg_score)
print(f"Average BLEU-1 score: {avg_score}")

Average BLEU-1 score: 0.5415688470844894


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [10]:
# Llama-2
scores = []
for i in range(len(column_manual)):
    score = sentence_bleu([column_manual[i]], column_llama[i], weights=(0.5, 0.5))
    scores.append(score)

llama_bleu_2_scores = scores

# Calculate the average BLEU score for the columns
avg_score = sum(scores) / len(scores)
llama_scores.append(avg_score)
print(f"Average BLEU-2 score: {avg_score}")

Average BLEU-2 score: 0.5010174284570097


In [11]:
# Llama-3
scores = []
for i in range(len(column_manual)):
    score = sentence_bleu([column_manual[i]], column_llama[i], weights=(0.33, 0.33, 0.33))
    scores.append(score)

# Calculate the average BLEU score for the columns
avg_score = sum(scores) / len(scores)
llama_scores.append(avg_score)
print(f"Average BLEU-3 score: {avg_score}")

Average BLEU-3 score: 0.48284531251353496


In [12]:
# Llama-4
scores = []
for i in range(len(column_manual)):
    score = sentence_bleu([column_manual[i]], column_llama[i], weights=(0.25, 0.25, 0.25, 0.25))
    scores.append(score)

llama_bleu_4_scores = scores

# Calculate the average BLEU score for the columns
avg_score = sum(scores) / len(scores)
llama_scores.append(avg_score)
print(f"Average BLEU-4 score: {avg_score}")

Average BLEU-4 score: 0.4660139828527355


In [13]:
# Qwen-1
scores = []
qwen_scores = []
for i in range(len(column_manual)):
    score = sentence_bleu([column_manual[i]], column_qwen[i], weights=(1.0, 0, 0, 0))
    scores.append(score)

# Calculate the average BLEU score for the columns
avg_score = sum(scores) / len(scores)
qwen_scores.append(avg_score)
print(f"Average BLEU-1 score: {avg_score}")

Average BLEU-1 score: 0.5071700429981955


In [14]:
# Qwen-2
scores = []
for i in range(len(column_manual)):
    score = sentence_bleu([column_manual[i]], column_qwen[i], weights=(0.5, 0.5))
    scores.append(score)

qwen_bleu_2_scores = scores

# Calculate the average BLEU score for the columns
avg_score = sum(scores) / len(scores)
qwen_scores.append(avg_score)
print(f"Average BLEU-2 score: {avg_score}")

Average BLEU-2 score: 0.46872755290887036


In [15]:
# Qwen-3
scores = []
for i in range(len(column_manual)):
    score = sentence_bleu([column_manual[i]], column_qwen[i], weights=(0.33, 0.33, 0.33))
    scores.append(score)

# Calculate the average BLEU score for the columns
avg_score = sum(scores) / len(scores)
qwen_scores.append(avg_score)
print(f"Average BLEU-3 score: {avg_score}")

Average BLEU-3 score: 0.45643686928477883


In [16]:
# Qwen-4
scores = []
for i in range(len(column_manual)):
    score = sentence_bleu([column_manual[i]], column_qwen[i], weights=(0.25, 0.25, 0.25, 0.25))
    scores.append(score)

qwen_bleu_4_scores = scores

# Calculate the average BLEU score for the columns
avg_score = sum(scores) / len(scores)
qwen_scores.append(avg_score)
print(f"Average BLEU-4 score: {avg_score}")

Average BLEU-4 score: 0.4435962706194622


In [17]:
# Qwen-Coder-1
qwen_coder_scores = []
scores = []
for i in range(len(column_manual)):
    score = sentence_bleu([column_manual[i]], column_qwen_coder[i], weights=(1.0, 0, 0, 0))
    scores.append(score)

# Calculate the average BLEU score for the columns
avg_score = sum(scores) / len(scores)
qwen_coder_scores.append(avg_score)
print(f"Average BLEU-1 score: {avg_score}")

Average BLEU-1 score: 0.5378008598266991


In [18]:
# Qwen-Coder-2
scores = []
for i in range(len(column_manual)):
    score = sentence_bleu([column_manual[i]], column_qwen_coder[i], weights=(0.5, 0.5))
    scores.append(score)

qwen_coder_bleu_2_scores = scores

# Calculate the average BLEU score for the columns
avg_score = sum(scores) / len(scores)
qwen_coder_scores.append(avg_score)
print(f"Average BLEU-2 score: {avg_score}")

Average BLEU-2 score: 0.4926292567239457


In [19]:
# Qwen-Coder-3
scores = []
for i in range(len(column_manual)):
    score = sentence_bleu([column_manual[i]], column_qwen_coder[i], weights=(0.33, 0.33, 0.33))
    scores.append(score)

# Calculate the average BLEU score for the columns
avg_score = sum(scores) / len(scores)
qwen_coder_scores.append(avg_score)
print(f"Average BLEU-3 score: {avg_score}")

Average BLEU-3 score: 0.47946698002653043


In [20]:
# Qwen-Coder-4
scores = []
for i in range(len(column_manual)):
    score = sentence_bleu([column_manual[i]], column_qwen_coder[i], weights=(0.25, 0.25, 0.25, 0.25))
    scores.append(score)

qwen_coder_bleu_4_scores = scores

# Calculate the average BLEU score for the columns
avg_score = sum(scores) / len(scores)
qwen_coder_scores.append(avg_score)
print(f"Average BLEU-4 score: {avg_score}")

Average BLEU-4 score: 0.4690538657544767


In [21]:
print(llama_scores)
print(qwen_scores)
print(qwen_coder_scores)

[0.5415688470844894, 0.5010174284570097, 0.48284531251353496, 0.4660139828527355]
[0.5071700429981955, 0.46872755290887036, 0.45643686928477883, 0.4435962706194622]
[0.5378008598266991, 0.4926292567239457, 0.47946698002653043, 0.4690538657544767]


In [22]:
def calculate_bleu_stats(bleu, llama_bleu_scores, qwen_bleu_scores, qwen_coder_bleu_scores, column_manual, column_llama, column_qwen, column_qwen_coder):
    def calculate_filtered_stats(condition):
        filtered_word_counts = {"manual": [], "llama": [], "qwen": [], "qwen_coder": []}

        for i in range(len(llama_bleu_scores)):
            if condition(llama_bleu_scores[i], qwen_bleu_scores[i], qwen_coder_bleu_scores[i]):
                filtered_word_counts["manual"].append(len(column_manual[i].split()))
                filtered_word_counts["llama"].append(len(column_llama[i].split()))
                filtered_word_counts["qwen"].append(len(column_qwen[i].split()))
                filtered_word_counts["qwen_coder"].append(len(column_qwen_coder[i].split()))

        count = len(filtered_word_counts["manual"])
        if count == 0:
            print("No matching entries found.")
            return

        print(count)
        for key in filtered_word_counts:
            print("Average ", key, ":", sum(filtered_word_counts[key]) / count)
            print("Median ", key, ":", statistics.median(filtered_word_counts[key]))

    print(f"## Low BLEU-{bleu} Scores (<0.3)")
    calculate_filtered_stats(lambda l, q, qc: l < 0.3 and q < 0.3 and qc < 0.3)

    print("-----------------------------------------------------")

    print(f"## High BLEU-{bleu} Scores (>0.5)")
    calculate_filtered_stats(lambda l, q, qc: l > 0.5 and q > 0.5 and qc > 0.5)


calculate_bleu_stats(2, llama_bleu_2_scores, qwen_bleu_2_scores, qwen_coder_bleu_2_scores, column_manual, column_llama, column_qwen, column_qwen_coder)

print("\n#######################################################\n")

calculate_bleu_stats(4, llama_bleu_4_scores, qwen_bleu_4_scores, qwen_coder_bleu_4_scores, column_manual, column_llama, column_qwen, column_qwen_coder)

## Low BLEU-2 Scores (<0.3)
51
Average  manual : 13.450980392156863
Median  manual : 13
Average  llama : 40.90196078431372
Median  llama : 6
Average  qwen : 46.15686274509804
Median  qwen : 6
Average  qwen_coder : 20.058823529411764
Median  qwen_coder : 2
-----------------------------------------------------
## High BLEU-2 Scores (>0.5)
52
Average  manual : 5.769230769230769
Median  manual : 2.0
Average  llama : 6.269230769230769
Median  llama : 2.0
Average  qwen : 6.076923076923077
Median  qwen : 2.0
Average  qwen_coder : 5.923076923076923
Median  qwen_coder : 2.0

#######################################################

## Low BLEU-4 Scores (<0.3)
57
Average  manual : 17.017543859649123
Median  manual : 14
Average  llama : 42.19298245614035
Median  llama : 7
Average  qwen : 44.280701754385966
Median  qwen : 7
Average  qwen_coder : 18.42105263157895
Median  qwen_coder : 2
-----------------------------------------------------
## High BLEU-4 Scores (>0.5)
50
Average  manual : 5.78
Media