In [None]:
import re
import csv
import nltk
import string
import numpy as np
import pandas as pd

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')

from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords, wordnet
from nltk.translate.bleu_score import sentence_bleu

from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [None]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))


def text_cleaning(text: str) -> str:
    text = str(text)

    text = text.encode("ascii", "ignore").decode()

    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip().lower()

    # Tokenization
    words = nltk.word_tokenize(text)

    # POS tagging and Lemmatization
    pos_tags = nltk.pos_tag(words)
    pos_map = {'N': wordnet.NOUN, 'V': wordnet.VERB, 'R': wordnet.ADV, 'J': wordnet.ADJ}
    lemmatized_words = [lemmatizer.lemmatize(word, pos_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tags]

    # Stemming
    stemmed_words = [stemmer.stem(word) for word in lemmatized_words]

    return ' '.join(stemmed_words)

In [None]:
column_manual = []
column_llama = []
column_qwen = []
column_qwen_coder = []


with open('input.csv', 'r', encoding='utf-8') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader)
    idx = 0
    for row in csv_reader:
        column_manual.append(text_cleaning(row[1]))
        column_llama.append(text_cleaning(row[2]))
        column_qwen.append(text_cleaning(row[3]))
        column_qwen_coder.append(text_cleaning(row[4]))

In [None]:
word_counter = 0
for sentence in column_manual:
    word_counter = word_counter + len(sentence.split())
print(word_counter/len(column_llama))

10.016216216216216


In [None]:
word_counter = 0
for sentence in column_llama:
    word_counter = word_counter + len(sentence.split())
print(word_counter/len(column_llama))

25.535135135135135


In [None]:
word_counter = 0
for sentence in column_qwen:
    word_counter = word_counter + len(sentence.split())
print(word_counter/len(column_llama))

14.416216216216217


In [None]:
word_counter = 0
for sentence in column_qwen_coder:
    word_counter = word_counter + len(sentence.split())
print(word_counter/len(column_llama))

9.92972972972973


In [None]:
# Llama-1
llama_scores = []
scores = []
for i in range(len(column_manual)):
    score = sentence_bleu([column_manual[i]], column_llama[i], weights=(1.0, 0, 0, 0))
    scores.append(score)

avg_score = sum(scores) / len(scores)
llama_scores.append(avg_score)
print(f"Average BLEU-1 score: {avg_score}")

Average BLEU-1 score: 0.5702390962471942


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [None]:
# Llama-2
scores = []
for i in range(len(column_manual)):
    score = sentence_bleu([column_manual[i]], column_llama[i], weights=(0.5, 0.5))
    scores.append(score)

llama_bleu_2_scores = scores

# Calculate the average BLEU score for the columns
avg_score = sum(scores) / len(scores)
llama_scores.append(avg_score)
print(f"Average BLEU-2 score: {avg_score}")

Average BLEU-2 score: 0.538832231561496


In [None]:
# Llama-3
scores = []
for i in range(len(column_manual)):
    score = sentence_bleu([column_manual[i]], column_llama[i], weights=(0.33, 0.33, 0.33))
    scores.append(score)

# Calculate the average BLEU score for the columns
avg_score = sum(scores) / len(scores)
llama_scores.append(avg_score)
print(f"Average BLEU-3 score: {avg_score}")

Average BLEU-3 score: 0.5247299625967312


In [None]:
# Llama-4
scores = []
for i in range(len(column_manual)):
    score = sentence_bleu([column_manual[i]], column_llama[i], weights=(0.25, 0.25, 0.25, 0.25))
    scores.append(score)

# Calculate the average BLEU score for the columns
avg_score = sum(scores) / len(scores)
llama_scores.append(avg_score)
print(f"Average BLEU-4 score: {avg_score}")

Average BLEU-4 score: 0.5096852104178654


In [None]:
# Qwen-1
scores = []
qwen_scores = []
for i in range(len(column_manual)):
    score = sentence_bleu([column_manual[i]], column_qwen[i], weights=(1.0, 0, 0, 0))
    scores.append(score)

# Calculate the average BLEU score for the columns
avg_score = sum(scores) / len(scores)
qwen_scores.append(avg_score)
print(f"Average BLEU-1 score: {avg_score}")

Average BLEU-1 score: 0.5666314450753404


In [None]:
# Qwen-2
scores = []
for i in range(len(column_manual)):
    score = sentence_bleu([column_manual[i]], column_qwen[i], weights=(0.5, 0.5))
    scores.append(score)

qwen_bleu_2_scores = scores

# Calculate the average BLEU score for the columns
avg_score = sum(scores) / len(scores)
qwen_scores.append(avg_score)
print(f"Average BLEU-2 score: {avg_score}")

Average BLEU-2 score: 0.527431303558279


In [None]:
# Qwen-3
scores = []
for i in range(len(column_manual)):
    score = sentence_bleu([column_manual[i]], column_qwen[i], weights=(0.33, 0.33, 0.33))
    scores.append(score)

# Calculate the average BLEU score for the columns
avg_score = sum(scores) / len(scores)
qwen_scores.append(avg_score)
print(f"Average BLEU-3 score: {avg_score}")

Average BLEU-3 score: 0.5191499526212822


In [None]:
# Qwen-4
scores = []
for i in range(len(column_manual)):
    score = sentence_bleu([column_manual[i]], column_qwen[i], weights=(0.25, 0.25, 0.25, 0.25))
    scores.append(score)

# Calculate the average BLEU score for the columns
avg_score = sum(scores) / len(scores)
qwen_scores.append(avg_score)
print(f"Average BLEU-4 score: {avg_score}")

Average BLEU-4 score: 0.5096336933477369


In [None]:
# Qwen-Coder-1
qwen_coder_scores = []
scores = []
for i in range(len(column_manual)):
    score = sentence_bleu([column_manual[i]], column_qwen_coder[i], weights=(1.0, 0, 0, 0))
    scores.append(score)

# Calculate the average BLEU score for the columns
avg_score = sum(scores) / len(scores)
qwen_coder_scores.append(avg_score)
print(f"Average BLEU-1 score: {avg_score}")

Average BLEU-1 score: 0.5253905443522353


In [None]:
# Qwen-Coder-2
scores = []
for i in range(len(column_manual)):
    score = sentence_bleu([column_manual[i]], column_qwen_coder[i], weights=(0.5, 0.5))
    scores.append(score)

qwen_coder_bleu_2_scores = scores

# Calculate the average BLEU score for the columns
avg_score = sum(scores) / len(scores)
qwen_coder_scores.append(avg_score)
print(f"Average BLEU-2 score: {avg_score}")

Average BLEU-2 score: 0.47479109020885785


In [None]:
# Qwen-Coder-3
scores = []
for i in range(len(column_manual)):
    score = sentence_bleu([column_manual[i]], column_qwen_coder[i], weights=(0.33, 0.33, 0.33))
    scores.append(score)

# Calculate the average BLEU score for the columns
avg_score = sum(scores) / len(scores)
qwen_coder_scores.append(avg_score)
print(f"Average BLEU-3 score: {avg_score}")

Average BLEU-3 score: 0.46579952886419657


In [None]:
# Qwen-Coder-4
scores = []
for i in range(len(column_manual)):
    score = sentence_bleu([column_manual[i]], column_qwen_coder[i], weights=(0.25, 0.25, 0.25, 0.25))
    scores.append(score)

# Calculate the average BLEU score for the columns
avg_score = sum(scores) / len(scores)
qwen_coder_scores.append(avg_score)
print(f"Average BLEU-4 score: {avg_score}")

Average BLEU-4 score: 0.46202555621537555


In [None]:
print(llama_scores)
print(qwen_scores)
print(qwen_coder_scores)

[0.5702390962471942, 0.538832231561496, 0.5247299625967312, 0.5096852104178654]
[0.5666314450753404, 0.527431303558279, 0.5191499526212822, 0.5096336933477369]
[0.5253905443522353, 0.47479109020885785, 0.46579952886419657, 0.46202555621537555]


In [None]:
count = 0
word_counter_manual = 0
word_counter_llama = 0
word_counter_qwen = 0
word_counter_qwen_coder = 0

for i in range(len(llama_bleu_2_scores)):
    if llama_bleu_2_scores[i] < 0.3 and qwen_bleu_2_scores[i] < 0.3 and qwen_coder_bleu_2_scores[i] < 0.3:
        count = count + 1
        word_counter_manual = word_counter_manual + len(column_manual[i].split())
        word_counter_llama = word_counter_llama + len(column_llama[i].split())
        word_counter_qwen = word_counter_qwen + len(column_qwen[i].split())
        word_counter_qwen_coder = word_counter_qwen_coder + len(column_qwen_coder[i].split())
print(count)

print(word_counter_manual/count)
print(word_counter_llama/count)
print(word_counter_qwen/count)
print(word_counter_qwen_coder/count)

53
11.90566037735849
51.886792452830186
22.641509433962263
17.28301886792453


In [None]:
count = 0
word_counter_manual = 0
word_counter_llama = 0
word_counter_qwen = 0
word_counter_qwen_coder = 0

for i in range(len(llama_bleu_2_scores)):
    if llama_bleu_2_scores[i] > 0.5 and qwen_bleu_2_scores[i] > 0.5 and qwen_coder_bleu_2_scores[i] > 0.5:
        count = count + 1
        word_counter_manual = word_counter_manual + len(column_manual[i].split())
        word_counter_llama = word_counter_llama + len(column_llama[i].split())
        word_counter_qwen = word_counter_qwen + len(column_qwen[i].split())
        word_counter_qwen_coder = word_counter_qwen_coder + len(column_qwen_coder[i].split())
print(count)

print(word_counter_manual/count)
print(word_counter_llama/count)
print(word_counter_qwen/count)
print(word_counter_qwen_coder/count)

71
6.802816901408451
7.619718309859155
7.323943661971831
7.098591549295775
