In [1]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive

Mounted at /content/drive
/content/drive/MyDrive


In [2]:
import re
import csv
import nltk
import statistics

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')

from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords, wordnet
from nltk.translate.bleu_score import sentence_bleu

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


In [3]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))


def text_cleaning(text: str) -> str:
    text = str(text)

    text = text.encode("ascii", "ignore").decode()

    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip().lower()

    # Tokenization
    words = nltk.word_tokenize(text)

    # POS tagging and Lemmatization
    pos_tags = nltk.pos_tag(words)
    pos_map = {'N': wordnet.NOUN, 'V': wordnet.VERB, 'R': wordnet.ADV, 'J': wordnet.ADJ}
    lemmatized_words = [lemmatizer.lemmatize(word, pos_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tags]

    # Stemming
    stemmed_words = [stemmer.stem(word) for word in lemmatized_words]

    return ' '.join(stemmed_words)

In [4]:
data_ids = []
column_manual = []
column_llama = []
column_qwen = []
column_qwen_coder = []
descriptions = {}
manual = {}
llama = {}

with open('descriptions.csv', 'r', encoding='utf-8') as desc_file:
    desc_reader = csv.reader(desc_file, delimiter=',')
    next(desc_reader)
    for row in desc_reader:
        descriptions[row[0]] = row[1]

with open('input-v1.csv', 'r', encoding='utf-8') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader)
    for row in csv_reader:
        data_ids.append(row[0])
        manual[row[0]] = row[1]
        llama[row[0]] = row[2]
        column_manual.append(text_cleaning(row[1]))
        column_llama.append(text_cleaning(row[2]))
        column_qwen.append(text_cleaning(row[3]))
        column_qwen_coder.append(text_cleaning(row[4]))

def calculate_bleu_stats(column_manual, column_model, weights):
    scores = []
    for i in range(len(column_manual)):
        if not column_manual[i] and not column_model[i]:
            scores.append(1.0)
        else:
            scores.append(sentence_bleu([column_manual[i]], column_model[i], weights=weights))

    average = round(sum(scores) / len(scores), 4)
    median = round(statistics.median(scores), 4)
    highest = round(max(scores), 4)
    lowest = round(min(scores), 4)
    overall_range = round(highest - lowest, 4)

    below_03 = [s for s in scores if s < 0.3]
    between_03_05 = [s for s in scores if 0.3 <= s < 0.5]
    above_05 = [s for s in scores if s >= 0.5]
    total = len(scores)

    def get_range(category_scores):
        return round(max(category_scores) - min(category_scores), 4) if category_scores else 0.0

    distribution = {
        "<0.3": {"count": len(below_03), "percentage": round((len(below_03) / total) * 100, 2), "range": get_range(below_03)},
        "0.3 to <0.5": {"count": len(between_03_05), "percentage": round((len(between_03_05) / total) * 100, 2), "range": get_range(between_03_05)},
        ">=0.5": {"count": len(above_05), "percentage": round((len(above_05) / total) * 100, 2), "range": get_range(above_05)}
    }

    print("Distribution:")
    for category, data in distribution.items():
        print(f"  {category}: Count = {data['count']}, Percentage = {data['percentage']}%, Range = {data['range']}")

    return {
        "average": average,
        "median": median,
        "highest": highest,
        "lowest": lowest,
        "overall_range": overall_range,
        "scores": scores,
        "distribution": distribution
    }

def compute_bleu_scores(column_manual, column_llama, column_qwen, column_qwen_coder):
    models = {"Llama": column_llama, "Qwen": column_qwen, "Qwen-Coder": column_qwen_coder}
    weights_list = {"BLEU-1": (1.0, 0, 0, 0), "BLEU-2": (0.5, 0.5), "BLEU-3": (0.33, 0.33, 0.33), "BLEU-4": (0.25, 0.25, 0.25, 0.25)}

    results = {}
    for model_name, column_model in models.items():
        results[model_name] = {}
        for bleu_name, weights in weights_list.items():
            stats = calculate_bleu_stats(column_manual, column_model, weights)
            results[model_name][bleu_name] = stats
            print(f"{model_name} {bleu_name} - Avg: {stats['average']}, Median: {stats['median']}, Highest: {stats['highest']}, Lowest: {stats['lowest']}, Overall Range: {stats['overall_range']}")

    return results

results = compute_bleu_scores(column_manual, column_llama, column_qwen, column_qwen_coder)

def get_low_bleu_ids(threshold=0.3):
    low_bleu_ids = [data_ids[i] for i in range(len(data_ids)) if all(results[model]['BLEU-2']['scores'][i] < threshold for model in results)]
    return low_bleu_ids

low_bleu_ids = get_low_bleu_ids()
print(f"Number of IDs with low BLEU scores: {len(low_bleu_ids)}")
print("Low BLEU IDs:", low_bleu_ids)

with open('low_bleu_data_updated.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(["ID", "Description", "Manual", "Llama"])
    for id in low_bleu_ids:
        csv_writer.writerow([id, descriptions.get(id, ""), manual.get(id, ""), llama.get(id, "")])

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Distribution:
  <0.3: Count = 74, Percentage = 35.92%, Range = 0.2989
  0.3 to <0.5: Count = 14, Percentage = 6.8%, Range = 0.1617
  >=0.5: Count = 118, Percentage = 57.28%, Range = 0.5
Llama BLEU-1 - Avg: 0.5742, Median: 0.6943, Highest: 1.0, Lowest: 0, Overall Range: 1.0
Distribution:
  <0.3: Count = 76, Percentage = 36.89%, Range = 0.2673
  0.3 to <0.5: Count = 16, Percentage = 7.77%, Range = 0.1603
  >=0.5: Count = 114, Percentage = 55.34%, Range = 0.4951
Llama BLEU-2 - Avg: 0.5585, Median: 0.6601, Highest: 1.0, Lowest: 0, Overall Range: 1.0
Distribution:
  <0.3: Count = 77, Percentage = 37.38%, Range = 0.2789
  0.3 to <0.5: Count = 19, Percentage = 9.22%, Range = 0.1945
  >=0.5: Count = 110, Percentage = 53.4%, Range = 0.4967
Llama BLEU-3 - Avg: 0.5517, Median: 0.65, Highest: 1.0, Lowest: 0, Overall Range: 1.0
Distribution:
  <0.3: Count = 80, Percentage = 38.83%, Range = 0.298
  0.3 to <0.5: Count = 20, Percentage = 9.71%, Range = 0.1918
  >=0.5: Count = 106, Percentage = 51.46%,