# **SETTING THE LLM**

In [None]:
%%capture
# installing libraries
!pip install transformers
!pip install torch

In [None]:
# Log in to the Hugging Face Hub; the command prompts interactively for an access token.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: write).
The token `key` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate w

In [None]:
# loading data to be computed

import pandas as pd
import csv
from google.colab import drive

# Mount Google Drive
drive.mount('/content/gdrive')

input_file = r'/content/gdrive/MyDrive/dati_sna/submissions_decimato_2022.csv'

# Load submissions into three parallel lists: authors, titles and ids.
authors = []
titles = []
ids = []
# newline='' is what the csv module expects so that quoted fields containing
# line breaks are parsed correctly; the explicit encoding avoids depending on
# the platform default.
with open(input_file, mode='r', newline='', encoding='utf-8') as file:
    csvreader = csv.reader(file)
    next(csvreader, None)  # skip the header row (no error on an empty file)
    for row in csvreader:
        if not row:
            continue  # blank line: nothing to read
        # discarding deleted users
        if row[0] == 'u/[deleted]':
            continue
        # Unpacking also checks that the row has exactly five columns.
        author, sub_id, title, score, created = row
        authors.append(author)
        ids.append(sub_id)
        titles.append(title)
if len(authors) != len(titles):
    print("error, authors and titles are not the same length")

print(len(titles), "titles to classify")

Mounted at /content/gdrive
84741 titles to classify


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Fetch the tokenizer and the causal-LM weights for the same Hub checkpoint.
model_id = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [None]:
# Use the GPU when torch can see one, otherwise stay on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
model.to(device)

# Report where the weights actually live (cuda:0 means the GPU is in use).
first_param = next(model.parameters())
print(first_param.device)

cuda:0


In [None]:
# Split the data into three parts so that each part can be processed on free Colab

def split_list_equal(lst, parts=3):
    """Split ``lst`` into ``parts`` consecutive slices of near-equal length.

    The boundaries are ``i * len(lst) // parts``, so with the default of three
    parts the cuts fall at ``n // 3`` and ``2 * n // 3``.

    Args:
        lst: sequence to split (anything that supports ``len`` and slicing).
        parts: number of slices to produce; must be at least 1.

    Returns:
        A tuple of ``parts`` slices that, concatenated, give back ``lst``.

    Raises:
        ValueError: if ``parts`` is smaller than 1.
    """
    if parts < 1:
        raise ValueError("parts must be a positive integer")
    n = len(lst)
    bounds = [i * n // parts for i in range(parts + 1)]
    return tuple(lst[lo:hi] for lo, hi in zip(bounds, bounds[1:]))

# Cut the three parallel lists with the same function so each gets three parts.
# Later cells rebind `titles`, `authors` and `ids` to a single split, so this
# cell has to run before them: re-running it afterwards would split an
# already-split list.
titles1, titles2, titles3 = split_list_equal(titles)
authors1, authors2, authors3 = split_list_equal(authors)
ids1, ids2, ids3 = split_list_equal(ids)

# **VEGETARIAN/OMNIVOROUS**

## SPLIT 1

In [None]:
from math import ceil
import time
import torch
import gc

# Select the split handled by this section (use the *2 / *3 lists for the other splits).
titles, authors, ids = titles1, authors1, ids1

# Function to calculate token length
def token_length(input_text):
    """Return the number of token ids the global ``tokenizer`` produces for ``input_text``."""
    encoded = tokenizer(input_text, return_tensors="pt")
    return encoded.input_ids.size(-1)

# Pad on the left and reuse EOS as the padding token (decoder-only model).
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# Few-shot prompt: a task line followed by labelled examples, one per line,
# each written as "<title> #### <label>".
instruction_text = (
    "Classify the following titles into: vegetarian, omnivorous:\n\n"
    "Maryland style Crab Cakes with clarified butter and lemon. [Homemade] #### omnivorous\n"
    "[Homemade] Jalapeno-Ghost Pepper Mac and Cheese with Pulled Pork #### omnivorous\n"
    "[homemade] steak sandwiches #### omnivorous\n"
    "[Homemade] Vegetable Pasta Salad with Red Wine Vinegar Vinaigrette #### vegetarian\n"
    "[Homemade] crispy gnocchi & veg, in a spinach, leek, garlic and harissa sauce. Topped with (not homemade) beetroot falafel. #### vegetarian\n"
)

# Separator written between a title and its label
hashtag = " #### "

# Number of titles passed to the model in one call
batch_size = 50

# Function to process each batch
def process_batch(batch_titles):
    """Classify one batch of titles with the few-shot prompt.

    Each title is appended to the global ``instruction_text`` followed by the
    ``hashtag`` separator; the batch is passed to ``model.generate`` and the
    first whitespace-delimited word the model writes after the prompt is used
    as the label.

    Args:
        batch_titles: list of title strings.

    Returns:
        A list with one label string per title, in input order. The label is
        "unknown" when the model wrote no text. An empty batch returns ``[]``.
    """
    if not batch_titles:
        return []

    input_texts = [instruction_text + title + hashtag for title in batch_titles]

    # Tokenize the batch with padding and truncation
    encoding = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True)
    input_ids = encoding.input_ids.to(device)  # same device as the model
    attention_mask = encoding.attention_mask.to(device)
    prompt_length = input_ids.shape[1]  # padded width shared by every prompt in the batch

    # encode() adds special tokens by default, so element [0] may be the BOS id
    # rather than the id of "####"; disable them to get the real stop token.
    stop_token_id = tokenizer.encode("####", add_special_tokens=False)[0]

    generated_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        # Same budget as the previous "longest prompt + 3" max_length, without
        # tokenizing every prompt a second time.
        max_new_tokens=3,
        # Greedy decoding: the same title always receives the same label.
        do_sample=False,
        eos_token_id=stop_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the tokens produced after the prompt, so the label does not
    # depend on finding the separator again inside the decoded prompt text.
    batch_results = []
    for sequence in generated_ids:
        completion = tokenizer.decode(sequence[prompt_length:], skip_special_tokens=True)
        words = completion.split()
        batch_results.append(words[0] if words else "unknown")

    return batch_results

# Divide the titles into batches
num_batches = ceil(len(titles) / batch_size)
results = []

# Run every batch; the first one is timed to give a rough estimate of the total.
for i in range(num_batches):
    start = i * batch_size
    batch_titles = titles[start:start + batch_size]  # slicing clips at the end of the list

    start_time = time.time()
    results.extend(process_batch(batch_titles))

    if i == 0:
        first_batch_time = time.time() - start_time
        estimated_total = first_batch_time * num_batches
        print(f"First batch time: {first_batch_time:.2f} seconds")
        print(f"Estimated time for all batches: {estimated_total:.2f} seconds")

    # Collect unreachable Python objects first so the tensors they hold are
    # freed, then release PyTorch's unused cached GPU memory.
    gc.collect()
    torch.cuda.empty_cache()

    print(f"Processed batch {i + 1}/{num_batches}")

# Each title must have exactly one label, otherwise rows written later would be misaligned.
assert len(results) == len(titles), "results and titles are misaligned"

# Summarise the labels instead of printing every element of `results`.
label_counts = {}
for label in results:
    label_counts[label] = label_counts.get(label, 0) + 1
print(sorted(label_counts.items(), key=lambda item: item[1], reverse=True)[:10])


First batch time: 5.37 seconds
Estimated time for all batches: 3035.80 seconds
Processed batch 1/565
Processed batch 2/565
Processed batch 3/565
Processed batch 4/565
Processed batch 5/565
Processed batch 6/565
Processed batch 7/565
Processed batch 8/565
Processed batch 9/565
Processed batch 10/565
Processed batch 11/565
Processed batch 12/565
Processed batch 13/565
Processed batch 14/565
Processed batch 15/565
Processed batch 16/565
Processed batch 17/565
Processed batch 18/565
Processed batch 19/565
Processed batch 20/565
Processed batch 21/565
Processed batch 22/565
Processed batch 23/565
Processed batch 24/565
Processed batch 25/565
Processed batch 26/565
Processed batch 27/565
Processed batch 28/565
Processed batch 29/565
Processed batch 30/565
Processed batch 31/565
Processed batch 32/565
Processed batch 33/565
Processed batch 34/565
Processed batch 35/565
Processed batch 36/565
Processed batch 37/565
Processed batch 38/565
Processed batch 39/565
Processed batch 40/565
Processed 

In [None]:
# writing the results into a csv file

import csv
from google.colab import drive

drive.mount('/content/gdrive')

# Destination CSV for this split
file_csv = r"/content/gdrive/MyDrive/dati_sna/veg_titles_2022_split_1.csv"

# One row per submission: author, predicted label, submission id
# (zip stops at the shortest of the three lists).
rows = list(zip(authors, results, ids))

with open(file_csv, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["author", "classification", "id"])
    writer.writerows(rows)

# Count the rows whose label is exactly one of the two expected classes.
i = sum(1 for _, label, _ in rows if label in ("vegetarian", "omnivorous"))
print("percentage of good classification:",i/len(titles)*100,"%")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
percentage of good classification: 94.48437002159521 %


## SPLIT 2

In [None]:
from math import ceil
import time
import torch
import gc

# Select the split handled by this section (split 2).
titles, authors, ids = titles2, authors2, ids2

# Function to calculate token length
def token_length(input_text):
    """Return the number of token ids the global ``tokenizer`` produces for ``input_text``."""
    encoded = tokenizer(input_text, return_tensors="pt")
    return encoded.input_ids.size(-1)

# Pad on the left and reuse EOS as the padding token (decoder-only model).
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# Few-shot prompt: a task line followed by labelled examples, one per line,
# each written as "<title> #### <label>".
instruction_text = (
    "Classify the following titles into: vegetarian, omnivorous:\n\n"
    "Maryland style Crab Cakes with clarified butter and lemon. [Homemade] #### omnivorous\n"
    "[Homemade] Jalapeno-Ghost Pepper Mac and Cheese with Pulled Pork #### omnivorous\n"
    "[homemade] steak sandwiches #### omnivorous\n"
    "[Homemade] Vegetable Pasta Salad with Red Wine Vinegar Vinaigrette #### vegetarian\n"
    "[Homemade] crispy gnocchi & veg, in a spinach, leek, garlic and harissa sauce. Topped with (not homemade) beetroot falafel. #### vegetarian\n"
)

# Separator written between a title and its label
hashtag = " #### "

# Number of titles passed to the model in one call
batch_size = 50

# Function to process each batch
def process_batch(batch_titles):
    """Classify one batch of titles with the few-shot prompt.

    Each title is appended to the global ``instruction_text`` followed by the
    ``hashtag`` separator; the batch is passed to ``model.generate`` and the
    first whitespace-delimited word the model writes after the prompt is used
    as the label.

    Args:
        batch_titles: list of title strings.

    Returns:
        A list with one label string per title, in input order. The label is
        "unknown" when the model wrote no text. An empty batch returns ``[]``.
    """
    if not batch_titles:
        return []

    input_texts = [instruction_text + title + hashtag for title in batch_titles]

    # Tokenize the batch with padding and truncation
    encoding = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True)
    input_ids = encoding.input_ids.to(device)  # same device as the model
    attention_mask = encoding.attention_mask.to(device)
    prompt_length = input_ids.shape[1]  # padded width shared by every prompt in the batch

    # encode() adds special tokens by default, so element [0] may be the BOS id
    # rather than the id of "####"; disable them to get the real stop token.
    stop_token_id = tokenizer.encode("####", add_special_tokens=False)[0]

    generated_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        # Same budget as the previous "longest prompt + 3" max_length, without
        # tokenizing every prompt a second time.
        max_new_tokens=3,
        # Greedy decoding: the same title always receives the same label.
        do_sample=False,
        eos_token_id=stop_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the tokens produced after the prompt, so the label does not
    # depend on finding the separator again inside the decoded prompt text.
    batch_results = []
    for sequence in generated_ids:
        completion = tokenizer.decode(sequence[prompt_length:], skip_special_tokens=True)
        words = completion.split()
        batch_results.append(words[0] if words else "unknown")

    return batch_results

# Divide the titles into batches
num_batches = ceil(len(titles) / batch_size)
results = []

# Run every batch; the first one is timed to give a rough estimate of the total.
for i in range(num_batches):
    start = i * batch_size
    batch_titles = titles[start:start + batch_size]  # slicing clips at the end of the list

    start_time = time.time()
    results.extend(process_batch(batch_titles))

    if i == 0:
        first_batch_time = time.time() - start_time
        estimated_total = first_batch_time * num_batches
        print(f"First batch time: {first_batch_time:.2f} seconds")
        print(f"Estimated time for all batches: {estimated_total:.2f} seconds")

    # Collect unreachable Python objects first so the tensors they hold are
    # freed, then release PyTorch's unused cached GPU memory.
    gc.collect()
    torch.cuda.empty_cache()

    print(f"Processed batch {i + 1}/{num_batches}")

# Each title must have exactly one label, otherwise rows written later would be misaligned.
assert len(results) == len(titles), "results and titles are misaligned"

# Summarise the labels instead of printing every element of `results`.
label_counts = {}
for label in results:
    label_counts[label] = label_counts.get(label, 0) + 1
print(sorted(label_counts.items(), key=lambda item: item[1], reverse=True)[:10])


First batch time: 6.23 seconds
Estimated time for all batches: 3518.92 seconds
Processed batch 1/565
Processed batch 2/565
Processed batch 3/565
Processed batch 4/565
Processed batch 5/565
Processed batch 6/565
Processed batch 8/565
Processed batch 9/565
Processed batch 10/565
Processed batch 11/565
Processed batch 12/565
Processed batch 13/565
Processed batch 14/565
Processed batch 15/565
Processed batch 16/565
Processed batch 17/565
Processed batch 18/565
Processed batch 19/565
Processed batch 20/565
Processed batch 21/565
Processed batch 22/565
Processed batch 23/565
Processed batch 24/565
Processed batch 25/565
Processed batch 26/565
Processed batch 27/565
Processed batch 28/565
Processed batch 29/565
Processed batch 30/565
Processed batch 31/565
Processed batch 32/565
Processed batch 33/565
Processed batch 34/565
Processed batch 35/565
Processed batch 36/565
Processed batch 37/565
Processed batch 38/565
Processed batch 39/565
Processed batch 40/565
Processed batch 41/565
Processed

In [None]:
# writing the results into a csv file

import csv
from google.colab import drive

drive.mount('/content/gdrive')

# Destination CSV for this split
file_csv = r"/content/gdrive/MyDrive/dati_sna/veg_titles_2022_split_2.csv"

# One row per submission: author, predicted label, submission id
# (zip stops at the shortest of the three lists).
rows = list(zip(authors, results, ids))

with open(file_csv, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["author", "classification", "id"])
    writer.writerows(rows)

# Count the rows whose label is exactly one of the two expected classes.
i = sum(1 for _, label, _ in rows if label in ("vegetarian", "omnivorous"))
print("percentage of good classification:",i/len(titles)*100,"%")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
percentage of good classification: 94.20115410486069 %


## SPLIT 3

In [None]:
from math import ceil
import time
import torch
import gc

# Select the split handled by this section (split 3).
titles, authors, ids = titles3, authors3, ids3

# Function to calculate token length
def token_length(input_text):
    """Return the number of token ids the global ``tokenizer`` produces for ``input_text``."""
    encoded = tokenizer(input_text, return_tensors="pt")
    return encoded.input_ids.size(-1)

# Pad on the left and reuse EOS as the padding token (decoder-only model).
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# Few-shot prompt: a task line followed by labelled examples, one per line,
# each written as "<title> #### <label>".
instruction_text = (
    "Classify the following titles into: vegetarian, omnivorous:\n\n"
    "Maryland style Crab Cakes with clarified butter and lemon. [Homemade] #### omnivorous\n"
    "[Homemade] Jalapeno-Ghost Pepper Mac and Cheese with Pulled Pork #### omnivorous\n"
    "[homemade] steak sandwiches #### omnivorous\n"
    "[Homemade] Vegetable Pasta Salad with Red Wine Vinegar Vinaigrette #### vegetarian\n"
    "[Homemade] crispy gnocchi & veg, in a spinach, leek, garlic and harissa sauce. Topped with (not homemade) beetroot falafel. #### vegetarian\n"
)

# Separator written between a title and its label
hashtag = " #### "

# Number of titles passed to the model in one call
batch_size = 50

# Function to process each batch
def process_batch(batch_titles):
    """Classify one batch of titles with the few-shot prompt.

    Each title is appended to the global ``instruction_text`` followed by the
    ``hashtag`` separator; the batch is passed to ``model.generate`` and the
    first whitespace-delimited word the model writes after the prompt is used
    as the label.

    Args:
        batch_titles: list of title strings.

    Returns:
        A list with one label string per title, in input order. The label is
        "unknown" when the model wrote no text. An empty batch returns ``[]``.
    """
    if not batch_titles:
        return []

    input_texts = [instruction_text + title + hashtag for title in batch_titles]

    # Tokenize the batch with padding and truncation
    encoding = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True)
    input_ids = encoding.input_ids.to(device)  # same device as the model
    attention_mask = encoding.attention_mask.to(device)
    prompt_length = input_ids.shape[1]  # padded width shared by every prompt in the batch

    # encode() adds special tokens by default, so element [0] may be the BOS id
    # rather than the id of "####"; disable them to get the real stop token.
    stop_token_id = tokenizer.encode("####", add_special_tokens=False)[0]

    generated_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        # Same budget as the previous "longest prompt + 3" max_length, without
        # tokenizing every prompt a second time.
        max_new_tokens=3,
        # Greedy decoding: the same title always receives the same label.
        do_sample=False,
        eos_token_id=stop_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the tokens produced after the prompt, so the label does not
    # depend on finding the separator again inside the decoded prompt text.
    batch_results = []
    for sequence in generated_ids:
        completion = tokenizer.decode(sequence[prompt_length:], skip_special_tokens=True)
        words = completion.split()
        batch_results.append(words[0] if words else "unknown")

    return batch_results

# Divide the titles into batches
num_batches = ceil(len(titles) / batch_size)
results = []

# Run every batch; the first one is timed to give a rough estimate of the total.
for i in range(num_batches):
    start = i * batch_size
    batch_titles = titles[start:start + batch_size]  # slicing clips at the end of the list

    start_time = time.time()
    results.extend(process_batch(batch_titles))

    if i == 0:
        first_batch_time = time.time() - start_time
        estimated_total = first_batch_time * num_batches
        print(f"First batch time: {first_batch_time:.2f} seconds")
        print(f"Estimated time for all batches: {estimated_total:.2f} seconds")

    # Collect unreachable Python objects first so the tensors they hold are
    # freed, then release PyTorch's unused cached GPU memory.
    gc.collect()
    torch.cuda.empty_cache()

    print(f"Processed batch {i + 1}/{num_batches}")

# Each title must have exactly one label, otherwise rows written later would be misaligned.
assert len(results) == len(titles), "results and titles are misaligned"

# Summarise the labels instead of printing every element of `results`.
label_counts = {}
for label in results:
    label_counts[label] = label_counts.get(label, 0) + 1
print(sorted(label_counts.items(), key=lambda item: item[1], reverse=True)[:10])


First batch time: 4.72 seconds
Estimated time for all batches: 2665.05 seconds
Processed batch 1/565
Processed batch 2/565
Processed batch 3/565
Processed batch 4/565
Processed batch 5/565
Processed batch 6/565
Processed batch 7/565
Processed batch 8/565
Processed batch 9/565
Processed batch 10/565
Processed batch 11/565
Processed batch 12/565
Processed batch 13/565
Processed batch 14/565
Processed batch 15/565
Processed batch 16/565
Processed batch 17/565
Processed batch 18/565
Processed batch 19/565
Processed batch 20/565
Processed batch 21/565
Processed batch 22/565
Processed batch 23/565
Processed batch 24/565
Processed batch 25/565
Processed batch 26/565
Processed batch 27/565
Processed batch 28/565
Processed batch 29/565
Processed batch 30/565
Processed batch 31/565
Processed batch 32/565
Processed batch 33/565
Processed batch 34/565
Processed batch 35/565
Processed batch 36/565
Processed batch 37/565
Processed batch 38/565
Processed batch 39/565
Processed batch 40/565
Processed 

In [None]:
# writing the results into a csv file

import csv
from google.colab import drive

drive.mount('/content/gdrive')

# Destination CSV for this split
file_csv = r"/content/gdrive/MyDrive/dati_sna/veg_titles_2022_split_3.csv"

# One row per submission: author, predicted label, submission id
# (zip stops at the shortest of the three lists).
rows = list(zip(authors, results, ids))

with open(file_csv, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["author", "classification", "id"])
    writer.writerows(rows)

# Count the rows whose label is exactly one of the two expected classes.
i = sum(1 for _, label, _ in rows if label in ("vegetarian", "omnivorous"))
print("percentage of good classification:",i/len(titles)*100,"%")

## Split Combining

In [None]:
import pandas as pd

# Paths of the per-split CSV files
dati_sna_path = "/content/gdrive/MyDrive/dati_sna/"
file_names = [
    "veg_titles_2022_split_1.csv",
    "veg_titles_2022_split_2.csv",
    "veg_titles_2022_split_3.csv"
]

# Read and concatenate the files. Every column is read as plain text and the
# default NA parsing is switched off, so labels or ids such as "None", "N/A"
# or "nan" are kept as written instead of being turned into missing values.
split_frames = [
    pd.read_csv(dati_sna_path + file, dtype=str, keep_default_na=False)
    for file in file_names
]
combined_df = pd.concat(split_frames, ignore_index=True)

# Save the merged file
output_file = dati_sna_path + "veg_titles_2022.csv"
combined_df.to_csv(output_file, index=False)

print(f"File unito salvato in: {output_file}")


# **SWEET/SAVORY**

## SPLIT 1

In [None]:
from math import ceil
import time
import torch
import gc

# Select the split handled by this section (use the *2 / *3 lists for the other splits).
titles, authors, ids = titles1, authors1, ids1

# Function to calculate token length
def token_length(input_text):
    """Return the number of token ids the global ``tokenizer`` produces for ``input_text``."""
    encoded = tokenizer(input_text, return_tensors="pt")
    return encoded.input_ids.size(-1)

# Set padding token and padding side
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"  # Ensure correct padding for decoder-only architectures

# Few-shot prompt: a task line followed by labelled examples, one per line,
# each written as "<title> #### <label>".
# The crab-cake example is labelled "savory"; it was previously labelled
# "sweet", which gave the model an incorrect example to imitate.
instruction_text = (
    "Classify the following titles into: sweet, savory:\n\n"
    "Maryland style Crab Cakes with clarified butter and lemon. [Homemade] #### savory\n"
    "[homemade] Chili #### savory\n"
    "[homemade] steak sandwiches #### savory\n"
    "[Homemade] Vegetable Pasta Salad with Red Wine Vinegar Vinaigrette #### savory\n"
    "[I ATE] Red velvet black forest cake and lemon custard filled profiteroles with craquelin #### sweet\n"
)

# Hashtag for classification separation
hashtag = " #### "

# Batch size configuration
batch_size = 50

# Function to process each batch
def process_batch(batch_titles):
    """Classify one batch of titles with the few-shot prompt.

    Each title is appended to the global ``instruction_text`` followed by the
    ``hashtag`` separator; the batch is passed to ``model.generate`` and the
    first whitespace-delimited word the model writes after the prompt is used
    as the label.

    Args:
        batch_titles: list of title strings.

    Returns:
        A list with one label string per title, in input order. The label is
        "unknown" when the model wrote no text. An empty batch returns ``[]``.
    """
    if not batch_titles:
        return []

    input_texts = [instruction_text + title + hashtag for title in batch_titles]

    # Tokenize the batch with padding and truncation
    encoding = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True)
    input_ids = encoding.input_ids.to(device)  # same device as the model
    attention_mask = encoding.attention_mask.to(device)
    prompt_length = input_ids.shape[1]  # padded width shared by every prompt in the batch

    # encode() adds special tokens by default, so element [0] may be the BOS id
    # rather than the id of "####"; disable them to get the real stop token.
    stop_token_id = tokenizer.encode("####", add_special_tokens=False)[0]

    generated_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        # Same budget as the previous "longest prompt + 3" max_length, without
        # tokenizing every prompt a second time.
        max_new_tokens=3,
        # Greedy decoding: the same title always receives the same label.
        do_sample=False,
        eos_token_id=stop_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the tokens produced after the prompt, so the label does not
    # depend on finding the separator again inside the decoded prompt text.
    batch_results = []
    for sequence in generated_ids:
        completion = tokenizer.decode(sequence[prompt_length:], skip_special_tokens=True)
        words = completion.split()
        batch_results.append(words[0] if words else "unknown")

    return batch_results

# Divide the titles into batches
num_batches = ceil(len(titles) / batch_size)
results = []

# Run every batch; the first one is timed to give a rough estimate of the total.
for i in range(num_batches):
    start = i * batch_size
    batch_titles = titles[start:start + batch_size]  # slicing clips at the end of the list

    start_time = time.time()
    results.extend(process_batch(batch_titles))

    if i == 0:
        first_batch_time = time.time() - start_time
        estimated_total = first_batch_time * num_batches
        print(f"First batch time: {first_batch_time:.2f} seconds")
        print(f"Estimated time for all batches: {estimated_total:.2f} seconds")

    # Collect unreachable Python objects first so the tensors they hold are
    # freed, then release PyTorch's unused cached GPU memory.
    gc.collect()
    torch.cuda.empty_cache()

    print(f"Processed batch {i + 1}/{num_batches}")

# Each title must have exactly one label, otherwise rows written later would be misaligned.
assert len(results) == len(titles), "results and titles are misaligned"

# Summarise the labels instead of printing every element of `results`.
label_counts = {}
for label in results:
    label_counts[label] = label_counts.get(label, 0) + 1
print(sorted(label_counts.items(), key=lambda item: item[1], reverse=True)[:10])


First batch time: 4.33 seconds
Estimated time for all batches: 2448.26 seconds
Processed batch 1/565
Processed batch 2/565
Processed batch 3/565
Processed batch 4/565
Processed batch 5/565
Processed batch 6/565
Processed batch 7/565
Processed batch 8/565
Processed batch 9/565
Processed batch 10/565
Processed batch 11/565
Processed batch 12/565
Processed batch 13/565
Processed batch 14/565
Processed batch 15/565
Processed batch 16/565
Processed batch 17/565
Processed batch 18/565
Processed batch 19/565
Processed batch 20/565
Processed batch 21/565
Processed batch 22/565
Processed batch 23/565
Processed batch 24/565
Processed batch 25/565
Processed batch 26/565
Processed batch 27/565
Processed batch 28/565
Processed batch 29/565
Processed batch 30/565
Processed batch 31/565
Processed batch 32/565
Processed batch 33/565
Processed batch 34/565
Processed batch 35/565
Processed batch 36/565
Processed batch 37/565
Processed batch 38/565
Processed batch 39/565
Processed batch 40/565
Processed 

In [None]:
# writing the results into a csv file

import csv
from google.colab import drive

drive.mount('/content/gdrive')

# Destination CSV for this split
file_csv = r"/content/gdrive/MyDrive/dati_sna/sweet_titles_2022_split_1.csv"

# One row per submission: author, predicted label, submission id
# (zip stops at the shortest of the three lists).
rows = list(zip(authors, results, ids))

with open(file_csv, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["author", "classification", "id"])
    writer.writerows(rows)

# Count the rows carrying an expected label; the capitalised spellings are
# accepted as well because the comparison is case-sensitive.
accepted_labels = ("sweet", "savory", "Savory", "Sweet")
i = sum(1 for _, label, _ in rows if label in accepted_labels)
print("percentage of good classification:",i/len(titles)*100,"%")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
percentage of good classification: 96.56246681063476 %


## SPLIT 2

In [None]:
from math import ceil
import time
import torch
import gc

# Select the split handled by this section (split 2).
titles, authors, ids = titles2, authors2, ids2

# Function to calculate token length
def token_length(input_text):
    """Return the number of token ids the global ``tokenizer`` produces for ``input_text``."""
    encoded = tokenizer(input_text, return_tensors="pt")
    return encoded.input_ids.size(-1)

# Set padding token and padding side
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"  # Ensure correct padding for decoder-only architectures

# Few-shot prompt: a task line followed by labelled examples, one per line,
# each written as "<title> #### <label>".
# The crab-cake example is labelled "savory"; it was previously labelled
# "sweet", which gave the model an incorrect example to imitate.
instruction_text = (
    "Classify the following titles into: sweet, savory:\n\n"
    "Maryland style Crab Cakes with clarified butter and lemon. [Homemade] #### savory\n"
    "[homemade] Chili #### savory\n"
    "[homemade] steak sandwiches #### savory\n"
    "[Homemade] Vegetable Pasta Salad with Red Wine Vinegar Vinaigrette #### savory\n"
    "[I ATE] Red velvet black forest cake and lemon custard filled profiteroles with craquelin #### sweet\n"
)

# Hashtag for classification separation
hashtag = " #### "

# Batch size configuration
batch_size = 50

# Function to process each batch
def process_batch(batch_titles):
    """Classify one batch of titles with the few-shot prompt.

    Each title is appended to the global ``instruction_text`` followed by the
    ``hashtag`` separator; the batch is passed to ``model.generate`` and the
    first whitespace-delimited word the model writes after the prompt is used
    as the label.

    Args:
        batch_titles: list of title strings.

    Returns:
        A list with one label string per title, in input order. The label is
        "unknown" when the model wrote no text. An empty batch returns ``[]``.
    """
    if not batch_titles:
        return []

    input_texts = [instruction_text + title + hashtag for title in batch_titles]

    # Tokenize the batch with padding and truncation
    encoding = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True)
    input_ids = encoding.input_ids.to(device)  # same device as the model
    attention_mask = encoding.attention_mask.to(device)
    prompt_length = input_ids.shape[1]  # padded width shared by every prompt in the batch

    # encode() adds special tokens by default, so element [0] may be the BOS id
    # rather than the id of "####"; disable them to get the real stop token.
    stop_token_id = tokenizer.encode("####", add_special_tokens=False)[0]

    generated_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        # Same budget as the previous "longest prompt + 3" max_length, without
        # tokenizing every prompt a second time.
        max_new_tokens=3,
        # Greedy decoding: the same title always receives the same label.
        do_sample=False,
        eos_token_id=stop_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the tokens produced after the prompt, so the label does not
    # depend on finding the separator again inside the decoded prompt text.
    batch_results = []
    for sequence in generated_ids:
        completion = tokenizer.decode(sequence[prompt_length:], skip_special_tokens=True)
        words = completion.split()
        batch_results.append(words[0] if words else "unknown")

    return batch_results

# Divide the titles into batches
num_batches = ceil(len(titles) / batch_size)
results = []

# Run every batch; the first one is timed to give a rough estimate of the total.
for i in range(num_batches):
    start = i * batch_size
    batch_titles = titles[start:start + batch_size]  # slicing clips at the end of the list

    start_time = time.time()
    results.extend(process_batch(batch_titles))

    if i == 0:
        first_batch_time = time.time() - start_time
        estimated_total = first_batch_time * num_batches
        print(f"First batch time: {first_batch_time:.2f} seconds")
        print(f"Estimated time for all batches: {estimated_total:.2f} seconds")

    # Collect unreachable Python objects first so the tensors they hold are
    # freed, then release PyTorch's unused cached GPU memory.
    gc.collect()
    torch.cuda.empty_cache()

    print(f"Processed batch {i + 1}/{num_batches}")

# Each title must have exactly one label, otherwise rows written later would be misaligned.
assert len(results) == len(titles), "results and titles are misaligned"

# Summarise the labels instead of printing every element of `results`.
label_counts = {}
for label in results:
    label_counts[label] = label_counts.get(label, 0) + 1
print(sorted(label_counts.items(), key=lambda item: item[1], reverse=True)[:10])


First batch time: 5.77 seconds
Estimated time for all batches: 3258.44 seconds
Processed batch 1/565
Processed batch 2/565
Processed batch 3/565
Processed batch 4/565
Processed batch 5/565
Processed batch 6/565
Processed batch 7/565
Processed batch 8/565
Processed batch 9/565
Processed batch 10/565
Processed batch 11/565
Processed batch 12/565
Processed batch 13/565
Processed batch 14/565
Processed batch 15/565
Processed batch 16/565
Processed batch 17/565
Processed batch 18/565
Processed batch 19/565
Processed batch 20/565
Processed batch 21/565
Processed batch 22/565
Processed batch 23/565
Processed batch 24/565
Processed batch 25/565
Processed batch 26/565
Processed batch 27/565
Processed batch 28/565
Processed batch 29/565
Processed batch 30/565
Processed batch 31/565
Processed batch 32/565
Processed batch 33/565
Processed batch 34/565
Processed batch 35/565
Processed batch 36/565
Processed batch 37/565
Processed batch 38/565
Processed batch 39/565
Processed batch 40/565
Processed 

In [None]:
# writing the results into a csv file

import csv
from google.colab import drive

drive.mount('/content/gdrive')

# Destination CSV for this split's classifications
file_csv = r"/content/gdrive/MyDrive/dati_sna/sweet_titles_2022_split_2.csv"

# Labels counted as a usable classification. Compared case-insensitively so
# that "Sweet"/"Savory" are counted the same way as "sweet"/"savory".
valid_labels = {"sweet", "savory"}

# zip() stops at the shortest list; report a mismatch instead of dropping rows silently
if not (len(authors) == len(results) == len(ids)):
    written = min(len(authors), len(results), len(ids))
    print(f"warning: authors ({len(authors)}), results ({len(results)}) and ids ({len(ids)}) "
          f"differ in length; only the first {written} rows are written")

good = 0
with open(file_csv, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["author", "classification", "id"])
    # one row per submission: author, predicted label (written as generated), submission id
    for author, label, sub_id in zip(authors, results, ids):
        writer.writerow([author, label, sub_id])
        if label.lower() in valid_labels:
            good += 1

# Share of titles that received one of the expected labels
if titles:
    print("percentage of good classification:", good / len(titles) * 100, "%")
else:
    print("percentage of good classification: n/a (no titles)")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
percentage of good classification: 96.20136651679825 %


## SPLIT 3

In [None]:
from math import ceil
import time
import torch
import gc

# Select the split handled by this cell (change the suffix for other splits)
titles, authors, ids = titles3, authors3, ids3


def token_length(input_text):
    """Return the number of tokens the notebook's tokenizer produces for `input_text`."""
    return tokenizer(input_text, return_tensors="pt").input_ids.shape[-1]


# Reuse EOS as the padding token and pad on the left, so that every prompt in a
# batch ends at the same position (needed for decoder-only generation)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# Few-shot prompt: a task line followed by "<title> #### <label>" examples.
# NOTE(review): the first example labels crab cakes as "sweet", which looks like
# a mislabel. It is left unchanged here so this split uses the same prompt text as before;
# confirm, then correct it in every split and re-run.
instruction_text = (
    "Classify the following titles into: sweet, savory:\n\n"
    "Maryland style Crab Cakes with clarified butter and lemon. [Homemade] #### sweet\n"
    "[homemade] Chili #### savory\n"
    "[homemade] steak sandwiches #### savory\n"
    "[Homemade] Vegetable Pasta Salad with Red Wine Vinegar Vinaigrette #### savory\n"
    "[I ATE] Red velvet black forest cake and lemon custard filled profiteroles with craquelin #### sweet\n"
)

# Separator placed between a title and its label
hashtag = " #### "

# Number of titles sent to the model per generate() call
batch_size = 50

# Function to process each batch
def process_batch(batch_titles, max_new_tokens=3):
    """Classify one batch of titles with the few-shot prompt.

    Each title is appended to `instruction_text` followed by `hashtag`, the
    batch is tokenized with padding, and the model generates a short
    continuation that is read back as the label.

    Relies on the notebook globals `tokenizer`, `model`, `device`,
    `instruction_text` and `hashtag`.

    Args:
        batch_titles: list of title strings.
        max_new_tokens: number of tokens generated after each prompt. The
            default of 3 matches the previous "longest prompt + 3" limit.

    Returns:
        A list with one label per title, in input order. The label is the
        first whitespace-delimited word after the last `hashtag`, or
        "unknown" when nothing usable was generated.
    """
    if not batch_titles:
        return []

    input_texts = [instruction_text + title + hashtag for title in batch_titles]

    # Tokenize the batch with padding and truncation
    encoding = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True)
    input_ids = encoding.input_ids.to(device)  # Move input_ids to the same device as the model
    attention_mask = encoding.attention_mask.to(device)  # Move attention_mask to the same device

    # Stop token: first token of "####". add_special_tokens=False matters when the
    # tokenizer prepends a BOS token on encode(); index 0 would otherwise be BOS.
    stop_token_id = tokenizer.encode("####", add_special_tokens=False)[0]

    # max_new_tokens bounds the continuation directly, so the prompts no longer
    # have to be re-tokenized one by one just to compute a max_length.
    # NOTE(review): do_sample=True makes labels vary between runs; consider
    # greedy decoding (do_sample=False) for a reproducible classification.
    generated_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        eos_token_id=stop_token_id,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode results and classify them
    batch_results = []
    for generated_id in generated_ids:
        generated_text = tokenizer.decode(generated_id, skip_special_tokens=True)

        # The label is whatever follows the last separator in the decoded text
        classification = "unknown"  # Fallback if "####" or the label is missing
        if hashtag in generated_text:
            answer = generated_text.split(hashtag)[-1]
            if answer.strip():
                classification = answer.split()[0]

        batch_results.append(classification)

    return batch_results

# Number of batches needed to cover every title (the last one may be partial)
num_batches = ceil(len(titles) / batch_size)
results = []

# Classify batch by batch; only the first batch is timed
for batch_index in range(num_batches):
    # Slicing clamps at the end of the list, so the last batch may be shorter
    offset = batch_index * batch_size
    batch_titles = titles[offset:offset + batch_size]

    timing_first_batch = batch_index == 0
    if timing_first_batch:
        start_time = time.time()

    results.extend(process_batch(batch_titles))

    if timing_first_batch:
        # Extrapolate the total run time from the first batch alone
        first_batch_time = time.time() - start_time
        average_time = first_batch_time * num_batches  # projected total, not a per-batch average
        print(f"First batch time: {first_batch_time:.2f} seconds")
        print(f"Estimated time for all batches: {average_time:.2f} seconds")

    # Release cached GPU memory and run the garbage collector between batches
    torch.cuda.empty_cache()
    gc.collect()

    print(f"Processed batch {batch_index + 1}/{num_batches}")

# Print every collected classification
print(results)


First batch time: 4.99 seconds
Estimated time for all batches: 2820.27 seconds
Processed batch 1/565
Processed batch 2/565
Processed batch 3/565
Processed batch 4/565
Processed batch 5/565
Processed batch 6/565
Processed batch 7/565
Processed batch 8/565
Processed batch 9/565
Processed batch 10/565
Processed batch 11/565
Processed batch 12/565
Processed batch 13/565
Processed batch 14/565
Processed batch 15/565
Processed batch 16/565
Processed batch 17/565
Processed batch 18/565
Processed batch 19/565
Processed batch 20/565
Processed batch 21/565
Processed batch 22/565
Processed batch 23/565
Processed batch 24/565
Processed batch 25/565
Processed batch 26/565
Processed batch 27/565
Processed batch 28/565
Processed batch 29/565
Processed batch 30/565
Processed batch 31/565
Processed batch 32/565
Processed batch 33/565
Processed batch 34/565
Processed batch 35/565
Processed batch 36/565
Processed batch 37/565
Processed batch 38/565
Processed batch 39/565
Processed batch 40/565
Processed 

In [None]:
# writing the results into a csv file

import csv
from google.colab import drive

drive.mount('/content/gdrive')

# Destination CSV for this split's classifications
file_csv = r"/content/gdrive/MyDrive/dati_sna/sweet_titles_2022_split_3.csv"

# Labels counted as a usable classification. Compared case-insensitively so
# that "Sweet"/"Savory" are counted the same way as "sweet"/"savory".
valid_labels = {"sweet", "savory"}

# zip() stops at the shortest list; report a mismatch instead of dropping rows silently
if not (len(authors) == len(results) == len(ids)):
    written = min(len(authors), len(results), len(ids))
    print(f"warning: authors ({len(authors)}), results ({len(results)}) and ids ({len(ids)}) "
          f"differ in length; only the first {written} rows are written")

good = 0
with open(file_csv, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["author", "classification", "id"])
    # one row per submission: author, predicted label (written as generated), submission id
    for author, label, sub_id in zip(authors, results, ids):
        writer.writerow([author, label, sub_id])
        if label.lower() in valid_labels:
            good += 1

# Share of titles that received one of the expected labels
if titles:
    print("percentage of good classification:", good / len(titles) * 100, "%")
else:
    print("percentage of good classification: n/a (no titles)")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
percentage of good classification: 96.34651467412468 %


## Split Combining

In [None]:
import pandas as pd

# Folder holding the per-split CSV files, and the splits to merge (in order)
dati_sna_path = "/content/gdrive/MyDrive/dati_sna/"
file_names = [
    "sweet_titles_2022_split_1.csv",
    "sweet_titles_2022_split_2.csv",
    "sweet_titles_2022_split_3.csv"
]

# Read and stack the splits. Everything is read as plain text with NA detection
# switched off, so generated labels such as "None" or "N/A" are copied through
# unchanged instead of being turned into empty cells.
combined_df = pd.concat(
    [pd.read_csv(dati_sna_path + file, dtype=str, keep_default_na=False) for file in file_names],
    ignore_index=True,
)

# Save the merged file (the row index is not written)
output_file = dati_sna_path + "sweet_titles_2022.csv"
combined_df.to_csv(output_file, index=False)

print(f"File unito salvato in: {output_file}")


File unito salvato in: /content/gdrive/MyDrive/dati_sna/sweet_titles_2022.csv


# **I ATE-HOMEMADE-PRO/CHEF**

In [None]:
# loading data to be computed

import pandas as pd
import csv
from google.colab import drive

# Mount Google Drive
drive.mount('/content/gdrive')

input_file = r'/content/gdrive/MyDrive/dati_sna/submissions_decimato_2022.csv'

# Load submissions into parallel lists: authors[k], ids[k] and titles[k]
# all describe the same CSV row
authors = []
titles = []
ids = []
# newline='' is what the csv module expects for its input files; the encoding is
# stated explicitly to match the utf-8 used when the results are written
# (assumes the input file is utf-8 -- TODO confirm)
with open(input_file, mode='r', newline='', encoding='utf-8') as file:
    csvreader = csv.reader(file)
    next(csvreader, None)  # skip the header row; tolerate an empty file
    for row in csvreader:
        if not row:
            continue  # blank line: nothing to load
        # discarding deleted users
        if row[0] != 'u/[deleted]':
            # unpacking also checks that the row has exactly five columns
            author, sub_id, title, score, created = row
            authors.append(author)
            ids.append(sub_id)
            titles.append(title)
if len(authors) != len(titles):
    print("error, authors and titles are not the same length")

print(len(titles), "titles to classify")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
84741 titles to classify


In [None]:
import re

def classify_titles(titles):
    """Tag each title with the first keyword rule it matches.

    Rules are tried in order against the lower-cased title, matching whole
    words: "i ate"/"ate" -> 0, "homemade" -> 1, "pro"/"chef" -> 2. A title
    matching none of them gets the string "UNKNOWN".

    Returns a list with one entry per title, in input order.
    """
    rules = (
        (r'\b(i ate|ate)\b', 0),
        (r'\b(homemade)\b', 1),
        (r'\b(pro|chef)\b', 2),
    )

    def tag_for(title):
        lowered = title.lower()
        for pattern, tag in rules:
            if re.search(pattern, lowered):
                return tag
        return "UNKNOWN"

    return [tag_for(title) for title in titles]

In [None]:
# writing the results into a csv file

import csv
from google.colab import drive

drive.mount('/content/gdrive')

# Tag every title: 0, 1 or 2 for a recognised tag, "UNKNOWN" otherwise
tags = classify_titles(titles)

# Destination CSV for the tag classification
file_csv = r"/content/gdrive/MyDrive/dati_sna/tag_titles_2022.csv"

# zip() stops at the shortest list; report a mismatch instead of dropping rows silently
if not (len(authors) == len(tags) == len(ids)):
    written = min(len(authors), len(tags), len(ids))
    print(f"warning: authors ({len(authors)}), tags ({len(tags)}) and ids ({len(ids)}) "
          f"differ in length; only the first {written} rows are written")

good = 0
with open(file_csv, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["author", "classification", "id"])
    # one row per submission: author, tag, submission id
    for author, tag, sub_id in zip(authors, tags, ids):
        writer.writerow([author, tag, sub_id])
        if tag in (0, 1, 2):
            good += 1

# Share of titles that matched one of the three tags
if titles:
    print("percentage of good classification:", good / len(titles) * 100, "%")
else:
    print("percentage of good classification: n/a (no titles)")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
percentage of good classification: 99.96341794408846 %


# ETHNIC CLASSIFICATION

## SPLIT 1

In [None]:
from math import ceil
import time
import torch
import gc

# Select the split handled by this cell (change the suffix for other splits)
titles, authors, ids = titles1, authors1, ids1


def token_length(input_text):
    """Return the number of tokens the notebook's tokenizer produces for `input_text`."""
    return tokenizer(input_text, return_tensors="pt").input_ids.shape[-1]


# Reuse EOS as the padding token and pad on the left, so that every prompt in a
# batch ends at the same position (needed for decoder-only generation)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# Few-shot prompt: a task line followed by "<title> #### <label>" examples
instruction_text = (
    "Classify the following titles into the most probable cuisine ethnicity: Italian, American, Mexican, French, Chinese, Indian, Japanese, Middle Eastern, etc.\n\n"
    "[homemade] hummus I made today #### Middle Eastern\n"
    "[Homemade] Beef Shoyu Ramen #### Japanese\n"
    "[homemade] curry chicken, rice, & naan #### Indian\n"
    "[homemade] Chicken tacos and guacamole #### Mexican\n"
    "[Homemade] bacon and artichoke hearts pizza #### Italian\n"
)

# Separator placed between a title and its label
hashtag = " #### "

# Number of titles sent to the model per generate() call
batch_size = 50

# Function to process each batch
def process_batch(batch_titles, max_new_tokens=3):
    """Classify one batch of titles with the few-shot prompt.

    Each title is appended to `instruction_text` followed by `hashtag`, the
    batch is tokenized with padding, and the model generates a short
    continuation that is read back as the label.

    Relies on the notebook globals `tokenizer`, `model`, `device`,
    `instruction_text` and `hashtag`.

    Args:
        batch_titles: list of title strings.
        max_new_tokens: number of tokens generated after each prompt. The
            default of 3 matches the previous "longest prompt + 3" limit;
            raise it if longer labels come back cut off.

    Returns:
        A list with one label per title, in input order. The label is the
        first line of text after the last `hashtag` (so multi-word labels
        such as "Middle Eastern" are kept whole), or "unknown" when nothing
        usable was generated.
    """
    if not batch_titles:
        return []

    input_texts = [instruction_text + title + hashtag for title in batch_titles]

    # Tokenize the batch with padding and truncation
    encoding = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True)
    input_ids = encoding.input_ids.to(device)  # Move input_ids to the same device as the model
    attention_mask = encoding.attention_mask.to(device)  # Move attention_mask to the same device

    # Stop token: first token of "####". add_special_tokens=False matters when the
    # tokenizer prepends a BOS token on encode(); index 0 would otherwise be BOS.
    stop_token_id = tokenizer.encode("####", add_special_tokens=False)[0]

    # max_new_tokens bounds the continuation directly, so the prompts no longer
    # have to be re-tokenized one by one just to compute a max_length.
    # NOTE(review): do_sample=True makes labels vary between runs; consider
    # greedy decoding (do_sample=False) for a reproducible classification.
    generated_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        eos_token_id=stop_token_id,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode results and classify them
    batch_results = []
    for generated_id in generated_ids:
        generated_text = tokenizer.decode(generated_id, skip_special_tokens=True)

        # The label is the first line after the last separator. Taking only the
        # first word would cut "Middle Eastern" down to "Middle".
        classification = "unknown"  # Fallback if "####" or the label is missing
        if hashtag in generated_text:
            answer = generated_text.split(hashtag)[-1].strip()
            if answer:
                classification = answer.splitlines()[0].strip()

        batch_results.append(classification)

    return batch_results

# Number of batches needed to cover every title (the last one may be partial)
num_batches = ceil(len(titles) / batch_size)
results = []

# Classify batch by batch; only the first batch is timed
for batch_index in range(num_batches):
    # Slicing clamps at the end of the list, so the last batch may be shorter
    offset = batch_index * batch_size
    batch_titles = titles[offset:offset + batch_size]

    timing_first_batch = batch_index == 0
    if timing_first_batch:
        start_time = time.time()

    results.extend(process_batch(batch_titles))

    if timing_first_batch:
        # Extrapolate the total run time from the first batch alone
        first_batch_time = time.time() - start_time
        average_time = first_batch_time * num_batches  # projected total, not a per-batch average
        print(f"First batch time: {first_batch_time:.2f} seconds")
        print(f"Estimated time for all batches: {average_time:.2f} seconds")

    # Release cached GPU memory and run the garbage collector between batches
    torch.cuda.empty_cache()
    gc.collect()

    print(f"Processed batch {batch_index + 1}/{num_batches}")

# Print every collected classification
print(results)


First batch time: 4.73 seconds
Estimated time for all batches: 2671.32 seconds
Processed batch 1/565
Processed batch 2/565
Processed batch 3/565
Processed batch 4/565
Processed batch 5/565
Processed batch 6/565
Processed batch 7/565
Processed batch 8/565
Processed batch 9/565
Processed batch 10/565
Processed batch 11/565
Processed batch 12/565
Processed batch 13/565
Processed batch 14/565
Processed batch 15/565
Processed batch 16/565
Processed batch 17/565
Processed batch 18/565
Processed batch 19/565
Processed batch 20/565
Processed batch 21/565
Processed batch 22/565
Processed batch 23/565
Processed batch 24/565
Processed batch 25/565
Processed batch 26/565
Processed batch 27/565
Processed batch 28/565
Processed batch 29/565
Processed batch 30/565
Processed batch 31/565
Processed batch 32/565
Processed batch 33/565
Processed batch 34/565
Processed batch 35/565
Processed batch 36/565
Processed batch 37/565
Processed batch 38/565
Processed batch 39/565
Processed batch 40/565
Processed 

In [None]:
# writing the results into a csv file

import csv
from google.colab import drive

drive.mount('/content/gdrive')

# Destination CSV for this split's classifications
file_csv = r"/content/gdrive/MyDrive/dati_sna/ethnic_titles_2022_split_1.csv"

# zip() stops at the shortest list; report a mismatch instead of dropping rows silently
if not (len(authors) == len(results) == len(ids)):
    written = min(len(authors), len(results), len(ids))
    print(f"warning: authors ({len(authors)}), results ({len(results)}) and ids ({len(ids)}) "
          f"differ in length; only the first {written} rows are written")

with open(file_csv, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["author", "classification", "id"])
    # one row per submission: author, predicted label, submission id
    for author, label, sub_id in zip(authors, results, ids):
        writer.writerow([author, label, sub_id])

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## SPLIT 2

In [None]:
from math import ceil
import time
import torch
import gc

# Select the split handled by this cell (change the suffix for other splits)
titles, authors, ids = titles2, authors2, ids2


def token_length(input_text):
    """Return the number of tokens the notebook's tokenizer produces for `input_text`."""
    return tokenizer(input_text, return_tensors="pt").input_ids.shape[-1]


# Reuse EOS as the padding token and pad on the left, so that every prompt in a
# batch ends at the same position (needed for decoder-only generation)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# Few-shot prompt: a task line followed by "<title> #### <label>" examples
instruction_text = (
    "Classify the following titles into the most probable cuisine ethnicity: Italian, American, Mexican, French, Chinese, Indian, Japanese, Middle Eastern, etc.\n\n"
    "[homemade] hummus I made today #### Middle Eastern\n"
    "[Homemade] Beef Shoyu Ramen #### Japanese\n"
    "[homemade] curry chicken, rice, & naan #### Indian\n"
    "[homemade] Chicken tacos and guacamole #### Mexican\n"
    "[Homemade] bacon and artichoke hearts pizza #### Italian\n"
)

# Separator placed between a title and its label
hashtag = " #### "

# Number of titles sent to the model per generate() call
batch_size = 50

# Function to process each batch
def process_batch(batch_titles, max_new_tokens=3):
    """Classify one batch of titles with the few-shot prompt.

    Each title is appended to `instruction_text` followed by `hashtag`, the
    batch is tokenized with padding, and the model generates a short
    continuation that is read back as the label.

    Relies on the notebook globals `tokenizer`, `model`, `device`,
    `instruction_text` and `hashtag`.

    Args:
        batch_titles: list of title strings.
        max_new_tokens: number of tokens generated after each prompt. The
            default of 3 matches the previous "longest prompt + 3" limit;
            raise it if longer labels come back cut off.

    Returns:
        A list with one label per title, in input order. The label is the
        first line of text after the last `hashtag` (so multi-word labels
        such as "Middle Eastern" are kept whole), or "unknown" when nothing
        usable was generated.
    """
    if not batch_titles:
        return []

    input_texts = [instruction_text + title + hashtag for title in batch_titles]

    # Tokenize the batch with padding and truncation
    encoding = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True)
    input_ids = encoding.input_ids.to(device)  # Move input_ids to the same device as the model
    attention_mask = encoding.attention_mask.to(device)  # Move attention_mask to the same device

    # Stop token: first token of "####". add_special_tokens=False matters when the
    # tokenizer prepends a BOS token on encode(); index 0 would otherwise be BOS.
    stop_token_id = tokenizer.encode("####", add_special_tokens=False)[0]

    # max_new_tokens bounds the continuation directly, so the prompts no longer
    # have to be re-tokenized one by one just to compute a max_length.
    # NOTE(review): do_sample=True makes labels vary between runs; consider
    # greedy decoding (do_sample=False) for a reproducible classification.
    generated_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        eos_token_id=stop_token_id,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode results and classify them
    batch_results = []
    for generated_id in generated_ids:
        generated_text = tokenizer.decode(generated_id, skip_special_tokens=True)

        # The label is the first line after the last separator. Taking only the
        # first word would cut "Middle Eastern" down to "Middle".
        classification = "unknown"  # Fallback if "####" or the label is missing
        if hashtag in generated_text:
            answer = generated_text.split(hashtag)[-1].strip()
            if answer:
                classification = answer.splitlines()[0].strip()

        batch_results.append(classification)

    return batch_results

# Number of batches needed to cover every title (the last one may be partial)
num_batches = ceil(len(titles) / batch_size)
results = []

# Classify batch by batch; only the first batch is timed
for batch_index in range(num_batches):
    # Slicing clamps at the end of the list, so the last batch may be shorter
    offset = batch_index * batch_size
    batch_titles = titles[offset:offset + batch_size]

    timing_first_batch = batch_index == 0
    if timing_first_batch:
        start_time = time.time()

    results.extend(process_batch(batch_titles))

    if timing_first_batch:
        # Extrapolate the total run time from the first batch alone
        first_batch_time = time.time() - start_time
        average_time = first_batch_time * num_batches  # projected total, not a per-batch average
        print(f"First batch time: {first_batch_time:.2f} seconds")
        print(f"Estimated time for all batches: {average_time:.2f} seconds")

    # Release cached GPU memory and run the garbage collector between batches
    torch.cuda.empty_cache()
    gc.collect()

    print(f"Processed batch {batch_index + 1}/{num_batches}")

# Print every collected classification
print(results)


First batch time: 5.78 seconds
Estimated time for all batches: 3263.58 seconds
Processed batch 1/565
Processed batch 2/565
Processed batch 3/565
Processed batch 4/565
Processed batch 5/565
Processed batch 6/565
Processed batch 7/565
Processed batch 8/565
Processed batch 9/565
Processed batch 10/565
Processed batch 11/565
Processed batch 12/565
Processed batch 13/565
Processed batch 14/565
Processed batch 15/565
Processed batch 16/565
Processed batch 17/565
Processed batch 18/565
Processed batch 19/565
Processed batch 20/565
Processed batch 21/565
Processed batch 22/565
Processed batch 23/565
Processed batch 24/565
Processed batch 25/565
Processed batch 26/565
Processed batch 27/565
Processed batch 28/565
Processed batch 29/565
Processed batch 30/565
Processed batch 31/565
Processed batch 32/565
Processed batch 33/565
Processed batch 34/565
Processed batch 35/565
Processed batch 36/565
Processed batch 37/565
Processed batch 38/565
Processed batch 39/565
Processed batch 40/565
Processed 

In [None]:
# writing the results into a csv file

import csv
from google.colab import drive

drive.mount('/content/gdrive')

# Destination CSV for this split's classifications
file_csv = r"/content/gdrive/MyDrive/dati_sna/ethnic_titles_2022_split_2.csv"

# zip() stops at the shortest list; report a mismatch instead of dropping rows silently
if not (len(authors) == len(results) == len(ids)):
    written = min(len(authors), len(results), len(ids))
    print(f"warning: authors ({len(authors)}), results ({len(results)}) and ids ({len(ids)}) "
          f"differ in length; only the first {written} rows are written")

with open(file_csv, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["author", "classification", "id"])
    # one row per submission: author, predicted label, submission id
    for author, label, sub_id in zip(authors, results, ids):
        writer.writerow([author, label, sub_id])

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## SPLIT 3

In [None]:
from math import ceil
import time
import torch
import gc

# Select the split handled by this cell (change the suffix for other splits)
titles, authors, ids = titles3, authors3, ids3


def token_length(input_text):
    """Return the number of tokens the notebook's tokenizer produces for `input_text`."""
    return tokenizer(input_text, return_tensors="pt").input_ids.shape[-1]


# Reuse EOS as the padding token and pad on the left, so that every prompt in a
# batch ends at the same position (needed for decoder-only generation)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# Few-shot prompt: a task line followed by "<title> #### <label>" examples
instruction_text = (
    "Classify the following titles into the most probable cuisine ethnicity: Italian, American, Mexican, French, Chinese, Indian, Japanese, Middle Eastern, etc.\n\n"
    "[homemade] hummus I made today #### Middle Eastern\n"
    "[Homemade] Beef Shoyu Ramen #### Japanese\n"
    "[homemade] curry chicken, rice, & naan #### Indian\n"
    "[homemade] Chicken tacos and guacamole #### Mexican\n"
    "[Homemade] bacon and artichoke hearts pizza #### Italian\n"
)

# Separator placed between a title and its label
hashtag = " #### "

# Number of titles sent to the model per generate() call
batch_size = 50

# Function to process each batch
def process_batch(batch_titles, max_new_tokens=3):
    """Classify one batch of titles with the few-shot prompt.

    Each title is appended to `instruction_text` followed by `hashtag`, the
    batch is tokenized with padding, and the model generates a short
    continuation that is read back as the label.

    Relies on the notebook globals `tokenizer`, `model`, `device`,
    `instruction_text` and `hashtag`.

    Args:
        batch_titles: list of title strings.
        max_new_tokens: number of tokens generated after each prompt. The
            default of 3 matches the previous "longest prompt + 3" limit;
            raise it if longer labels come back cut off.

    Returns:
        A list with one label per title, in input order. The label is the
        first line of text after the last `hashtag` (so multi-word labels
        such as "Middle Eastern" are kept whole), or "unknown" when nothing
        usable was generated.
    """
    if not batch_titles:
        return []

    input_texts = [instruction_text + title + hashtag for title in batch_titles]

    # Tokenize the batch with padding and truncation
    encoding = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True)
    input_ids = encoding.input_ids.to(device)  # Move input_ids to the same device as the model
    attention_mask = encoding.attention_mask.to(device)  # Move attention_mask to the same device

    # Stop token: first token of "####". add_special_tokens=False matters when the
    # tokenizer prepends a BOS token on encode(); index 0 would otherwise be BOS.
    stop_token_id = tokenizer.encode("####", add_special_tokens=False)[0]

    # max_new_tokens bounds the continuation directly, so the prompts no longer
    # have to be re-tokenized one by one just to compute a max_length.
    # NOTE(review): do_sample=True makes labels vary between runs; consider
    # greedy decoding (do_sample=False) for a reproducible classification.
    generated_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        eos_token_id=stop_token_id,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode results and classify them
    batch_results = []
    for generated_id in generated_ids:
        generated_text = tokenizer.decode(generated_id, skip_special_tokens=True)

        # The label is the first line after the last separator. Taking only the
        # first word would cut "Middle Eastern" down to "Middle".
        classification = "unknown"  # Fallback if "####" or the label is missing
        if hashtag in generated_text:
            answer = generated_text.split(hashtag)[-1].strip()
            if answer:
                classification = answer.splitlines()[0].strip()

        batch_results.append(classification)

    return batch_results

# Number of batches needed to cover every title (the last one may be partial)
num_batches = ceil(len(titles) / batch_size)
results = []

# Classify batch by batch; only the first batch is timed
for batch_index in range(num_batches):
    # Slicing clamps at the end of the list, so the last batch may be shorter
    offset = batch_index * batch_size
    batch_titles = titles[offset:offset + batch_size]

    timing_first_batch = batch_index == 0
    if timing_first_batch:
        start_time = time.time()

    results.extend(process_batch(batch_titles))

    if timing_first_batch:
        # Extrapolate the total run time from the first batch alone
        first_batch_time = time.time() - start_time
        average_time = first_batch_time * num_batches  # projected total, not a per-batch average
        print(f"First batch time: {first_batch_time:.2f} seconds")
        print(f"Estimated time for all batches: {average_time:.2f} seconds")

    # Release cached GPU memory and run the garbage collector between batches
    torch.cuda.empty_cache()
    gc.collect()

    print(f"Processed batch {batch_index + 1}/{num_batches}")

# Print every collected classification
print(results)


First batch time: 4.22 seconds
Estimated time for all batches: 2385.83 seconds
Processed batch 1/565
Processed batch 2/565
Processed batch 3/565
Processed batch 4/565
Processed batch 5/565
Processed batch 6/565
Processed batch 7/565
Processed batch 8/565
Processed batch 9/565
Processed batch 10/565
Processed batch 11/565
Processed batch 12/565
Processed batch 13/565
Processed batch 14/565
Processed batch 15/565
Processed batch 16/565
Processed batch 17/565
Processed batch 18/565
Processed batch 19/565
Processed batch 20/565
Processed batch 21/565
Processed batch 22/565
Processed batch 23/565
Processed batch 24/565
Processed batch 25/565
Processed batch 26/565
Processed batch 27/565
Processed batch 28/565
Processed batch 29/565
Processed batch 30/565
Processed batch 31/565
Processed batch 32/565
Processed batch 33/565
Processed batch 34/565
Processed batch 35/565
Processed batch 36/565
Processed batch 37/565
Processed batch 38/565
Processed batch 39/565
Processed batch 40/565
Processed 

In [None]:
# writing the results into a csv file

import csv
from google.colab import drive

drive.mount('/content/gdrive')

# Destination CSV for this split's classifications
file_csv = r"/content/gdrive/MyDrive/dati_sna/ethnic_titles_2022_split_3.csv"

# zip() stops at the shortest list; report a mismatch instead of dropping rows silently
if not (len(authors) == len(results) == len(ids)):
    written = min(len(authors), len(results), len(ids))
    print(f"warning: authors ({len(authors)}), results ({len(results)}) and ids ({len(ids)}) "
          f"differ in length; only the first {written} rows are written")

with open(file_csv, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["author", "classification", "id"])
    # one row per submission: author, predicted label, submission id
    for author, label, sub_id in zip(authors, results, ids):
        writer.writerow([author, label, sub_id])

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Split Combining

In [None]:
import pandas as pd

# Folder holding the per-split CSV files, and the splits to merge (in order)
dati_sna_path = "/content/gdrive/MyDrive/dati_sna/"
file_names = [
    "ethnic_titles_2022_split_1.csv",
    "ethnic_titles_2022_split_2.csv",
    "ethnic_titles_2022_split_3.csv"
]

# Read and stack the splits. Everything is read as plain text with NA detection
# switched off, so generated labels such as "None" or "N/A" are copied through
# unchanged instead of being turned into empty cells.
combined_df = pd.concat(
    [pd.read_csv(dati_sna_path + file, dtype=str, keep_default_na=False) for file in file_names],
    ignore_index=True,
)

# Save the merged file (the row index is not written)
output_file = dati_sna_path + "ethnic_titles_2022.csv"
combined_df.to_csv(output_file, index=False)

print(f"File unito salvato in: {output_file}")


File unito salvato in: /content/gdrive/MyDrive/dati_sna/ethnic_titles_2022.csv
