In [1]:
!pip install -q transformers accelerate peft bitsandbytes datasets kaggle


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
from google.colab import files

os.makedirs("/root/.kaggle", exist_ok=True)

uploaded = files.upload()
for fn in uploaded.keys():
    if fn == "kaggle.json":
        os.rename("kaggle.json", "/root/.kaggle/kaggle.json")

!chmod 600 /root/.kaggle/kaggle.json

# Download the anime dataset
!kaggle datasets download -d lorentzyeung/all-japanese-anime-titles-in-imdb -p ./data
!unzip -o ./data/all-japanese-anime-titles-in-imdb.zip -d ./data


Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/lorentzyeung/all-japanese-anime-titles-in-imdb
License(s): ODbL-1.0
Downloading all-japanese-anime-titles-in-imdb.zip to ./data
  0% 0.00/3.35M [00:00<?, ?B/s]
100% 3.35M/3.35M [00:00<00:00, 772MB/s]
Archive:  ./data/all-japanese-anime-titles-in-imdb.zip
  inflating: ./data/imdb_anime.csv   


In [3]:
import pandas as pd
import os

# Detect the CSV file
csv_files = [f for f in os.listdir("./data") if f.endswith(".csv")]
csv_path = "./data/" + csv_files[0]
df = pd.read_csv(csv_path)

print(df.head())
print("Columns:", df.columns.tolist())

def row_to_text(row):
    title = str(row.get("primaryTitle", row.get("title", "")))
    year  = str(row.get("startYear", ""))
    genres = str(row.get("genres", "")).replace(",", ", ")
    rating = row.get("averageRating", "")

    line = f"{title} ({year}) is an anime with genres {genres}. It has an IMDb rating of {rating}."
    return line

corpus = "\n".join(df.apply(row_to_text, axis=1).tolist())
len(corpus)


                                         Title                         Genre  \
0                                    One Piece  Animation, Action, Adventure   
1  Teenage Mutant Ninja Turtles: Mutant Mayhem  Animation, Action, Adventure   
2                  The Super Mario Bros. Movie  Animation, Adventure, Comedy   
3                              Attack on Titan  Animation, Action, Adventure   
4                               Jujutsu Kaisen  Animation, Action, Adventure   

  User Rating Number of Votes Runtime         Year  \
0         8.9         187,689  24 min     (1999– )   
1         7.4          28,895  99 min       (2023)   
2         7.1         189,108  92 min       (2023)   
3         9.1         434,457  24 min  (2013–2023)   
4         8.5          82,909  24 min     (2020– )   

                                             Summary  \
0  Follows the adventures of Monkey D. Luffy and ...   
1  The film follows the Turtle brothers as they w...   
2  A plumber named Mario t

2605868

In [4]:
from datasets import Dataset

dataset = Dataset.from_dict({"text": corpus.split("\n")})
dataset


Dataset({
    features: ['text'],
    num_rows: 45717
})

In [5]:
model_name = "Qwen/Qwen1.5-0.5B"

from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    device_map="auto"
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

In [6]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 3,145,728 || all params: 467,133,440 || trainable%: 0.6734


In [7]:
def tokenize(example):
    return tokenizer(example["text"], truncation=True)

tokenized = dataset.map(tokenize, batched=True, remove_columns=["text"])
tokenized


Map:   0%|          | 0/45717 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 45717
})

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="./weebGPT",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    learning_rate=2e-4,
    logging_steps=20,
    save_steps=500,
    fp16=True,
    optim="paged_adamw_32bit"
)

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    data_collator=data_collator,
)


In [None]:
        trainer.train()
model.save_pretrained("./weebGPT")
tokenizer.save_pretrained("./weebGPT")


In [None]:
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model="./weebGPT",
    tokenizer="./weebGPT",
    device_map="auto",
    max_new_tokens=200,
)

def ask_weeb(question):
    prompt = f"You are WeebGPT, the ultimate anime expert. {question}"
    out = pipe(prompt)[0]["generated_text"]
    return out

print(ask_weeb("Explain why Attack on Titan is so popular."))
print(ask_weeb("Recommend anime similar to Cowboy Bebop."))
print(ask_weeb("Describe the plot of a new original mecha anime."))
