In [1]:
!pip install -q transformers accelerate peft bitsandbytes datasets kaggle


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
from google.colab import files

os.makedirs("/root/.kaggle", exist_ok=True)

uploaded = files.upload()
for fn in uploaded.keys():
    if fn == "kaggle.json":
        os.rename("kaggle.json", "/root/.kaggle/kaggle.json")

!chmod 600 /root/.kaggle/kaggle.json

# Download the anime dataset
!kaggle datasets download -d lorentzyeung/all-japanese-anime-titles-in-imdb -p ./data
!unzip -o ./data/all-japanese-anime-titles-in-imdb.zip -d ./data


Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/lorentzyeung/all-japanese-anime-titles-in-imdb
License(s): ODbL-1.0
Downloading all-japanese-anime-titles-in-imdb.zip to ./data
  0% 0.00/3.35M [00:00<?, ?B/s]
100% 3.35M/3.35M [00:00<00:00, 509MB/s]
Archive:  ./data/all-japanese-anime-titles-in-imdb.zip
  inflating: ./data/imdb_anime.csv   


In [3]:
import pandas as pd
import os

# Detect the CSV file
csv_files = [f for f in os.listdir("./data") if f.endswith(".csv")]
csv_path = "./data/" + csv_files[0]
df = pd.read_csv(csv_path)

print(df.head())
print("Columns:", df.columns.tolist())

def row_to_text(row):
    title = str(row.get("primaryTitle", row.get("title", "")))
    year  = str(row.get("startYear", ""))
    genres = str(row.get("genres", "")).replace(",", ", ")
    rating = row.get("averageRating", "")

    line = f"{title} ({year}) is an anime with genres {genres}. It has an IMDb rating of {rating}."
    return line

corpus = "\n".join(df.apply(row_to_text, axis=1).tolist())
len(corpus)


                                         Title                         Genre  \
0                                    One Piece  Animation, Action, Adventure   
1  Teenage Mutant Ninja Turtles: Mutant Mayhem  Animation, Action, Adventure   
2                  The Super Mario Bros. Movie  Animation, Adventure, Comedy   
3                              Attack on Titan  Animation, Action, Adventure   
4                               Jujutsu Kaisen  Animation, Action, Adventure   

  User Rating Number of Votes Runtime         Year  \
0         8.9         187,689  24 min     (1999– )   
1         7.4          28,895  99 min       (2023)   
2         7.1         189,108  92 min       (2023)   
3         9.1         434,457  24 min  (2013–2023)   
4         8.5          82,909  24 min     (2020– )   

                                             Summary  \
0  Follows the adventures of Monkey D. Luffy and ...   
1  The film follows the Turtle brothers as they w...   
2  A plumber named Mario t

2605868

In [4]:
from datasets import Dataset

dataset = Dataset.from_dict({"text": corpus.split("\n")})
dataset


Dataset({
    features: ['text'],
    num_rows: 45717
})

In [5]:
model_name = "Qwen/Qwen1.5-0.5B"

from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    device_map="auto"
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

In [6]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 3,145,728 || all params: 467,133,440 || trainable%: 0.6734


In [7]:
def tokenize(example):
    return tokenizer(example["text"], truncation=True)

tokenized = dataset.map(tokenize, batched=True, remove_columns=["text"])
tokenized


Map:   0%|          | 0/45717 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 45717
})

In [8]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling,EarlyStoppingCallback
from transformers.trainer_utils import IntervalStrategy
training_args = TrainingArguments(
    output_dir="./weebGPT",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=1,               # keep small for small dataset
    learning_rate=5e-5,
    warmup_steps=50,
    logging_steps=20,

    eval_strategy=IntervalStrategy.STEPS, # Re-enabling evaluation strategy
    eval_steps=100, # Re-enabling eval steps

    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,

    fp16=True,
    optim="paged_adamw_32bit",

    # REQUIRED for early stopping:
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    report_to="none",
)
# Make a mini validation set (5% or so)
val_dataset = tokenized.select(range(int(len(tokenized) * 0.05)))

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], # Re-enabling early stopping
)


In [9]:
trainer.train()
model.save_pretrained("./weebGPT")
tokenizer.save_pretrained("./weebGPT")


Step,Training Loss,Validation Loss
100,0.0256,0.009701
200,0.0013,0.00116
300,0.0005,0.000511
400,0.0003,0.000304
500,0.0002,0.000202
600,0.0002,0.000145
700,0.0001,0.00011
800,0.0001,8.5e-05
900,0.0001,6.9e-05
1000,0.0001,5.7e-05


('./weebGPT/tokenizer_config.json',
 './weebGPT/special_tokens_map.json',
 './weebGPT/chat_template.jinja',
 './weebGPT/vocab.json',
 './weebGPT/merges.txt',
 './weebGPT/added_tokens.json',
 './weebGPT/tokenizer.json')

In [16]:
from transformers import pipeline, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "./weebGPT",
    fix_mistral_regex=True,
    legacy=False,
    use_fast=True
)

pipe = pipeline(
    "text-generation",
    model="./weebGPT",
    tokenizer=tokenizer,
    device_map="auto",
    max_new_tokens=200,
)

def ask_weeb(question):
    prompt = f"You are WeebGPT, the ultimate anime expert. {question}"
    out = pipe(prompt)[0]["generated_text"]
    return out

print(ask_weeb("Explain why Attack on Titan is so popular."))
print(ask_weeb("Recommend anime similar to Cowboy Bebop."))

Device set to use cuda:0


You are WeebGPT, the ultimate anime expert. Explain why Attack on Titan is so popular. What makes anime so popular? Why is it difficult to watch it? What are the main characters and how is it structured? What makes the anime so unique? Why is it so long? And finally, what are the main themes? Write an essay that answers these questions.
We are currently working on an essay that will be published on our blog. We will be posting a new article on the topic of Anime every week. We will be asking you to write an essay that answers the questions we posed earlier in the article. You will be given three weeks to write the essay. Once you have written your essay, we will ask you to provide feedback on it. We will then post your essay and your feedback to our blog.

The Anime is so popular because it combines elements of science fiction, action, and drama into a unique and engaging style. It has become the most popular television genre in Japan and has been translated into over 150 languages wor