In [1]:
# Imports
from autogluon.multimodal import MultiModalPredictor
import os
import pandas as pd
import tarfile
from pathlib import Path
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm
  import pkg_resources


In [10]:
import gc

# Force a full garbage-collection pass (generation 2 is the oldest generation,
# so every generation is collected). The call returns an integer count, which
# the notebook shows as the cell output.
gc.collect(generation=2)

0

In [7]:
# Data loading and cleaning

filename = "amazon_review_full_csv.tar.gz"
filepath = Path.cwd().joinpath("data", filename)
columns = ["Label", "Title", "Description"]

# Read both CSV members directly out of the gzipped tar archive. The files
# are parsed without a header row, so pandas numbers the columns at first.
with tarfile.open(filepath, "r:gz") as tar:
    df_test = pd.read_csv(
        tar.extractfile("amazon_review_full_csv/test.csv"),
        header=None,
    )
    df_train = pd.read_csv(
        tar.extractfile("amazon_review_full_csv/train.csv"),
        header=None,
    )

# Training split: drop every row that contains a missing value, renumber the
# index from 0, then attach the column names.
df_train = df_train.dropna().reset_index(drop=True)
df_train.columns = columns

# Test split: same cleaning steps as the training split.
df_test = df_test.dropna().reset_index(drop=True)
df_test.columns = columns

In [7]:
# Create the output folders (no error if they already exist)

for folder in ("./Plots", "./AutoGluon_Models"):
    os.makedirs(folder, exist_ok=True)


In [9]:
# Preview the first five rows of the cleaned test split.
df_test.head()

Unnamed: 0,Label,Title,Description
0,1,mens ultrasheer,"This model may be ok for sedentary types, but ..."
1,4,Surprisingly delightful,This is a fast read filled with unexpected hum...
2,2,"Works, but not as advertised",I bought one of these chargers..the instructio...
3,2,Oh dear,I was excited to find a book ostensibly about ...
4,2,Incorrect disc!,"I am a big JVC fan, but I do not like this mod..."


In [None]:
# Training with AutoGluon

os.makedirs("./AutoGluon_Models", exist_ok=True)

predictor = MultiModalPredictor(
    label="Label",
    problem_type="classification",
    eval_metric="accuracy",
    presets="medium_quality",
    use_ensemble=False,
    path="./AutoGluon_Models",
    verbosity=2,
    enable_progress_bar=True,
)

# Training is capped at 24 hours, expressed in seconds.
time_limit_seconds = 24 * 60 * 60

# Both review fields are passed explicitly as text columns.
predictor.fit(
    train_data=df_train,
    column_types={"Title": "text", "Description": "text"},
    time_limit=time_limit_seconds,
    seed=42,
)


AutoGluon Version:  1.4.0
Python Version:     3.11.13
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          16
Pytorch Version:    2.4.1+cu121
CUDA Version:       12.1
GPU Count:          1
Memory Avail:       3.94 GB / 15.42 GB (25.6%)
Disk Space Avail:   170.39 GB / 453.16 GB (37.6%)
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == int, but few unique label-values observed).
	5 unique label values:  [np.int64(3), np.int64(5), np.int64(4), np.int64(1), np.int64(2)]
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type parameter during Predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression', 'quantile'])

AutoMM starts to create your model. ✨✨✨

To track the learning progress, you can open a terminal and launch Tensorboard:
    ```shell
    # Assume you have installed tensorboard
    tensorboard --logdir C:\Users\achim\

                                                                           

In [3]:
# Evaluation of the AutoGluon models

# Restore the predictor from the directory it was saved to during training.
predictor = MultiModalPredictor.load(path="./AutoGluon_Models")


Load pretrained checkpoint: C:\Users\achim\PycharmProjects\AutoGluon\AutoGluon_Models\model.ckpt


In [24]:
from sklearn.metrics import classification_report

# Predict from the two text columns only; the label column is held back and
# used solely as ground truth for the report.
test_sample = df_test[["Title", "Description"]]
y_pred = predictor.predict(data=test_sample, as_pandas=True)

# Text report comparing the true labels with the predictions.
report = classification_report(y_true=df_test["Label"], y_pred=y_pred)


💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.


Predicting DataLoader 0: 100%|██████████| 20312/20312 [09:15<00:00, 36.58it/s]


In [25]:
# Render the classification report with rich. The function is imported under
# an alias on purpose: `from rich import print` would rebind the built-in
# `print` for the rest of the kernel session and silently change the
# behaviour of every later cell that prints.
from rich import print as rich_print

rich_print(report)