In [2]:
!pip install praw mlflow kserve tenacity



In [20]:
import kfp
import os
from kfp.v2.dsl import importer, Metrics
from kfp.dsl import Input, component
from kfp.dsl import OutputPath, pipeline
from kfp import dsl
from kfp.dsl import Dataset, Output


# Kubeflow Pipelines client, created with no explicit arguments.
# It is used further down in the notebook to submit pipeline runs.
client = kfp.Client()

# Downloading Data
----

This component downloads data from Reddit and saves it to the datastore for downstream tasks.

It is run daily to collect that day's top submissions, up to the per-subreddit limit.

In [4]:

@component(
    base_image="python:3.11",
    packages_to_install=["praw"]
)
def download_dataset(dataset_path: OutputPath('Dataset')) -> None:
    """Download the day's top Reddit submissions and write one JSON file per post.

    For each subreddit in a fixed list, the top submissions of the day
    (at most 10 per subreddit) are fetched. For every submission the
    highest-scoring comment that still has an author is looked up, and a
    JSON document with the keys ``id``, ``title``, ``url``, ``text``,
    ``top_comment`` and ``comment_score`` is written to
    ``<dataset_path>/<submission id>.json``.

    Reddit credentials are read from the environment variables
    ``REDDIT_CLIENT_ID``, ``REDDIT_CLIENT_PW``, ``REDDIT_USER`` and
    ``REDDIT_PW``.

    Args:
        dataset_path: Directory (created if missing) that receives the JSON files.
    """
    import json
    import os
    from pathlib import Path

    import praw

    reddit = praw.Reddit(
        client_id=os.getenv('REDDIT_CLIENT_ID'),
        client_secret=os.getenv('REDDIT_CLIENT_PW'),
        # NOTE(review): this looks like a placeholder user agent - confirm
        # whether a real, descriptive value should be supplied.
        user_agent="YOUR_USER_AGENT",
        # Optional: For authenticated requests
        username=os.getenv('REDDIT_USER'),
        password=os.getenv('REDDIT_PW')
    )
    subreddit_names = [
        "funny",
        "AskReddit",
        "gaming",
        "worldnews",
        "todayilearned",
        "Music",
        "aww",
        "movies",
        "memes",
        "science"
    ]
    data_path = Path(dataset_path)
    # parents=True so the component does not fail when intermediate
    # directories of the output path do not exist yet.
    data_path.mkdir(parents=True, exist_ok=True)

    for subreddit_name in subreddit_names:
        print(subreddit_name)
        subreddit = reddit.subreddit(subreddit_name)
        for submission in subreddit.top(time_filter="day", limit=10):
            most_upvoted_comment = None
            # Starting at -1 means only comments with a score of 0 or more
            # can be selected; ties keep the first comment encountered.
            highest_score = -1

            for comment in submission.comments.list():
                # Skip entries without an author (e.g. deleted comments).
                if not getattr(comment, "author", None):
                    continue
                if comment.score > highest_score:
                    highest_score = comment.score
                    most_upvoted_comment = comment

            if most_upvoted_comment is not None:
                comment_text = most_upvoted_comment.body.strip()
                comment_score = most_upvoted_comment.score
            else:
                comment_text = ""
                comment_score = 0

            # Avoid shadowing the builtin ``id``.
            submission_id = submission.id
            # Log the actual submission id (previously the literal text "id" was printed).
            print(f"\t{submission_id}")
            submission_data = {
                "id": submission_id,
                "title": submission.title,
                "url": submission.url,
                "text": submission.selftext_html,
                "top_comment": comment_text,
                "comment_score": comment_score
            }
            # The file is only written, so plain "w" is sufficient.
            with (data_path / f"{submission_id}.json").open("w") as f:
                json.dump(submission_data, f)


In [56]:
ISVC_NAME = "reddit-model"
MLFLOW_RUN_NAME = "reddit_models"
MLFLOW_MODEL_NAME = "reddit-model"

# mlflow_tracking_uri = os.getenv('MLFLOW_TRACKING_URI')
# mlflow_s3_endpoint_url = os.getenv('MLFLOW_S3_ENDPOINT_URL')
# aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
# aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')

# Reddit API credentials, read from the environment of the notebook process.
client_id = os.getenv('REDDIT_CLIENT_ID')
client_secret = os.getenv('REDDIT_CLIENT_PW')
# Optional: For authenticated requests
username = os.getenv('REDDIT_USER')
password = os.getenv('REDDIT_PW')


@pipeline(name='download-reddit')
def download_preprocess_train_deploy_pipeline():
    """Pipeline with a single step that runs the ``download_dataset`` component.

    The Reddit credentials read above are handed to the task as environment
    variables under the same names the component looks up with ``os.getenv``.
    """
    # NOTE(review): the credential values are passed as literal env values;
    # they presumably end up in the compiled pipeline definition - confirm
    # and consider a secret-based mechanism.
    reddit_env = (
        ('REDDIT_CLIENT_ID', client_id),
        ('REDDIT_CLIENT_PW', client_secret),
        ('REDDIT_USER', username),
        ('REDDIT_PW', password),
    )
    download_task = download_dataset()
    for env_name, env_value in reddit_env:
        download_task = download_task.set_env_variable(name=env_name, value=env_value)
    print(download_task)

## Manually run download task

---

Useful for testing the downloading of data

In [12]:
# run = client.create_run_from_pipeline_func(download_preprocess_train_deploy_pipeline, arguments={}, enable_caching=False)

## Register pipeline

---

Used to schedule the task

In [58]:
# kfp.compiler.Compiler().compile(download_preprocess_train_deploy_pipeline, package_path='pipeline.yaml')
# client.upload_pipeline("pipeline.yaml", 'download-reddit')

# Train Model
----

Train a model on a GPU. The number of epochs, the batch size, and the learning rate are configurable pipeline parameters.

In [22]:

@component(
    base_image="wallies/python-cuda:3.10-cuda11.6-runtime",
    packages_to_install=["ajperry_pipeline"]
)
def train_model(
    input_dataset: Input[Dataset],
    num_epochs: int,
    batch_size: int,
    learning_rate: float,
    metrics: Output[Metrics]
):
    """Train the Transformer on the Reddit dataset and log BCE losses.

    The dataset at ``input_dataset.path`` is split 80/20 into a training and
    a test set. For every epoch the model is optimised with SGD on the
    training batches, evaluated on the test batches without gradients, and
    the mean per-batch losses are logged as ``train_bce`` and ``test_bce``.

    Args:
        input_dataset: Dataset artifact whose ``path`` is passed to ``RedditDataset``.
        num_epochs: Number of passes over the training data.
        batch_size: Batch size used by both data loaders.
        learning_rate: Learning rate of the SGD optimizer.
        metrics: Metrics artifact that receives the logged losses.
    """
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import DataLoader

    from ajperry_pipeline.ml.models import Transformer
    from ajperry_pipeline.ml.data.reddit import RedditDataset

    device = "cuda"

    training_dataset = RedditDataset(
        input_dataset.path,
        is_train=True,
        train_split_perc=0.8
    )
    test_dataset = RedditDataset(
        input_dataset.path,
        is_train=False,
        train_split_perc=0.8
    )
    print(f"Training Samples: {len(training_dataset)}")
    print(f"Testing Samples: {len(test_dataset)}")

    train_dataloader = DataLoader(
        training_dataset,
        batch_size=batch_size,
        shuffle=True
    )
    test_dataloader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False
    )
    transformer = Transformer(
        embedding_model="bert-base-uncased",
        num_heads=2,
        num_encoders=6,
        num_nn_layers=3,
        embedding_size=128,
        device=device,
        max_length=20
    )
    transformer = transformer.to(device)

    # reduction='none' keeps the element-wise loss so parts of it can be
    # zeroed out before averaging.
    criterion = nn.BCEWithLogitsLoss(reduction='none')
    # Built once, after the model is moved to the device, and honouring the
    # ``learning_rate`` argument (it used to be overridden by a fixed 0.01).
    optimizer = optim.SGD(transformer.parameters(), lr=learning_rate)

    def _batch_loss(titles, top_comments):
        """Return the masked mean BCE loss of one (titles, top_comments) batch."""
        output_logits, _outputs, ended = transformer(titles)
        # Tokenize the target comments, truncated/padded to the model's max_length.
        actuals = torch.tensor([
            transformer.tokenizer.encode(
                comment,
                truncation=True,
                padding='max_length',
                max_length=transformer.max_length,
            )
            for comment in top_comments
        ])
        actuals_onehot = torch.nn.functional.one_hot(
            actuals, num_classes=transformer.tokenizer.vocab_size
        )
        logits = torch.stack([torch.stack(logit) for logit in output_logits])
        loss = criterion(logits, actuals_onehot.float().to(device))
        # Zero the loss from index ``e`` onwards for each sample.
        # NOTE(review): ``ended`` presumably holds the position at which each
        # generated sequence ended - confirm against the Transformer model.
        for i, e in enumerate(ended):
            loss[i, e:] = 0.0
        return loss.mean()

    for epoch in range(num_epochs):
        train_losses = []
        test_losses = []

        transformer.train()
        for titles, top_comments in train_dataloader:
            loss = _batch_loss(titles, top_comments)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        transformer.eval()
        with torch.no_grad():
            for titles, top_comments in test_dataloader:
                test_losses.append(_batch_loss(titles, top_comments).item())

        # Guard against empty loaders to avoid a ZeroDivisionError; the same
        # metric keys are logged again on every epoch.
        if train_losses:
            average_train_loss = sum(train_losses) / len(train_losses)
            # Previously referenced an undefined name (``average_loss``).
            metrics.log_metric('train_bce', average_train_loss)
        else:
            print(f"Epoch {epoch}: no training batches, 'train_bce' not logged")
        if test_losses:
            average_test_loss = sum(test_losses) / len(test_losses)
            metrics.log_metric('test_bce', average_test_loss)
        else:
            print(f"Epoch {epoch}: no test batches, 'test_bce' not logged")

@dsl.pipeline(name='model-train-pipeline')
def train_model_pipeline(
    dataset_uri: str,
    num_epochs: int,
    batch_size: int,
    learning_rate: float,
):
    """Import an existing dataset artifact and train the model on it.

    Args:
        dataset_uri: URI of the dataset artifact to import.
        num_epochs: Forwarded to ``train_model``.
        batch_size: Forwarded to ``train_model``.
        learning_rate: Forwarded to ``train_model``.
    """
    import os

    # Import the dataset artifact from the specified URI.
    dataset_import = importer(
        artifact_uri=dataset_uri,
        artifact_class=Dataset,
        reimport=False,  # Set to True if you want a new ML Metadata entry
    )

    training_task = train_model(
        input_dataset=dataset_import.outputs['artifact'],
        num_epochs=num_epochs,
        batch_size=batch_size,
        learning_rate=learning_rate,
    )
    # Pass the HF_TOKEN2 value from the notebook's environment to the task
    # and request one GPU for it.
    training_task.set_env_variable(name='HF_TOKEN2', value=os.getenv('HF_TOKEN2'))
    training_task.set_gpu_limit(1)


## Manually run training task

---


In [24]:
# Submit a one-off training run against a previously downloaded dataset
# artifact; caching is disabled for this run.
run = client.create_run_from_pipeline_func(
    train_model_pipeline,
    arguments=dict(
        dataset_uri="minio://mlpipeline/v2/artifacts/download-reddit/c1d12a96-5baf-410d-9e95-83333f005439/download-dataset/0dcdadc5-eb72-49ee-8e65-e4e7547eae86/dataset_path",
        num_epochs=100,
        batch_size=1,
        learning_rate=0.01,
    ),
    enable_caching=False,
)