In [1]:
import pandas as pd
import numpy as np
import random

import openai
import os
from tqdm import tqdm
import time

import sys

sys.path.insert(0, '..')

In [None]:
# Cost per single token; multiplied by the summed `total_tokens` of the saved
# predictions further down to estimate what the run cost.
# NOTE(review): presumably the gpt-3.5-turbo price of 0.002 per 1,000 tokens --
# confirm against current pricing before trusting the estimate.
TURBO_COST = 0.002 / 1_000

In [None]:
# Configure the OpenAI client from the environment so the key never lives in
# the notebook. os.getenv yields None when OPENAI_API_KEY is unset; nothing is
# raised in this cell in that case.
openai.api_key = os.getenv("OPENAI_API_KEY")

In [None]:
# Seed videos to classify. Later cells read the `snippet`, `title`,
# `cleaned_transcript`, `link` and `label` columns of this frame.
df = pd.read_csv(filepath_or_buffer='../data/seed/youtube/seed_videos.csv')

In [3]:
def create_traditional_messages(snippet):
    """Build the chat prompt asking whether a video snippet is 'Traditional'.

    The conversation is a fixed three-message preamble (system role
    description, a user message carrying the label definition and the
    instructions, and an assistant acknowledgement that shows the answer
    format) followed by the snippet itself as the final user message.

    Parameters
    ----------
    snippet : str
        Video snippet text to classify.

    Returns
    -------
    list of dict
        Four ``{"role": ..., "content": ...}`` messages. The last one holds
        the snippet with every run of whitespace (newlines included)
        collapsed to a single space and leading/trailing whitespace removed.
    """
    messages = [
        {"role": "system", "content" : "You are a binary classifier that determines if a YouTube video snippet falls under a label. A snippet is a concatenation of the video title, summarized transcript, and video tags. The label and additional instructions will be included in the first user message."},
        {"role": "user", "content" : """Label:

A video snippet is considered 'Traditional' if it recommends or educates about stocks, bonds, real estate, commodities, retirement accounts, or other traditional investments or keywords related to them.

A video snippet may talk about making money with traditional investment strategies, but if it recommends a non-traditional investment asset (such as a side hustle, watch, cryptocurrency, etc.), the video is not traditional.

Lastly, if a video mentions a traditional investment strategy, but the strategy is applied to a non-traditional asset, the video is not traditional.

Instructions:
- Predictions must be returned as 'Yes' or 'No', followed by a 20-word or shorter rationale.
- The classifier should consider the context and meaning of the keywords used to determine whether the snippet is considered 'Traditional'."""},
        {"role": "assistant", "content": """Understood. I will classify YouTube video snippets based on the provided label and instructions. Here's how I will format the predictions:

{Yes/No}: {20-word or shorter rationale}.

Please provide me with the YouTube video snippet you would like me to classify."""},
    ]

    # split() with no argument breaks on any whitespace run, so joining with a
    # single space collapses newlines and repeated spaces in one step. A single
    # .replace("  ", " ") pass would leave double spaces behind for runs of
    # three or more spaces or for consecutive newlines.
    snippet_message = {"role": "user", "content": " ".join(snippet.split())}
    messages.append(snippet_message)

    return messages

In [4]:
def create_blockchain_messages(snippet):
    """Build the chat prompt asking whether a video snippet is 'Blockchain'.

    The conversation is a fixed three-message preamble (system role
    description, a user message carrying the label definition and the
    instructions, and an assistant acknowledgement that shows the answer
    format) followed by the snippet itself as the final user message.

    Parameters
    ----------
    snippet : str
        Video snippet text to classify.

    Returns
    -------
    list of dict
        Four ``{"role": ..., "content": ...}`` messages. The last one holds
        the snippet with every run of whitespace (newlines included)
        collapsed to a single space and leading/trailing whitespace removed.
    """
    messages = [
        {"role": "system", "content" : "You are a binary classifier that determines if a YouTube video snippet falls under a label. A snippet is a concatenation of the video title, summarized transcript, and video tags. The label and additional instructions will be included in the first user message."},
        {"role": "user", "content" : """Label:

A video snippet is considered 'Blockchain' if it recommends or educates about cryptocurrency (BTC, ETH, etc.), NFTs, or other Web3 investments or keywords related to them.

A video snippet may talk about making money with blockchain investment strategies, but if it recommends a non-blockchain investment asset (such as a side hustle, watch, stocks, bonds, etc.), the video is not blockchain.

Lastly, if a video mentions a blockchain investment strategy, but the strategy is applied to a non-blockchain asset, the video is not blockchain.

Instructions:
- Predictions must be returned as 'Yes' or 'No', followed by a 20-word or shorter rationale.
- The classifier should consider the context and meaning of the keywords used to determine whether the snippet is considered 'Blockchain'."""},
        {"role": "assistant", "content": """Understood. I will classify YouTube video snippets based on the provided label and instructions. Here's how I will format the predictions:

{Yes/No}: {20-word or shorter rationale}.

Please provide me with the YouTube video snippet you would like me to classify."""},
    ]

    # split() with no argument breaks on any whitespace run, so joining with a
    # single space collapses newlines and repeated spaces in one step. A single
    # .replace("  ", " ") pass would leave double spaces behind for runs of
    # three or more spaces or for consecutive newlines.
    snippet_message = {"role": "user", "content": " ".join(snippet.split())}
    messages.append(snippet_message)

    return messages

In [None]:
def classify(messages, model="gpt-3.5-turbo", temperature=0.25):
    """Send a chat conversation to the OpenAI chat-completion endpoint.

    Parameters
    ----------
    messages : list of dict
        Chat messages in ``{"role": ..., "content": ...}`` form.
    model : str, optional
        Model name passed to the API. Defaults to ``"gpt-3.5-turbo"``.
    temperature : float, optional
        Sampling temperature passed to the API. Defaults to ``0.25``.

    Returns
    -------
    The response object returned by ``openai.ChatCompletion.create``,
    unmodified. Any exception raised by the client propagates to the caller.
    """
    return openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )

In [None]:
# `create_messages` is not defined anywhere in this notebook, so the original
# call fails with NameError on a fresh kernel. Use one of the prompt builders
# that does exist.
# NOTE(review): the 'Blockchain' builder is chosen because the evaluation cell
# remaps predictions to 'Blockchain'; swap in `create_traditional_messages`
# to run the 'Traditional' classifier instead -- confirm which was intended.
df['messages'] = df['snippet'].apply(create_blockchain_messages)

In [None]:
# Classify every row of `df`, collecting one record per API response and
# checkpointing the records to 'seed_predictions.csv' after each request.
completions = []
completion_df = []  # not referenced by any cell shown here; kept in case other cells use it

# iterrows() is a generator with no length, so pass total= explicitly to let
# tqdm render an actual progress bar / ETA.
pbar = tqdm(df.iterrows(), total=len(df))

for idx, entry in pbar:
    pbar.set_description("Processing %s" % entry['title'])

    # Exceptions raised by the API call propagate and stop the loop; rows
    # finished so far are already on disk thanks to the per-row checkpoint.
    body = classify(entry['messages'])

    completion = dict(body['choices'][0]['message'])

    # The prompt asks for "{Yes/No}: {rationale}". partition() splits on the
    # first colon only, so a rationale that itself contains ':' is kept whole,
    # and a reply with no colon yields an empty reason instead of IndexError.
    prediction, _sep, reason = completion['content'].partition(':')
    completion['prediction'] = prediction.strip()
    completion['reason'] = reason.strip()
    completion['message'] = entry['messages']
    completion['title'] = entry['title']
    completion['transcript'] = entry['cleaned_transcript']
    completion['snippet'] = entry['snippet']
    completion['link'] = entry['link']

    ## Grab meta data
    completion.update({key: body[key] for key in ['created', 'id', 'model', 'object']})

    ## Grab token usage
    completion.update(dict(body['usage']))

    completions.append(completion)

    # Rewrite the whole CSV after every row so partial results survive an
    # interruption of this long-running cell.
    pd.DataFrame(completions).to_csv('seed_predictions.csv', index_label = False)

    # Pause between requests -- presumably to stay under the API rate limit.
    time.sleep(5)

In [None]:
# Reload the checkpoint file written by the classification loop so the
# evaluation below works from what is on disk.
predictions = pd.read_csv(filepath_or_buffer='seed_predictions.csv')

In [None]:
# Normalise predicted labels so they can be compared against df['label'].
# NOTE(review): the prompts in this notebook ask the model for 'Yes'/'No',
# yet this remap handles 'Unrelated' and 'Label' -- it looks like it targets
# the output of a different prompt; confirm before trusting the metrics.
label_remap = {'Unrelated': 'None', 'Label': 'Blockchain'}
preds = [label_remap.get(pred, pred) for pred in predictions['prediction']]
actual = list(df['label'])

# Estimated cost of the run: total tokens used times the per-token price.
total_tokens = sum(predictions['total_tokens'])
total_tokens * TURBO_COST

In [None]:
# Print overall accuracy followed by the per-class report.
# NOTE(review): `accuracy_score` and `classification_report` are not imported
# in any cell shown here (presumably they come from sklearn.metrics) -- add
# the import to the imports cell, otherwise this raises NameError.
for metric in (accuracy_score, classification_report):
    print(metric(actual, preds))