In [2]:
import pandas as pd
import numpy as np
import random

import openai
import os
from tqdm import tqdm
import time

import sys

from sklearn.metrics import accuracy_score, classification_report

sys.path.insert(0, '..')

In [2]:
# Price of a single token, written as 0.002 per 1,000 tokens.
# NOTE(review): presumably USD at the gpt-3.5-turbo list price — confirm before reusing.
TURBO_COST = 0.002 / 1_000

In [3]:
# Read the OpenAI credential from the environment rather than hard-coding it.
# os.getenv returns None when OPENAI_API_KEY is unset, so a missing key is
# not reported by this cell.
openai.api_key = os.getenv("OPENAI_API_KEY")

In [4]:
# Load the three seed CSVs, in the order: finance, young, old.
finance, young, old = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/seed/youtube/seed_videos.csv',
        '../data/seed/youtube/young_videos.csv',
        '../data/seed/youtube/old_videos.csv',
    )
)

In [6]:
# Stack the three seed frames row-wise. Each frame keeps its own index values,
# so the combined index may contain duplicates.
df = pd.concat([finance, young, old], axis=0)

In [12]:
# Collapse the 'old', 'young' and 'None' labels into the single 'Unrelated'
# class; every other label is left as it is.
df['label'] = df['label'].mask(df['label'].isin(['old', 'young', 'None']), 'Unrelated')

In [8]:
def create_messages(snippet):
    """Build the chat prompt used to classify one YouTube video snippet.

    The conversation consists of a system message describing the classifier,
    a user message with the label definitions and instructions, a canned
    assistant acknowledgement, and finally the snippet as a user message.

    Parameters
    ----------
    snippet : str
        Snippet text for one video. Newlines are turned into spaces and every
        run of consecutive spaces is collapsed to a single space.

    Returns
    -------
    list of dict
        Four ``{"role": ..., "content": ...}`` messages in the order above.
    """
    # NOTE(review): the stray quote in 'format" {Label}' below looks like a typo
    # for 'format:'. It is left untouched so the prompt text stays the same.
    messages = [
        {"role": "system", "content" : "You are a classifier that determines if a YouTube video snippet falls under a label. A snippet is a concatenation of the video title, summarized transcript, and video tags. The labels and additional instructions will be included in the first user message."},
        {"role": "user", "content" : """Labels:

Traditional: Videos that recommend or educate about stocks, bonds, real estate, commodities, retirement accounts, or other traditional investments or keywords related to them.
Blockchain: Videos that recommend or educate about cryptocurrency (BTC, ETH, etc.), NFTs, or other Web3 investments or keywords related to them.
Mixed: Videos that recommend or educate about both blockchain and traditional investments or keywords related to both.
Unrelated: Videos that do not recommend or educate about either blockchain or traditional investments or keywords related to them.

Instructions:
- The classifier should consider the context and meaning of the keywords used to determine whether the snippet is related to traditional or blockchain investments.
- If talks about making money from jobs, side hustles, or other alternative assets (cars, watches, artificial intelligence, trading cards, art, etc), they are Unrelated.
- A video that is only downplaying an investment or discussing it negatively should be classified as Unrelated.
- Please return predictions in the format" {Label} : {20 word or shorter rationale}"""},
        {"role": "assistant", "content": """Understood. I will classify YouTube video snippets based on the provided labels and instructions. Here's how I will format the predictions:

{Label} : {20-word or shorter rationale}

Please provide me with the YouTube video snippet you would like me to classify."""},
    ]

    # Flatten the snippet onto one line. A single replace("  ", " ") pass only
    # shortens runs of spaces (three spaces become two), so repeat until no
    # double space is left.
    flattened = snippet.replace("\n", " ")
    while "  " in flattened:
        flattened = flattened.replace("  ", " ")

    messages.append({"role" : "user", "content" : flattened})

    return messages

In [14]:
def classify(messages, temp = 0.25, model = "gpt-3.5-turbo"):
    """Send a chat conversation to the OpenAI chat-completion endpoint.

    Parameters
    ----------
    messages : list of dict
        Chat messages, e.g. the list returned by ``create_messages``.
    temp : float, optional
        Sampling temperature forwarded to the API (default 0.25).
    model : str, optional
        Name of the chat model to call. Defaults to ``"gpt-3.5-turbo"``,
        which was previously hard-coded.

    Returns
    -------
    The response object returned by ``openai.ChatCompletion.create``,
    unmodified. Any error raised by the client propagates to the caller.
    """
    return openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temp,
    )

In [16]:
# Build one chat prompt per row from that row's snippet text.
df['messages'] = df['snippet'].map(create_messages)

In [18]:
def _split_reply(content):
    """Split a reply shaped like "{Label} : {rationale}" into (label, rationale).

    Only the first colon separates the two parts, so a rationale that itself
    contains colons is kept whole. A reply with no colon at all yields the
    stripped reply as the label and an empty rationale instead of raising.
    """
    label, _, rationale = content.partition(':')
    return label.strip(), rationale.strip()


completions = []

# iterrows() has no length, so pass total= to get a real progress bar.
pbar = tqdm(df.iterrows(), total=len(df))

for idx, entry in pbar:
    pbar.set_description("Processing %s" % entry['title'])

    # TODO: errors raised by classify() still propagate and stop the run.
    body = classify(entry['messages'])

    # Start from the assistant message itself (its 'role' and 'content').
    completion = dict(body['choices'][0]['message'])
    completion['prediction'], completion['reason'] = _split_reply(completion['content'])
    completion['message'] = entry['messages']
    completion['title'] = entry['title']
    completion['transcript'] = entry['cleaned_transcript']
    completion['snippet'] = entry['snippet']
    completion['link'] = entry['link']

    ## Grab meta data
    completion.update({key: body[key] for key in ['created', 'id', 'model', 'object']})

    ## Grab token usage
    completion.update(dict(body['usage']))

    completions.append(completion)

    # The whole CSV is rewritten after every reply — presumably as a checkpoint
    # so an interrupted run keeps the rows classified so far.
    pd.DataFrame(completions).to_csv('seed_predictions.csv', index_label = False)

    # Pause 5 seconds between requests (presumably to stay under API rate limits).
    time.sleep(5)

Processing How To Stay Healthy Until Your Old Age: : 140it [15:30,  6.65s/it]                                                             


In [19]:
# Reload the predictions CSV from disk.
predictions = pd.read_csv(filepath_or_buffer='seed_predictions.csv')

In [34]:
# Number of rows per predicted label.
predictions.loc[:, 'prediction'].value_counts()

Unrelated      50
Traditional    43
Blockchain     35
Mixed          12
Name: prediction, dtype: int64

In [28]:
# Labels from df, used as the reference to compare predictions against.
# Assumes df and predictions hold the same rows in the same order — verify.
actual = df['label'].tolist()
# A prediction that parsed as the literal word 'Label' is counted as 'Blockchain'.
preds = [
    'Blockchain' if raw == 'Label' else raw
    for raw in predictions['prediction'].tolist()
]
# Total tokens across all rows multiplied by the per-token price.
sum(predictions['total_tokens']) * TURBO_COST

0.208982

In [31]:
# Let pandas render up to 500 rows before truncating displayed frames.
pd.options.display.max_rows = 500

In [29]:
# Overall accuracy first, then the per-class precision/recall/F1 table.
print(accuracy_score(y_true=actual, y_pred=preds))
print(classification_report(y_true=actual, y_pred=preds))

0.9071428571428571
              precision    recall  f1-score   support

  Blockchain       1.00      0.88      0.93        40
       Mixed       0.42      1.00      0.59         5
 Traditional       0.91      0.97      0.94        40
   Unrelated       0.96      0.87      0.91        55

    accuracy                           0.91       140
   macro avg       0.82      0.93      0.84       140
weighted avg       0.94      0.91      0.92       140



In [84]:
# Sample of a malformed reply (a leading "Label" word) used to try out the clean-up below.
pred = "Label :Blockchain bruh"

In [86]:
# Reduce a raw reply to its bare, lower-cased label token.
# str.strip('label') removes any of the characters l/a/b/e from both ends, not
# the word "label", so 'blockchain ...' used to lose its leading 'bl'. Drop the
# literal prefix explicitly instead.
pred = pred.lower().strip()
if pred.startswith('label'):
    pred = pred[len('label'):]
# Trim separators around the label, keep the first word, and drop a colon that
# is glued to it (as in 'blockchain: ...').
pred = pred.strip(' :').split(' ')[0].rstrip(':')

In [87]:
# Upper-case the first character only. Slicing with [:1] instead of indexing
# with [0] keeps an empty string empty rather than raising IndexError.
pred = pred[:1].upper() + pred[1:]

In [88]:
# Bare trailing expression: shows the current value of pred as the cell output.
pred

'Blockchain'

In [3]:

# NOTE(review): this rebinds `df`, which earlier cells use for the seed videos;
# re-running those cells after this one would read the audit data instead.
df = pd.read_csv(filepath_or_buffer="../data/audit/youtube/processed/snippets/downloaded_sidebar_mixed_old_with_snippets.csv")

In [12]:
# Show the prompt built for the row at position 545.
create_messages(df['snippet'].iloc[545])

[{'role': 'system',
  'content': 'You are a classifier that determines if a YouTube video snippet falls under a label. A snippet is a concatenation of the video title, summarized transcript, and video tags. The labels and additional instructions will be included in the first user message.'},
 {'role': 'user',
  'content': 'Labels:\n\nTraditional: Videos that recommend or educate about stocks, bonds, real estate, commodities, retirement accounts, or other traditional investments or keywords related to them.\nBlockchain: Videos that recommend or educate about cryptocurrency (BTC, ETH, etc.), NFTs, or other Web3 investments or keywords related to them.\nMixed: Videos that recommend or educate about both blockchain and traditional investments or keywords related to both.\nUnrelated: Videos that do not recommend or educate about either blockchain or traditional investments or keywords related to them.\n\nInstructions:\n- The classifier should consider the context and meaning of the keywords

In [15]:
# Classify the row at position 545 and show the raw API response.
classify(messages=create_messages(df['snippet'].iloc[545]))

<OpenAIObject chat.completion id=chatcmpl-6sFOX8YKozXDvwqKq3UFBI3dvrCMT at 0x7fa9950ba2c0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "I'm sorry, but the provided input is not a valid YouTube video snippet. Please provide a valid YouTube video snippet for me to classify.",
        "role": "assistant"
      }
    }
  ],
  "created": 1678386909,
  "id": "chatcmpl-6sFOX8YKozXDvwqKq3UFBI3dvrCMT",
  "model": "gpt-3.5-turbo-0301",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 30,
    "prompt_tokens": 346,
    "total_tokens": 376
  }
}