In [106]:
import pandas as pd
import numpy as np

import openai
import os
from tqdm import tqdm
import time

import sys

sys.path.insert(0, '..')

from src.features.youtubeCleaner import Cleaner

In [172]:
def create_binary_blockchain_prompt(snippet):
    """Builds the Yes/No prompt asking whether a video is about blockchain investments.

    Parameters
    ----------
    snippet : str
        The title, summarized transcript, and tags of a video concatenated together

    Returns
    -------
    str
        The few-shot instructions, followed by the snippet collapsed onto a
        single line, followed by a trailing "Answer:" cue for the model
    """

    prompt = (
        "I am a YouTube video classifier. Provide me with a video snippet "
        "(title + summarized transcript + tags) and I will analyze if the video "
        "recommends or teaches about blockchain investments(bitcoin, NFTs, "
        "Ethereum, etc). I respond only with Yes and No.\n\n"
        "Examples:\n"
        "Snippet: Invest in Index Funds. You should invest in index funds. stocks investing\n"
        "Answer: No\n\n"
        "Snippet: Buy Crypto. You should invest in bitcoin. crypto invest\n"
        "Answer: Yes\n\n"
        "Here is the actual task:\n"
        "Snippet: "
    )

    # split() with no arguments breaks on every run of whitespace (newlines,
    # tabs, repeated spaces) and drops leading/trailing whitespace, so joining
    # with single spaces puts the snippet on one clean line. A single
    # replace("  ", " ") pass would leave doubled spaces behind whenever three
    # or more whitespace characters appear in a row.
    prompt += " ".join(snippet.split()) + "\nAnswer:"

    return prompt
        

In [219]:
def create_binary_traditional_prompt(snippet):
    """Builds the Yes/No prompt asking whether a video is about traditional investments.

    Parameters
    ----------
    snippet : str
        The title, summarized transcript, and tags of a video concatenated together

    Returns
    -------
    str
        The few-shot instructions, followed by the snippet collapsed onto a
        single line, followed by a trailing "Answer:" cue for the model
    """

    # NOTE: the doubled space after "bonds," is part of the prompt text as it
    # was originally written; it is kept verbatim so the prompt is unchanged.
    prompt = (
        "I am a YouTube video classifier. Provide me with a video snippet "
        "(title + summarized transcript + tags) and I will analyze if the video "
        "recommends or teaches about traditional investments "
        "(stocks, bonds,  commodities, real estate, etc). "
        "I respond only with Yes and No.\n\n"
        "Examples:\n"
        "Snippet: Invest in Index Funds. You should invest in index funds. stocks investing\n"
        "Answer: Yes\n\n"
        "Snippet: Buy Crypto. You should invest in bitcoin. crypto invest\n"
        "Answer: No\n\n"
        "Here is the actual task:\n"
        "Snippet: "
    )

    # split() with no arguments breaks on every run of whitespace (newlines,
    # tabs, repeated spaces) and drops leading/trailing whitespace, so joining
    # with single spaces puts the snippet on one clean line. A single
    # replace("  ", " ") pass would leave doubled spaces behind whenever three
    # or more whitespace characters appear in a row.
    prompt += " ".join(snippet.split()) + "\nAnswer:"

    return prompt

In [168]:
def create_prompt(snippet):
    """Builds the four-label (Blockchain / Traditional / Mixed / None) prompt.

    Parameters
    ----------
    snippet : str
        The title, summarized transcript, and tags of a video concatenated together

    Returns
    -------
    str
        The label definitions and few-shot examples, followed by the snippet
        collapsed onto a single line, followed by a trailing "Answer:" cue
    """

    prompt = (
        "I am a YouTube video classifier that takes in video snippets "
        "(title + shortened transcript + tags) and outputs one of the following "
        "labels if the video recommends or teaches about:\n\n"
        "1. Blockchain: Cryptocurrency, NFTs, or anything related to the blockchain\n"
        "2. Traditional: Stocks, Bonds, Real Estate, Commodities\n"
        "3. Mixed: Both blockchain and traditional investments\n"
        "4. None: Not related to the labels above.\n\n"
        "Examples:\n"
        "Snippet: Invest in Index Funds. You should invest in index funds. stocks investing\n"
        "Answer: Traditional\n\n"
        "Snippet: Buy Crypto. Bitcoin is the best crypto to invest in. crypto invest\n"
        "Answer: Blockchain\n\n"
        "Snippet: Investing Tips: Balance your portfolio with stocks and bitcoin. investing finance\n"
        "Answer: Mixed\n\n"
        "Snippet: Cute Cat TikToks: Here are some of the cutest cat TikToks. tiktok cats\n"
        "Answer: None\n\n"
        "Snippet: "
    )

    # Clean off any additional new lines we don't need. split() with no
    # arguments breaks on every run of whitespace, so re-joining with single
    # spaces fully normalizes the snippet; one replace("  ", " ") pass would
    # leave doubled spaces for runs of three or more whitespace characters.
    prompt += " ".join(snippet.split()) + "\nAnswer:"

    return prompt

In [87]:
# Function to make classification requests
def classify(prompt, model_engine="text-davinci-003", max_tokens=1024, n=1, temperature=0.25):
    """Sends a prompt to an OpenAI completion engine and returns the raw response.

    Parameters
    ----------
    prompt : str
        The prompt we want to feed to our model
    model_engine : str
        The engine to query. Default text-davinci-003
    max_tokens : int
        Value forwarded as the request's ``max_tokens`` (the API's cap on
        generated tokens). Default 1024
    n : int
        Number of completions to generate. Default 1
    temperature : float
        Sampling temperature. Lower values make the output more
        deterministic. Default 0.25

    Returns
    -------
    OpenAIObject text_completion
        The completion object exactly as returned by the API
    """
    # Collect the request arguments first so the payload is easy to inspect.
    request = {
        "engine": model_engine,
        "prompt": prompt,
        "max_tokens": max_tokens,
        "n": n,
        "stop": None,  # no custom stop sequence
        "temperature": temperature,
    }
    return openai.Completion.create(**request)

In [88]:
def extract_prediction(completion):
    """Pulls the predicted label out of a completion object.

    Parameters
    ----------
    completion : OpenAIObject text_completion
        Our completion

    Returns
    -------
    str
        Text of the first choice with surrounding whitespace stripped
    """
    # Only the first choice is read, even if several completions were requested.
    first_choice = completion.choices[0]
    label = first_choice.text
    return label.strip()

In [89]:
def token_cost(completion, model_cost):
    """Reports token usage for a completion together with its cost.

    Parameters
    ----------
    completion : OpenAIObject text_completion
        Our completion
    model_cost : float
        Cost per token for selected model engine

    Returns
    -------
    dict
        A copy of ``completion.usage`` with an added 'cost' entry equal to
        ``total_tokens * model_cost``
    """
    # dict(...) copies the usage mapping so the completion itself is untouched.
    usage = dict(completion.usage)
    return {**usage, 'cost': usage['total_tokens'] * model_cost}

In [90]:
def format_output(completion, model_cost):
    """Combines the token/cost report and the predicted label into one record.

    Parameters
    ----------
    completion : OpenAIObject text_completion
        Our completion
    model_cost : float
        Cost per token for selected model engine

    Returns
    -------
    dict
        Everything returned by ``token_cost`` plus a 'label' entry holding
        the result of ``extract_prediction``
    """
    return {
        **token_cost(completion, model_cost),
        'label': extract_prediction(completion),
    }

In [91]:
# Per-token price used for cost reporting: 0.02 per 1,000 tokens
# (presumably USD for text-davinci-003 — confirm against current pricing).
DAVINCI_COST = 0.0200 / 1000

In [92]:
# Load the seed videos. Later cells read the 'snippet', 'title' and 'label' columns.
df = pd.read_csv('../data/seed/youtube/seed_videos.csv')

In [93]:
# Initialize the API client
# The key comes from the OPENAI_API_KEY environment variable. .get() returns
# None when the variable is unset, so a missing key is not reported here.
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [185]:
# Smoke test: classify only the first snippet before running the whole dataset.
# The prompt builder is named create_prompt; no createPrompt is defined in
# this notebook, so the old spelling raised NameError on a fresh kernel.
completion_one = classify(create_prompt(df['snippet'][0]))

In [186]:
# Show token usage, cost and predicted label for the single test completion.
format_output(completion_one, DAVINCI_COST)

{'prompt_tokens': 562,
 'completion_tokens': 1,
 'total_tokens': 563,
 'cost': 0.011260000000000001,
 'label': 'Traditional'}

OpenAI rate limits
- 20 requests per minute (RPM)
- 150,000 tokens per minute (TPM)

In [169]:
def classify_snippets(df, delay=3.25):
    """Classifies every row of ``df`` with the four-label prompt.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a 'title' column (shown in the progress bar) and a
        'snippet' column (passed to ``create_prompt``)
    delay : float
        Seconds to pause after each request. Default 3.25, which keeps the
        loop under the limit of 20 requests per minute (60 / 20 = 3 s)

    Returns
    -------
    tuple of (list of dict, list of str)
        The ``format_output`` record for every row, and the predicted labels
        in the same order
    """

    outputs = []

    # df.iterrows() is a generator with no length, so tqdm needs an explicit
    # total to draw a real progress bar and ETA instead of a bare counter.
    pbar = tqdm(df.iterrows(), total=len(df))

    for _, entry in pbar:
        pbar.set_description("Processing %s" % entry['title'])
        completion = classify(create_prompt(entry['snippet']))
        outputs.append(format_output(completion, DAVINCI_COST))

        time.sleep(delay)  # We are limited to 20 requests per minute

    predictions = [output['label'] for output in outputs]

    return outputs, predictions

In [139]:
# Run the four-label classifier over every seed video (one API request per row).
outputs, predictions = classify_snippets(df)

Processing NFTs, Explained: : 102it [06:36,  3.89s/it]                                                                                    


In [148]:
# One row per video: the token counts, cost and predicted label from format_output.
token_usage = pd.DataFrame(outputs)

In [152]:
# Total cost and total tokens consumed by the four-label run.
token_usage['cost'].sum(), token_usage['total_tokens'].sum()

(1.2229400000000004, 61147)

In [156]:
# Integer codes for the ground-truth labels stored in df['label'].
mapping = {'traditional' : 0, 'blockchain': 1, 'mixed': 2, 'unrelated': 3}

In [158]:
# Encode the ground truth; raises KeyError for any label missing from `mapping`.
actual = df['label'].apply(lambda x: mapping[x])

In [160]:
# Same integer codes, keyed by the capitalized labels used in the prompt.
# This rebinds `mapping`, so `actual` must already have been computed.
mapping = {'Traditional' : 0, 'Blockchain': 1, 'Mixed': 2, 'None': 3}

In [162]:
# Encode the model's labels; raises KeyError if a response is not one of the
# four keys in `mapping`. Rebinds `predictions` (previously the raw label list).
predictions = token_usage['label'].apply(lambda x: mapping[x])

In [180]:
from sklearn.metrics import accuracy_score, classification_report

In [166]:
# Overall accuracy of the four-label classifier against the seed labels.
accuracy_score(actual, predictions)

0.5686274509803921

In [182]:
# Per-class precision/recall (0 = traditional, 1 = blockchain, 2 = mixed, 3 = unrelated/None).
print(classification_report(actual, predictions))

              precision    recall  f1-score   support

           0       0.92      0.55      0.69        40
           1       1.00      0.57      0.73        42
           2       0.14      1.00      0.24         5
           3       0.41      0.47      0.44        15

    accuracy                           0.57       102
   macro avg       0.62      0.65      0.52       102
weighted avg       0.84      0.57      0.65       102



In [167]:
# Persist the four-label results.
# NOTE(review): this writes to the notebook's working directory, whereas the
# binary results are written under ../data/external/ — confirm the intended location.
token_usage.to_csv('baseline_classifier_results.csv')

In [187]:
def classify_binary_blockchain_snippets(df, delay=5):
    """Classifies every row of ``df`` with the blockchain Yes/No prompt.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a 'title' column (shown in the progress bar) and a
        'snippet' column (passed to ``create_binary_blockchain_prompt``)
    delay : float
        Seconds to pause after each request. Default 5, which keeps the loop
        under the limit of 20 requests per minute

    Returns
    -------
    tuple of (list of dict, list of str)
        The ``format_output`` record for every row, and the predicted labels
        in the same order
    """

    outputs = []

    # df.iterrows() is a generator with no length, so tqdm needs an explicit
    # total to draw a real progress bar and ETA instead of a bare counter.
    pbar = tqdm(df.iterrows(), total=len(df))

    for _, entry in pbar:
        pbar.set_description("Processing %s" % entry['title'])
        completion = classify(create_binary_blockchain_prompt(entry['snippet']))
        outputs.append(format_output(completion, DAVINCI_COST))

        time.sleep(delay)  # We are limited to 20 requests per minute

    predictions = [output['label'] for output in outputs]

    return outputs, predictions

In [188]:
def classify_binary_traditional_snippets(df, delay=5):
    """Classifies every row of ``df`` with the traditional-investment Yes/No prompt.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a 'title' column (shown in the progress bar) and a
        'snippet' column (passed to ``create_binary_traditional_prompt``)
    delay : float
        Seconds to pause after each request. Default 5, which keeps the loop
        under the limit of 20 requests per minute

    Returns
    -------
    tuple of (list of dict, list of str)
        The ``format_output`` record for every row, and the predicted labels
        in the same order
    """

    outputs = []

    # df.iterrows() is a generator with no length, so tqdm needs an explicit
    # total to draw a real progress bar and ETA instead of a bare counter.
    pbar = tqdm(df.iterrows(), total=len(df))

    for _, entry in pbar:
        pbar.set_description("Processing %s" % entry['title'])
        completion = classify(create_binary_traditional_prompt(entry['snippet']))
        outputs.append(format_output(completion, DAVINCI_COST))

        time.sleep(delay)  # We are limited to 20 requests per minute

    predictions = [output['label'] for output in outputs]

    return outputs, predictions

In [189]:
# Run the blockchain Yes/No classifier over every seed video (one API request per row).
blockchain_outputs, blockchain_predictions = classify_binary_blockchain_snippets(df)

Processing NFTs, Explained: : 102it [09:19,  5.48s/it]                                                                                    


In [232]:
# Run the traditional-investment Yes/No classifier over every seed video (one API request per row).
traditional_outputs, traditional_predictions = classify_binary_traditional_snippets(df)

Processing NFTs, Explained: : 102it [09:34,  5.63s/it]                                                                                    


In [233]:
# One row per video: token counts, cost and Yes/No label from the blockchain run.
blockchain_results = pd.DataFrame(blockchain_outputs)

In [234]:
# Total cost of the blockchain run.
blockchain_results['cost'].sum()

1.03322

In [235]:
# One row per video: token counts, cost and Yes/No label from the traditional run.
traditional_results = pd.DataFrame(traditional_outputs)

In [236]:
# Total cost of the traditional run.
traditional_results['cost'].sum()

1.0372999999999999

In [237]:
# Persist the traditional Yes/No results (the DataFrame index is written as the first column).
traditional_results.to_csv("../data/external/baseline_binary_traditional_results.csv")

In [238]:
# Persist the blockchain Yes/No results (the DataFrame index is written as the first column).
blockchain_results.to_csv("../data/external/baseline_binary_blockchain_results.csv")

In [257]:
# Encode the traditional classifier's answer as 1 for "Yes", 0 otherwise.
# Only the exact string 'Yes' counts; any other response is treated as No.
traditional_preds = np.array(traditional_results['label'].apply(lambda x : 1 if x == 'Yes' else 0))

In [258]:
# Encode the blockchain classifier's answer as 2 for "Yes", 0 otherwise, so it
# can be added to the 0/1 traditional encoding without collisions.
# Only the exact string 'Yes' counts; any other response is treated as No.
blockchain_preds = np.array(blockchain_results['label'].apply(lambda x : 2 if x == 'Yes' else 0))

In [263]:
# Re-encode the ground truth so the codes line up with the sum of the two
# binary encodings (traditional = 1, blockchain = 2, both = 3, neither = 0).
# This rebinds `mapping` and `actual` from the four-label evaluation.
mapping = {'unrelated' : 0, 'traditional': 1, 'blockchain': 2, 'mixed': 3}
actual = np.array(df['label'].apply(lambda x: mapping[x]))

In [264]:
# Element-wise sum of the two binary encodings:
# 0 = neither, 1 = traditional only, 2 = blockchain only, 3 = both (mixed).
preds = traditional_preds + blockchain_preds

In [265]:
# Overall accuracy of the combined binary classifiers against the seed labels.
accuracy_score(actual, preds)

0.7647058823529411

In [266]:
# Per-class precision/recall (0 = unrelated, 1 = traditional, 2 = blockchain, 3 = mixed).
print(classification_report(actual, preds))

              precision    recall  f1-score   support

           0       0.50      0.60      0.55        15
           1       0.85      0.97      0.91        40
           2       0.93      0.67      0.78        42
           3       0.25      0.40      0.31         5

    accuracy                           0.76       102
   macro avg       0.63      0.66      0.63       102
weighted avg       0.80      0.76      0.77       102

