In [1]:
import pandas as pd
import numpy as np

import openai
from gensim.summarization import summarize

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import os

ModuleNotFoundError: No module named 'gensim.summarization'

In [None]:
df = pd.read_csv('../data/seed/youtube/seed_videos_1.csv')

In [None]:
df.head()

In [None]:
def applyTfidf(corpus, n):
    """Apply TF-IDF"""
    # Should I just make one of these?
    # Convert the text into a sparse matrix using TF-IDF
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform([corpus])
    
    # Get the feature names and scores
    feature_names = vectorizer.get_feature_names_out()
    scores = dict(zip(feature_names, tfidf.data))
    
    # Sort the scores in descending order
    sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)

    # Select only the top N features
    selected_features = [x[0] for x in sorted_scores[:n]]
    
    return selected_features

In [None]:
def topTags(tags, n = 10):
    """Apply TF-IDF to the tags to get the ten most relevant tags"""
    corpus = " ".join([tag.strip().strip("\'") for tag in tags[1:-1].strip('[').strip(']').split(',')])
    
    selected_features = applyTfidf(corpus, n)
    
    return " ".join(selected_features)

In [None]:
def condenseTranscript(transcript, summary = False, tfidf = False, word_count = 100, n = 50):
    """Cleans and condenses transcript"""
    corpus = transcript.replace("\n", " ").replace(" - ", "").replace('- ', "").replace("\'", "").replace(".", ". ")
    if summary:
        if len(corpus) < word_count:
            return corpus
        elif len(corpus.split(".")) <= 1:
            return corpus
        return summarize(corpus, word_count = word_count)
    elif tfidf:
        if len(corpus) < word_count:
            return corpus
        elif len(corpus.split(".")) <= 1:
            return corpus
        return " ".join(applyTfidf(corpus, n)) + "."
    else:
        return corpus

In [None]:
def createVideoSnippet(title, transcript, tags):
    """Creates the video snippet"""
    return title + '. ' + transcript.replace("\n", " ") + ' ' + tags + "."

In [None]:
def createPrompt(videos):
    """Create prompt given video snippet list"""
#     prompt="These labels determine if a video is discussing or recommending the following investments:\nAlternative: Cryptocurrency, Blockchain, NFTs\nTraditional: Stocks, Bonds, Real Estate, Commodities\nMixed: Discusses at least one of each topic from alternative and traditional labels defined above.\nNone: Not related to investing or finance\n\nClassify these YouTube video snippets, with each snippet containing the title, transcript, and video tags.\n\nFor each video return probabilities for all of the labels and explain the most probable label.\n\nExample Output:\nAlternative: 0.15 Traditional 0.15 Mixed .1 None .6 None because lorem ipsum."
#     prompt="These labels determine if a video is discussing or recommending the following investments:\nAlternative: Cryptocurrency, Blockchain, NFTs\nTraditional: Stocks, Bonds, Real Estate, Commodities\nMixed: Must at least one of each topic from alternative and traditional labels defined above.\nNone: Doesn't discuss the topics above or related topics.\n\nFor each video return probabilities for all of the labels and explain the most probable label.\nExample Output:\nAlternative: 0.15 Traditional 0.15 Mixed .1 None .6 None because lorem ipsum.\n\nClassify each YouTube video snippet below, with each snippet containing the title, transcript, and video tags."
    prompt="These labels determine if a video is discussing or recommending the following investments:\n1. Alternative: Cryptocurrency, Blockchain, NFTs\n2. Traditional: Stocks, Bonds, Real Estate, Commodities\n3. Mixed: Discusses at least one of each topic from alternative and traditional labels\n4. None: Not related to investing or finance\n\nClassify these YouTube video snippets, with each snippet containing the title, transcript, tags."
    for i in range(len(videos)):
        prompt += '\n\n{} '.format(i+1) + videos[i]
    prompt += "\n\nFor each video return probabilities for all of the labels and explain the most probable label.\nExample Output:\nAlternative: 0.15 Traditional: 0.15 Mixed: .1 None: .6 None because lorem ipsum"
    
    return prompt

In [None]:
# Initialize the API client
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
# Function to make classification requests
def classify(prompt, model_engine = "text-davinci-002", max_tokens = 1024, n = 1, temperature = 0.5):
    completions = openai.Completion.create(
        engine=model_engine,
        prompt=prompt,
        max_tokens=max_tokens,
        n=n,
        stop=None,
        temperature=temperature,
    )
    return completions.choices[0]

In [None]:
videos = []

In [None]:
videos.append(createVideoSnippet(df['title'][0], condenseTranscript(df['cleaned_transcript'][0], summary = True), topTags(df['tags'][0])))
# videos.append(createVideoSnippet(df['title'][0], condenseTranscript(df['cleaned_transcript'][0], tfidf = True), topTags(df['tags'][0])))

In [None]:
prompt = createPrompt(videos)
print(prompt)

In [None]:
prediction = classify(prompt)

In [None]:
print(prediction.text)

In [None]:
prompts = []
videos = []

for i in range(len(df)):
    title = df['title'][i]
    tags = df['tags'][i]
    transcript = df['cleaned_transcript'][i]
    
    if len(videos) >= 5:
        videos = []
        prompt = createPrompt(videos)
        prompts.append(prompt)
        
    videos.append(createVideoSnippet(title, topTags(tags), condenseTranscript(transcript, summary = True)))
    
prompt = createPrompt(videos)
prompts.append(prompt)

In [None]:
print(prompts[0])

In [None]:
predictions = []

for prompt in prompts:
    predictions.append(classify(prompt))

In [None]:
print(predictions[0].text)