In [74]:
import pandas as pd
import numpy as np

import openai
from gensim.summarization import summarize

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import os

In [75]:
# Load the seed-video CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../data/seed/youtube/seed_videos_7.csv')

In [76]:
# Discard rows without a cleaned transcript, then renumber the index from 0
# so later positional lookups such as df['title'][i] line up.
df = df.dropna(subset = ['cleaned_transcript'])
df = df.reset_index(drop = True)

In [78]:
# Replace every remaining missing value with an empty string so the
# string helpers below never receive NaN.
df = df.fillna(value = "")

In [79]:
def applyTfidf(corpus, n):
    """Return the ``n`` highest-weighted terms of ``corpus`` under TF-IDF.

    A fresh ``TfidfVectorizer`` is fitted on ``corpus`` as a single document.
    NOTE: with only one document every term has the same document frequency,
    so the ranking is effectively by (normalised) term frequency.

    Args:
        corpus: The text to analyse, as one string.
        n: Maximum number of terms to return.

    Returns:
        A list of at most ``n`` terms, ordered from highest to lowest weight.
        The list is empty when ``corpus`` is blank or yields no vocabulary.
    """
    # Blank text has nothing to rank.
    if not corpus or not corpus.strip():
        return []

    # Convert the text into a sparse matrix using TF-IDF
    vectorizer = TfidfVectorizer()
    try:
        tfidf = vectorizer.fit_transform([corpus])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when
        # tokenisation keeps no terms, e.g. text made only of one-character
        # tokens or punctuation. Treat that as "no features" so one odd row
        # does not abort a whole DataFrame.apply run.
        return []

    # Get the feature names and scores.
    # The row is densified so position i holds the weight of feature i.
    # The sparse `.data` buffer follows `.indices` order, which is not
    # guaranteed to match the order of the feature names.
    feature_names = vectorizer.get_feature_names_out()
    weights = tfidf.toarray().ravel()
    scores = dict(zip(feature_names, weights))

    # Sort the scores in descending order
    sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)

    # Select only the top N features
    return [term for term, _ in sorted_scores[:n]]

In [88]:
def topTags(tags, n = 10):
    """Reduce a stringified tag list to its ``n`` top-ranked terms.

    Args:
        tags: The video's tags as text. Presumably the ``str()`` form of a
            list such as ``"['stocks', 'crypto']"``; surrounding brackets and
            per-tag quotes are optional.
        n: Maximum number of terms to keep (default 10).

    Returns:
        The terms chosen by ``applyTfidf`` joined with single spaces, or ``""``
        when the input holds no usable tag text.
    """
    # Remove only brackets that are really there. Slicing off the first and
    # last character unconditionally would eat real text from input that has
    # no brackets.
    inner = tags.strip().strip('[]')

    # Drop the quotes (single or double) that wrap each serialised tag.
    cleaned = [tag.strip().strip("'\"").strip() for tag in inner.split(',')]
    corpus = " ".join(tag for tag in cleaned if tag)

    # Nothing left after cleaning (empty list, blank tags): skip TF-IDF.
    if not corpus:
        return ""

    return " ".join(applyTfidf(corpus, n))

In [89]:
def condenseTranscript(transcript, summary = False, tfidf = False, word_count = 250, n = 50):
    """Clean a transcript and optionally shorten it.

    Cleaning turns newlines into spaces, removes "- " / " - " dash markers and
    apostrophes, and puts a space after every period.

    Args:
        transcript: Raw transcript text.
        summary: When true, shorten with gensim's ``summarize``. Takes
            precedence over ``tfidf``.
        tfidf: When true (and ``summary`` is false), replace the text with its
            top ``n`` TF-IDF terms followed by a period.
        word_count: Word budget. Text with fewer words is returned unshortened;
            it is also passed to ``summarize`` as the target length.
        n: Number of terms kept in ``tfidf`` mode.

    Returns:
        The cleaned transcript, shortened if requested and long enough.
    """
    corpus = (
        transcript
        .replace("\n", " ")
        .replace(" - ", "")
        .replace("- ", "")
        .replace("\'", "")
        .replace(".", ". ")
    )

    if not (summary or tfidf):
        return corpus

    # Measure length in words: `word_count` is a word budget, so comparing it
    # with the character count would send very short texts to be shortened.
    if len(corpus.split()) < word_count:
        return corpus

    if summary:
        # Count only fragments that contain text. A trailing period leaves an
        # empty fragment after split("."), which must not count as a second
        # sentence. gensim's summarize rejects single-sentence input.
        sentence_count = sum(1 for part in corpus.split(".") if part.strip())
        if sentence_count <= 1:
            return corpus
        return summarize(corpus, word_count = word_count)

    # tfidf mode: text without any period is returned as is.
    if len(corpus.split(".")) <= 1:
        return corpus
    return " ".join(applyTfidf(corpus, n)) + "."

In [90]:
def createVideoSnippet(title, transcript, tags):
    """Join a video's title, transcript and tags into one snippet string.

    Newlines in the transcript become spaces. The result has the shape
    ``"<title>. <transcript> <tags>."``.
    """
    flattened = transcript.replace("\n", " ")
    pieces = [title, '. ', flattened, ' ', tags, "."]
    return "".join(pieces)

In [91]:
def createPrompt(videos):
    """Assemble the classification prompt for a batch of video snippets.

    The prompt opens with the label definitions, then lists every snippet in
    its own paragraph numbered from 1, and closes with the output-format
    instructions and an example.
    """
    header = "These labels determine if a video is discussing or recommending the following investments:\n1. Alternative: Cryptocurrency, Blockchain, NFTs\n2. Traditional: Stocks, Bonds, Real Estate, Commodities\n3. Mixed: Discusses at least one of each topic from alternative and traditional labels\n4. None: Not related to investing or finance\n\nClassify these YouTube video snippets, with each snippet containing the title, transcript, tags."
    footer = "\n\nFor each video return probabilities for all of the labels and explain the most probable label.\nExample Output:\nAlternative: 0.15 Traditional: 0.15 Mixed: .1 None: .6 None because lorem ipsum"

    numbered = [
        '\n\n{} '.format(position) + snippet
        for position, snippet in enumerate(videos, start = 1)
    ]

    return header + "".join(numbered) + footer

In [92]:
# Initialize the API client.
# The key is read from the OPENAI_API_KEY environment variable so it is never
# stored in the notebook; it is None when the variable is unset.
openai.api_key = os.getenv("OPENAI_API_KEY")

In [93]:
# Function to make classification requests
def classify(prompt, model_engine = "text-davinci-002", max_tokens = 1024, n = 1, temperature = 0.5):
    """Send ``prompt`` to the OpenAI completion endpoint and return the first choice.

    Args:
        prompt: Full prompt text to complete.
        model_engine: Engine name passed to the API as ``engine``.
        max_tokens: Upper bound on the number of generated tokens.
        n: Number of completions requested. Only the first one is returned.
        temperature: Sampling temperature passed to the API.

    Returns:
        The first element of the response's ``choices``.
    """
    request = dict(
        engine=model_engine,
        prompt=prompt,
        max_tokens=max_tokens,
        n=n,
        stop=None,
        temperature=temperature,
    )
    response = openai.Completion.create(**request)
    first_choice = response.choices[0]
    return first_choice

In [96]:
# Build one snippet per video: title, summarised transcript, top tag terms.
df['snippet'] = df.apply(
    lambda row: createVideoSnippet(
        row.title,
        condenseTranscript(row.cleaned_transcript, summary = True),
        topTags(row.tags),
    ),
    axis = 1,
)

In [108]:
# Inspect the cleaned transcript of the row labelled 1.
df.loc[1, 'cleaned_transcript']

'- Investing, when most people think of it,. they imagine day trading, stock tickers,. and people screaming at each other.. But apart from what you may have heard,. investing doesn\'t need to be complicated,. overwhelming, or even boring.. The basics are actually quite simple.. And once you get them down,. you could make millions of\ndollars in your lifetime.. With the help of my friend\nand personal finance expert,. Ramit Sethi, we\'re gonna\nditch the generalities. and get into the specifics\ncovering how to start investing,. what accounts you should open up,. and all the things that you\nshould and shouldn\'t be doing. when it comes to investing.. - This is a critical mistake people make.. And it will cost you hundreds\nof thousands of dollars.. - This video is sponsored by Squarespace.. More on them later.. My early twenties weren\'t exactly. my most productive financial years.. I remember counting\npennies in line at Wendy\'s. to get their $1 chicken sandwich.. And if you\'re in a

In [109]:
# Inspect the generated snippet of the row labelled 1.
df.loc[1, 'snippet']

'I asked a personal finance expert how to invest.. Investing, when most people think of it,. investing doesnt need to be complicated,. and get into the specifics covering how to start investing,. to even start thinking about investing,. .  Before you open up your first investment account,. to start investing in the stock market. you need to start investing as soon as you possibly can. .  Lets say you invest $5,000 a year from age 25 to 65. I knew that I wanted to start investing,. from the minute that you decide to start investing. that you should start investing in,. Well, you actually need to invest your money,. think about investing in the stock market,. "Ramit, why would I invest in the stock market?. ".  If youre getting 60% returns per year,. . So, instead of investing your money into one company,. you need to diversify by investing. .  While there is always a risk when it comes to investing,. .  You simply pick the year that youre gonna retire. .  Basically, the year youre gonna

In [None]:
# Preview the prompt for the first five snippets.
# It is built from df['snippet'] directly so this cell does not depend on a
# `videos` list left over from another cell (which fails on a fresh kernel).
sample_videos = df['snippet'].head(5).tolist()
prompt = createPrompt(sample_videos)
print(prompt)

In [None]:
# Classify the preview prompt with the default engine and settings.
prediction = classify(prompt = prompt)

In [None]:
# Show the text of the completion stored in `prediction`.
print(prediction.text)

In [None]:
# Group the videos into prompts of at most `batch_size` snippets each.
batch_size = 5
prompts = []
videos = []

for i in range(len(df)):
    title = df['title'][i]
    tags = df['tags'][i]
    transcript = df['cleaned_transcript'][i]

    # Emit the full batch BEFORE clearing it. Clearing first would build
    # every prompt from an empty list.
    if len(videos) >= batch_size:
        prompt = createPrompt(videos)
        prompts.append(prompt)
        videos = []

    # Keyword arguments keep the transcript and the tags in their own slots.
    videos.append(
        createVideoSnippet(
            title = title,
            transcript = condenseTranscript(transcript, summary = True),
            tags = topTags(tags),
        )
    )

# Emit the last, possibly partial, batch. Skip it when there is nothing left
# so no prompt without snippets is sent to the API.
if videos:
    prompt = createPrompt(videos)
    prompts.append(prompt)

In [None]:
# Show the first batched prompt as a sanity check before calling the API.
print(prompts[0])

In [None]:
# Classify every batched prompt, one API request per prompt.
# Results are appended one at a time so those already received survive if a
# later request raises.
predictions = []

for batch_prompt in prompts:
    result = classify(batch_prompt)
    predictions.append(result)

In [None]:
# Show the completion text returned for the first batched prompt.
print(predictions[0].text)