In [1]:
import pandas as pd
import numpy as np

import openai
from gensim.summarization import summarize

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import os

In [2]:
df = pd.read_csv('../data/seed/youtube/seed_videos_1.csv')

In [11]:
df[df['cleaned_transcript'].isna()]

Unnamed: 0,id,title,description,tags,cleaned_transcript,raw_transcript,comments,comment_ids,label
15,VJgHkAqohbU,Stock Options Explained,"Correction: At 4:20, the graph in the top left...","['The Plain bagel', 'Stock Options', 'Call Opt...",,{},['Happy Friday everyone! \n\nHave you ever use...,"['UgzbCGpzmxw-EjhoMJJ4AaABAg', 'UgzbCGpzmxw-Ej...",traditional
16,ZCFkWDdmXG8,Explained | The Stock Market | FULL EPISODE | ...,"In partnership with Vox Media Studios and Vox,...","['education', 'netflix education', 'documentar...",,{},['A lot of folks have been going on about a Ja...,"['UgwYEeybOLKxrHtKoWN4AaABAg', 'UgwYEeybOLKxrH...",traditional
42,yubzJw0uiE4,,,[],,{},[],[],blockchain
46,NNQLJcJEzv0,,,[],,{},[],[],blockchain
53,1YyAzVmP9xQ,,,[],,{},[],[],blockchain


In [5]:
def applyTfidf(corpus, n):
    """Apply TF-IDF"""
    # Should I just make one of these?
    # Convert the text into a sparse matrix using TF-IDF
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform([corpus])
    
    # Get the feature names and scores
    feature_names = vectorizer.get_feature_names_out()
    scores = dict(zip(feature_names, tfidf.data))
    
    # Sort the scores in descending order
    sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)

    # Select only the top N features
    selected_features = [x[0] for x in sorted_scores[:n]]
    
    return selected_features

In [6]:
def topTags(tags, n = 10):
    """Apply TF-IDF to the tags to get the ten most relevant tags"""
    corpus = " ".join([tag.strip().strip("\'") for tag in tags[1:-1].strip('[').strip(']').split(',')])
    
    selected_features = applyTfidf(corpus, n)
    
    return " ".join(selected_features)

In [17]:
def condenseTranscript(transcript, summary = False, tfidf = False, word_count = 100, n = 50):
    """Cleans and condenses transcript"""
    print(transcript)
    corpus = transcript.replace("\n", " ").replace(" - ", "").replace('- ', "").replace("\'", "").replace(".", ". ")
    if summary:
        if len(corpus) < word_count:
            return corpus
        elif len(corpus.split(".")) <= 1:
            return corpus
        return summarize(corpus, word_count = word_count)
    elif tfidf:
        if len(corpus) < word_count:
            return corpus
        elif len(corpus.split(".")) <= 1:
            return corpus
        return " ".join(applyTfidf(corpus, n)) + "."
    else:
        return corpus

In [18]:
def createVideoSnippet(title, transcript, tags):
    """Creates the video snippet"""
    return title + '. ' + transcript.replace("\n", " ") + ' ' + tags + "."

In [19]:
def createPrompt(videos):
    """Create prompt given video snippet list"""
#     prompt="These labels determine if a video is discussing or recommending the following investments:\nAlternative: Cryptocurrency, Blockchain, NFTs\nTraditional: Stocks, Bonds, Real Estate, Commodities\nMixed: Discusses at least one of each topic from alternative and traditional labels defined above.\nNone: Not related to investing or finance\n\nClassify these YouTube video snippets, with each snippet containing the title, transcript, and video tags.\n\nFor each video return probabilities for all of the labels and explain the most probable label.\n\nExample Output:\nAlternative: 0.15 Traditional 0.15 Mixed .1 None .6 None because lorem ipsum."
#     prompt="These labels determine if a video is discussing or recommending the following investments:\nAlternative: Cryptocurrency, Blockchain, NFTs\nTraditional: Stocks, Bonds, Real Estate, Commodities\nMixed: Must at least one of each topic from alternative and traditional labels defined above.\nNone: Doesn't discuss the topics above or related topics.\n\nFor each video return probabilities for all of the labels and explain the most probable label.\nExample Output:\nAlternative: 0.15 Traditional 0.15 Mixed .1 None .6 None because lorem ipsum.\n\nClassify each YouTube video snippet below, with each snippet containing the title, transcript, and video tags."
    prompt="These labels determine if a video is discussing or recommending the following investments:\n1. Alternative: Cryptocurrency, Blockchain, NFTs\n2. Traditional: Stocks, Bonds, Real Estate, Commodities\n3. Mixed: Discusses at least one of each topic from alternative and traditional labels\n4. None: Not related to investing or finance\n\nClassify these YouTube video snippets, with each snippet containing the title, transcript, tags."
    for i in range(len(videos)):
        prompt += '\n\n{} '.format(i+1) + videos[i]
    prompt += "\n\nFor each video return probabilities for all of the labels and explain the most probable label.\nExample Output:\nAlternative: 0.15 Traditional: 0.15 Mixed: .1 None: .6 None because lorem ipsum"
    
    return prompt

In [20]:
# Initialize the API client
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [21]:
# Function to make classification requests
def classify(prompt, model_engine = "text-davinci-002", max_tokens = 1024, n = 1, temperature = 0.5):
    completions = openai.Completion.create(
        engine=model_engine,
        prompt=prompt,
        max_tokens=max_tokens,
        n=n,
        stop=None,
        temperature=temperature,
    )
    return completions.choices[0]

In [22]:
df['cleaned_transcript']

0      Right, so let's say\nyou want to get started. ...
1      - Investing, when most people think of it,. th...
2      If you're saving for retirement, IRAs, also\nk...
3      One of the most common ways people save for\nr...
4      are you one of the 60 million americans. activ...
                             ...                        
96     bitcoin is worthless artificial gold. which if...
97     well bitcoin as a the computer science. behind...
98     - [Narrator] These are just\na few of the tech...
99     There are many problems in\nthe modern housing...
100    this video was made in partnership with. the d...
Name: cleaned_transcript, Length: 101, dtype: object

In [23]:
df['video_summary'] = df['cleaned_transcript'].apply(lambda x: condenseTranscript(x, summary = True))

Right, so let's say
you want to get started. with this investing thing.. You might have a bit of money saved.. It's probably not enough for a house,. but you reckon I should probably
invest this in something.. Maybe you've heard on the news about. Tesla or Netflix or Amazon and how,. if you'd invested 10 years ago in Tesla. then you'd be a millionaire
by now or things like that.. But if you're new to the game,. this whole investment thing. can seem like a really
complicated black box.. Like, how do you even buy a stock?. What even is a stock?. Do you just go on tesla.com
and buy some Tesla,. like, how does it work? (chuckles). And if you try and look into this,. you get all these acronyms
being thrown around. like Roth IRAs and 401Ks in America. or like ISAs or LISAs in the UK.. And on top of that, there is the anxiety. that we all have that I
know investing is risky. and I don't want to
lose all that my money.. So in light of all of that,
this is the ultimate guide. on how to get star

AttributeError: 'float' object has no attribute 'replace'

In [None]:
videos = []

In [None]:
videos.append(createVideoSnippet(df['title'][0], condenseTranscript(df['cleaned_transcript'][0], summary = True), topTags(df['tags'][0])))
# videos.append(createVideoSnippet(df['title'][0], condenseTranscript(df['cleaned_transcript'][0], tfidf = True), topTags(df['tags'][0])))

In [None]:
prompt = createPrompt(videos)
print(prompt)

In [None]:
prediction = classify(prompt)

In [None]:
print(prediction.text)

In [None]:
prompts = []
videos = []

for i in range(len(df)):
    title = df['title'][i]
    tags = df['tags'][i]
    transcript = df['cleaned_transcript'][i]
    
    if len(videos) >= 5:
        videos = []
        prompt = createPrompt(videos)
        prompts.append(prompt)
        
    videos.append(createVideoSnippet(title, topTags(tags), condenseTranscript(transcript, summary = True)))
    
prompt = createPrompt(videos)
prompts.append(prompt)

In [None]:
print(prompts[0])

In [None]:
predictions = []

for prompt in prompts:
    predictions.append(classify(prompt))

In [None]:
print(predictions[0].text)