In [2]:
import pandas as pd
import numpy as np

import openai
from gensim.summarization import summarize

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import os

In [4]:
df = pd.read_csv('../data/seed/youtube/test_videos_3.csv')

In [5]:
df.head()

Unnamed: 0,id,title,description,tags,cleaned_transcript,raw_transcript,comments,comment_ids,label
0,WFI6Bg-zmv4,How TikTok Could Become a U.S. Company | WSJ,"TikTok is at a crossroads, as U.S. concerns ab...","['tik tok china', 'tiktok china', 'tiktok owne...",- [Narrator] TikTok is at a crossroads.. Conce...,[{'text': '- [Narrator] TikTok is at a crossro...,['You can complete you steal that the America ...,"['Ugzx8SnYK28ySk4faHx4AaABAg', 'Ugyay5JsgkocDM...",none
1,dQw4w9WgXcQ,Rick Astley - Never Gonna Give You Up (Officia...,The official video for “Never Gonna Give You U...,"['rick astley', 'Never Gonna Give You Up', 'ng...",[Music]. you know the rules. [Music]. gotta ma...,"[{'text': '[Music]', 'start': 0.0, 'duration':...",['1 BILLION views for Never Gonna Give You Up!...,"['UgzarqjaaPC7TbFINNx4AaABAg', 'UgzarqjaaPC7Tb...",none
2,EKIhc3tS0os,The Millionaire Investing Advice For Teenagers,For anyone who wanted a video about how to inv...,"['how to invest', 'how to invest in stocks', '...",what's up you guys it's Graham here so I. have...,"[{'text': ""what's up you guys it's Graham here...","[""I only went to college because I needed a de...","['UgzmSWPHqXz1p7jbNuZ4AaABAg', 'Ugwu-PbdD_35Vy...",traditional
3,-Hbu2nKVJR0,How To Invest In Cryptocurrency For Beginners ...,Here is the beginners guide to investing in cr...,"['investing', 'investing for beginners', 'inve...",what's up graham it's guys here so we. gotta h...,"[{'text': ""what's up graham it's guys here so ...",['You know you could work for 40years to save ...,"['Ugw6yqHUHfqRhoTnLp54AaABAg', 'Ugxq9FRLKcKEPt...",blockchain
4,d4h8dYSgHMw,Revealing My Entire $500K Investment Portfolio,Revealing What's in my Entire Portfolio includ...,"['humphreytalks', 'how to invest', 'passive in...",the market just had its worst calendar. year s...,[{'text': 'the market just had its worst calen...,['👑Don’t forget to get 6-12 FREE Stocks when y...,"['Ugyitj1ZF9aWrKtmCK94AaABAg', 'Ugyitj1ZF9aWrK...",mixed


In [289]:
def applyTfidf(corpus, n):
    """Apply TF-IDF"""
    # Should I just make one of these?
    # Convert the text into a sparse matrix using TF-IDF
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform([corpus])
    
    # Get the feature names and scores
    feature_names = vectorizer.get_feature_names_out()
    scores = dict(zip(feature_names, tfidf.data))
    
    # Sort the scores in descending order
    sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)

    # Select only the top N features
    selected_features = [x[0] for x in sorted_scores[:n]]
    
    return selected_features

In [290]:
def topTags(tags, n = 10):
    """Apply TF-IDF to the tags to get the ten most relevant tags"""
    corpus = " ".join([tag.strip().strip("\'") for tag in tags[1:-1].strip('[').strip(']').split(',')])
    
    selected_features = applyTfidf(corpus, n)
    
    return " ".join(selected_features)

In [291]:
def condenseTranscript(transcript, summary = False, tfidf = False, word_count = 100, n = 50):
    """Cleans and condenses transcript"""
    corpus = transcript.replace("\n", " ").replace(" - ", "").replace('- ', "").replace("\'", "").replace(".", ". ")
    if summary:
        if len(corpus) < word_count:
            return corpus
        elif len(corpus.split(".")) <= 1:
            return corpus
        return summarize(corpus, word_count = word_count)
    elif tfidf:
        if len(corpus) < word_count:
            return corpus
        elif len(corpus.split(".")) <= 1:
            return corpus
        return " ".join(applyTfidf(corpus, n)) + "."
    else:
        return corpus

In [431]:
def createVideoSnippet(title, transcript, tags):
    """Creates the video snippet"""
    return title + '. ' + transcript.replace("\n", " ") + ' ' + tags + "."

In [432]:
def createPrompt(videos):
    """Create prompt given video snippet list"""
#     prompt="These labels determine if a video is discussing or recommending the following investments:\nAlternative: Cryptocurrency, Blockchain, NFTs\nTraditional: Stocks, Bonds, Real Estate, Commodities\nMixed: Discusses at least one of each topic from alternative and traditional labels defined above.\nNone: Not related to investing or finance\n\nClassify these YouTube video snippets, with each snippet containing the title, transcript, and video tags.\n\nFor each video return probabilities for all of the labels and explain the most probable label.\n\nExample Output:\nAlternative: 0.15 Traditional 0.15 Mixed .1 None .6 None because lorem ipsum."
#     prompt="These labels determine if a video is discussing or recommending the following investments:\nAlternative: Cryptocurrency, Blockchain, NFTs\nTraditional: Stocks, Bonds, Real Estate, Commodities\nMixed: Must at least one of each topic from alternative and traditional labels defined above.\nNone: Doesn't discuss the topics above or related topics.\n\nFor each video return probabilities for all of the labels and explain the most probable label.\nExample Output:\nAlternative: 0.15 Traditional 0.15 Mixed .1 None .6 None because lorem ipsum.\n\nClassify each YouTube video snippet below, with each snippet containing the title, transcript, and video tags."
    prompt="These labels determine if a video is discussing or recommending the following investments:\n1. Alternative: Cryptocurrency, Blockchain, NFTs\n2. Traditional: Stocks, Bonds, Real Estate, Commodities\n3. Mixed: Discusses at least one of each topic from alternative and traditional labels\n4. None: Not related to investing or finance\n\nClassify these YouTube video snippets, with each snippet containing the title, transcript, tags."
    for i in range(len(videos)):
        prompt += '\n\n{} '.format(i+1) + videos[i]
    prompt += "\n\nFor each video return probabilities for all of the labels and explain the most probable label.\nExample Output:\nAlternative: 0.15 Traditional: 0.15 Mixed: .1 None: .6 None because lorem ipsum"
    
    return prompt

In [433]:
# Initialize the API client
openai.api_key = os.environ.get("OPEN_AI_KEY")

In [434]:
# Function to make classification requests
def classify(prompt, model_engine = "text-davinci-002", max_tokens = 1024, n = 1, temperature = 0.5):
    completions = openai.Completion.create(
        engine=model_engine,
        prompt=prompt,
        max_tokens=max_tokens,
        n=n,
        stop=None,
        temperature=temperature,
    )
    return completions.choices[0]

In [435]:
videos = []

In [436]:
videos.append(createVideoSnippet(df['title'][0], condenseTranscript(df['cleaned_transcript'][0], summary = True), topTags(df['tags'][0])))
# videos.append(createVideoSnippet(df['title'][0], condenseTranscript(df['cleaned_transcript'][0], tfidf = True), topTags(df['tags'][0])))

In [437]:
prompt = createPrompt(videos)
print(prompt)

These labels determine if a video is discussing or recommending the following investments:
1. Alternative: Cryptocurrency, Blockchain, NFTs
2. Traditional: Stocks, Bonds, Real Estate, Commodities
3. Mixed: Discusses at least one of each topic from alternative and traditional labels
4. None: Not related to investing or finance

Classify these YouTube video snippets, with each snippet containing the title, transcript, tags.

1 How TikTok Could Become a U.S. Company | WSJ. The first concern is that the Chinese government could order TikTok to hand over data about individual American users. The second big thing is the video recommendation algorithm, and the idea is that the Chinese government could order TikTok to control what videos Americans could watch. [Narrator] A divestiture order is what happened to the dating app Grindr in 2019, after US officials determined that the apps personal data could potentially be exploited for blackmail, since the majority stakeholder was a Chinese compan

In [426]:
prediction = classify(prompt)

In [427]:
print(prediction.text)



1. Alternative: 0.15 Traditional: 0.15 Mixed: .1 None: .6

The most probable label for this video is "None" because it is not related to investing or finance.


In [428]:
prompts = []
videos = []

for i in range(len(df)):
    title = df['title'][i]
    tags = df['tags'][i]
    transcript = df['cleaned_transcript'][i]
    
    if len(videos) >= 5:
        videos = []
        prompt = createPrompt(videos)
        prompts.append(prompt)
        
    videos.append(createVideoSnippet(title, topTags(tags), condenseTranscript(transcript, summary = True)))
    
prompt = createPrompt(videos)
prompts.append(prompt)

In [429]:
print(prompts[0])

These labels determine if a video is discussing or recommending the following investments:
1. Alternative: Cryptocurrency, Blockchain, NFTs
2. Traditional: Stocks, Bonds, Real Estate, Commodities
3. Mixed: Discusses at least one of each topic from alternative and traditional labels
4. None: Not related to investing or finance

Classify these YouTube video snippets, with each snippet containing the title, transcript, tags.

1 How TikTok Could Become a U.S. Company | WSJ. tiktok tok media ownership deal bytedance data social divestiture in The first concern is that the Chinese government could order TikTok to hand over data about individual American users.
The second big thing is the video recommendation algorithm, and the idea is that the Chinese government could order TikTok to control what videos Americans could watch.
[Narrator] A divestiture order is what happened to the dating app Grindr in 2019, after US officials determined that the apps personal data could potentially be exploit

In [376]:
predictions = []

for prompt in prompts:
    predictions.append(classify(prompt))

In [377]:
print(predictions[0].text)



1. Alternative: 0.15 Traditional: 0.15 Mixed: .1 None: .6
The most probable label for this video is "None" because it does not discuss any investments.

2. Alternative: 0.2 Traditional: 0.1 Mixed: 0.3 None: 0.4
The most probable label for this video is "None" because it does not discuss any investments.
