### Transcribe Audio from Youtube into a .txt file

In [8]:
from pytube import YouTube
from pydub import AudioSegment
from openai import OpenAI
import os
import math
import re

client = OpenAI(api_key="")

# Download the YouTube video
video_url = "https://www.youtube.com/watch?v=T_Ak5xU3RAs"
youtube = YouTube(video_url)
audio_stream = youtube.streams.filter(only_audio=True).first()
audio_file = "audio.mp4"
audio_stream.download(filename=audio_file)

# Get the video title and replace blank spaces with underscores
video_title = youtube.title
output_file = re.sub(r'\s+', '_', video_title) + ".txt"

# Convert the audio to WAV format
audio = AudioSegment.from_file(audio_file, format="mp4")
wav_file = "audio.wav"
audio.export(wav_file, format="wav")

# Set the chunk size (in seconds)
chunk_size = 60 # 1 minute

# Calculate the number of chunks
total_duration = len(audio)
num_chunks = math.ceil(total_duration / (chunk_size * 1000))

print(f"Total duration: {total_duration} milliseconds")
print(f"Number of chunks: {num_chunks}")

# Create a list to store the transcriptions
transcriptions = []

# Process each chunk
for i in range(num_chunks):
    start_time = i * chunk_size * 1000
    end_time = min((i + 1) * chunk_size * 1000, total_duration)

    print(f"\nProcessing chunk {i+1}/{num_chunks}")
    print(f"Start time: {start_time} milliseconds")
    print(f"End time: {end_time} milliseconds")

    # Extract the chunk
    chunk = audio[start_time:end_time]

    # Export the chunk to a temporary WAV file
    chunk_file = f"chunk_{i}.wav"
    chunk.export(chunk_file, format="wav")

    print(f"Exported chunk to {chunk_file}")

    # Transcribe the chunk using OpenAI's Whisper API
    with open(chunk_file, "rb") as audio_file:
        print("Transcribing chunk...")
        transcription = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            response_format="text"
        )
        transcriptions.append(transcription) # Append the transcription directly

    print("Transcription completed.")

    # Clean up the temporary chunk file
    os.remove(chunk_file)
    print(f"Temporary chunk file {chunk_file} removed.")

# Join the transcriptions
full_transcription = "\n".join(transcriptions)

# Save the full transcription to a .txt file with the video title
with open(output_file, "w") as file:
    file.write(full_transcription)

print(f"Full transcription saved to {output_file}")

# Print the full transcription
print("\nFull Transcription:")
print(full_transcription)

# Clean up the downloaded files
#os.remove(audio_file)
#print(f"Downloaded audio file {audio_file} removed.")
#os.remove(wav_file)
#print(f"Converted WAV file {wav_file} removed.")

Total duration: 2567709 milliseconds
Number of chunks: 43

Processing chunk 1/43
Start time: 0 milliseconds
End time: 60000 milliseconds
Exported chunk to chunk_0.wav
Transcribing chunk...
Transcription completed.
Temporary chunk file chunk_0.wav removed.

Processing chunk 2/43
Start time: 60000 milliseconds
End time: 120000 milliseconds
Exported chunk to chunk_1.wav
Transcribing chunk...
Transcription completed.
Temporary chunk file chunk_1.wav removed.

Processing chunk 3/43
Start time: 120000 milliseconds
End time: 180000 milliseconds
Exported chunk to chunk_2.wav
Transcribing chunk...
Transcription completed.
Temporary chunk file chunk_2.wav removed.

Processing chunk 4/43
Start time: 180000 milliseconds
End time: 240000 milliseconds
Exported chunk to chunk_3.wav
Transcribing chunk...
Transcription completed.
Temporary chunk file chunk_3.wav removed.

Processing chunk 5/43
Start time: 240000 milliseconds
End time: 300000 milliseconds
Exported chunk to chunk_4.wav
Transcribing chunk

### semantic Splitter with llama index and langchain

In [9]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Document
from llama_index.core.node_parser import SentenceSplitter, SemanticSplitterNodeParser
from llama_index.embeddings.openai import OpenAIEmbedding
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage, AIMessage
import os
import pandas as pd
import re

os.environ["OPENAI_API_KEY"] = ""

embed_model = OpenAIEmbedding()

splitter = SemanticSplitterNodeParser(
    buffer_size=1, breakpoint_percentile_threshold=95, embed_model=embed_model
)

# also baseline splitter
base_splitter = SentenceSplitter(chunk_size=512)

# Load documents
documents = SimpleDirectoryReader(input_files=[output_file]).load_data()

# Extract company name, quarter, and year from the title
title = documents[0].text.split("\n")[0]

# Extract company name (assuming it's the first part of the title before '_')
company_name_match = re.search(r'^(.*?)_', title)
company_name = company_name_match.group(1) if company_name_match else "Unknown"

# Extract quarter
quarter_match = re.search(r'(Q\d)', title)
quarter = quarter_match.group(1) if quarter_match else "Unknown"

# Extract year
year_match = re.search(r'(\d{4})', title)
year = year_match.group(1) if year_match else "Unknown"

# Split documents into nodes
nodes = splitter.get_nodes_from_documents(documents)

# Create an empty DataFrame to store the nodes, summaries, and categories
df = pd.DataFrame(columns=["Raw_Text", "Summary", "Category"])

# Initialize the LLM
llm = ChatOpenAI(model_name="gpt-4-turbo")

# Generate summary and category for each node in sequential order
for i, node in enumerate(nodes, start=1):
    summary_query = f"Could you summarize the following text? Return your response which covers the key points and does not miss anything important, please. No need to start with 'The text discusses', etc.\n\n{node.text}"
    summary_result = llm([HumanMessage(content=summary_query)])
    summary = summary_result.content

    category_query = f"Please provide a short category or topic for the following summary:\n\n{summary}"
    category_result = llm([HumanMessage(content=category_query)])
    category = category_result.content

    # Create a new DataFrame for the current node
    node_df = pd.DataFrame({"Raw_Text": [node.text], "Summary": [summary], "Category": [category]})

    # Append the node DataFrame to the main DataFrame
    df = pd.concat([df, node_df], ignore_index=True)

# Print the DataFrame
print(df)

# Add company name, quarter, and year columns to the DataFrame
df["Company"] = 'NFLX'
df["Quarter"] = quarter
df["Year"] = year

# Save the DataFrame to a CSV file
output_file = f"{company_name}_{quarter}_{year}.csv"
df.to_csv(output_file, index=False)
print(f"DataFrame saved to {output_file}")

from docx import Document
from docx.shared import Inches

# Create a new Word document
document = Document()

# Add a table to the document
table = document.add_table(rows=1, cols=len(df.columns))
table.style = 'Table Grid'

# Write the column headers
header_row = table.rows[0]
for i, column_name in enumerate(df.columns):
    header_row.cells[i].text = column_name

# Write the data rows
for _, row in df.iterrows():
    new_row = table.add_row()
    for i, value in enumerate(row):
        new_row.cells[i].text = str(value)

# Save the document
document.save(f"{company_name}_{quarter}_{year}.docx")

                                             Raw_Text  \
0   Good afternoon, and welcome to the Netflix Q3 ...   
1   So, let's start with you, Ted. Now that OneStr...   
2   You discussed at a recent...\n\nconference giv...   
3                                             Great.    
4   Let's move on to page sharing. Have you identi...   
5   I'll, I'll take that one. And I'll start by sa...   
6   quarters. I think also worth noting that that ...   
7   We're also excited about new products. So we'v...   
8   We've got a lot that we've got going on and we...   
9   Go ahead, Spence. All right. You wound it up f...   
10  Thanks, Jessica. So I would say just generally...   
11  And that, I think, is the first step of how we...   
12  But all that said, we'll continue to drive hea...   
13  That's really the strength. And I do think tha...   
14  I imagine that momentum will continue. So your...   
15  at the same time. And the other fun part of it...   
16  right in our talent family.

In [11]:
# Perform tone change detection
df1 = pd.read_csv('NFLX_Q3_2023.csv')
df2 = pd.read_csv('NFLX_Q4_2023.csv')

for category in pd.concat([df1['Category'], df2['Category']]).unique():
    category_df1 = df1[df1['Category'] == category]
    category_df2 = df2[df2['Category'] == category]
    
    if not category_df1.empty and not category_df2.empty:
        summaries1 = category_df1['Summary'].tolist()
        summaries2 = category_df2['Summary'].tolist()
        
        for summary1 in summaries1:
            for summary2 in summaries2:
                tone_query = f"Please analyze the following two summaries and determine if there is a significant change in tone between them:\n\nSummary 1: {summary1}\n\nSummary 2: {summary2}\n\nIs there a significant change in tone? If yes, please describe the change."
                tone_result = llm([HumanMessage(content=tone_query)])
                tone_change = tone_result.content
                
                if "significant change" in tone_change.lower():
                    print(f"Significant tone change detected between summaries in category '{category}':")
                    print(f"Summary 1: {summary1}")
                    print(f"Summary 2: {summary2}")
                    print(f"Tone change: {tone_change}")
                    print("---")

# Concatenate the two DataFrames
combined_df = pd.concat([df1, df2])

# Save the combined DataFrame to a CSV file
output_file = f"{company_name}_{quarter}_{year}.csv"
combined_df.to_csv(output_file, index=False)

print(f"Combined DataFrame saved to {output_file}")

# Create a new Word document
document = Document()

# Add a table to the document
table = document.add_table(rows=1, cols=len(combined_df.columns))
table.style = 'Table Grid'

# Write the column headers
header_row = table.rows[0]
for i, column_name in enumerate(combined_df.columns):
    header_row.cells[i].text = column_name

# Write the data rows
for _, row in combined_df.iterrows():
    new_row = table.add_row()
    for i, value in enumerate(row):
        new_row.cells[i].text = str(value)

# Save the document
document.save("comps.docx")

Combined DataFrame saved to Unknown_Q3_2023.csv


In [14]:
import os
os.environ['OPENAI_API_KEY'] = ''

import faiss
import pandas as pd
from llama_index.core import (
    Document,
    SimpleDirectoryReader,
    load_index_from_storage,
    VectorStoreIndex,
    StorageContext,
)
from llama_index.vector_stores.faiss import FaissVectorStore

# Perform tone change detection
df1 = pd.read_csv('NFLX_Q3_2023.csv')
df2 = pd.read_csv('NFLX_Q4_2023.csv')

# dimensions of text-ada-embedding-002
d = 1536
faiss_index = faiss.IndexFlatL2(d)

dataframes = [df1, df2]

# Convert dataframe rows to Document objects
documents = []
for idx, df in enumerate(dataframes):
    for _, row in df.iterrows():
        doc_text = ' '.join(str(val) for val in row.values)
        document = Document(text=doc_text, doc_id=f"doc_{idx}")
        documents.append(document)

vector_store = FaissVectorStore(faiss_index=faiss_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context
)

index.storage_context.persist()

vector_store = FaissVectorStore.from_persist_dir("./storage")
storage_context = StorageContext.from_defaults(
    vector_store=vector_store, persist_dir="./storage"
)
index = load_index_from_storage(storage_context=storage_context)

query_engine = index.as_query_engine()

for _, row in df1.iterrows():
    category1 = row['Category']
    
    # Find similar categories in df2
    similar_categories_query = f"Find categories in the second dataframe that are similar to the following category: {category1}"
    similar_categories_result = query_engine.query(similar_categories_query)
    similar_categories = similar_categories_result.response.split("\n")
    
    for category2 in similar_categories:
        category_df2 = df2[df2['Category'] == category2]
        
        if not category_df2.empty:
            summaries1 = df1[df1['Category'] == category1]['Summary'].tolist()
            summaries2 = category_df2['Summary'].tolist()
            
            for summary1 in summaries1:
                for summary2 in summaries2:
                    tone_query = f"Please analyze the following two summaries and determine if there is a significant change in tone between them:\n\nSummary 1: {summary1}\n\nSummary 2: {summary2}\n\nIs there a significant change in tone? If yes, please describe the change."
                    tone_result = query_engine.query(tone_query)
                    tone_change = tone_result.response
                    
                    if "significant change" in tone_change.lower():
                        print(f"Significant tone change detected between summaries in similar categories:")
                        print(f"Category 1: {category1}")
                        print(f"Category 2: {category2}")
                        print(f"Summary 1: {summary1}")
                        print(f"Summary 2: {summary2}")
                        print(f"Tone change: {tone_change}")
                        print("---")

Significant tone change detected between summaries in similar categories:
Category 1: Corporate Strategy and Performance Update
Category 2: Business Strategy Focus: Core Growth vs. Diversification
Summary 1: The speaker is very satisfied with the company's recent performance, highlighted by membership growth in Q2 and positive revenue projections for Q4. A key focus has been managing the challenge of paid sharing, balancing consumer needs with ensuring fair compensation for the company's entertainment services. This involved leveraging over a decade of experience in developing product experiences and solving problems iteratively based on consumer feedback. The implementation of the paid sharing model is being carefully phased to manage consumer expectations. The rollout is staged based on technical capabilities, which improve over time, and borrower behavior to optimize the timing of product experiences for better conversion rates. The company plans to continue this approach over the n