# YouTube

## List all videos from a YouTube channel and get their transcripts

In [None]:
import os
import json
import scrapetube
from youtube_transcript_api import YouTubeTranscriptApi
from pprint import pprint

# Candidate channels; only the one passed to scrapetube.get_channel below is scraped.
s4_yt_url = "https://www.youtube.com/@S4Events"
realpars_yt_url = "https://www.youtube.com/@realpars"
ics_village_yt_channel = "https://www.youtube.com/@ICSVillage"
sans_ics_yt_channel = "https://www.youtube.com/@SANSICSSecurity"

channel_videos = scrapetube.get_channel(channel_url=realpars_yt_url)

# One dict per video that has a transcript: {'url', 'title', 'transcript'}.
videos = []
# Number of transcripts retrieved. Initialised once, outside the loop, so it
# really counts across the whole channel.
tr_nb = 0
for channel_video in channel_videos:
    video_id = channel_video['videoId']
    title = channel_video['title']['runs'][0]['text']
    print(title)
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as exc:
        # Best effort: a video without a retrievable transcript is skipped,
        # but say so instead of failing silently. Catching Exception (not a
        # bare except) keeps Ctrl-C / kernel interrupt working.
        print('  no transcript (' + type(exc).__name__ + '), skipped')
        continue
    tr_nb += 1
    videos.append({
        'url': 'http://www.youtube.com/watch?v=' + video_id,
        'title': title,
        # Join the segments with a space so the last word of one segment and
        # the first word of the next do not fuse together.
        'transcript': ' '.join(segment['text'] for segment in transcript),
    })

print(str(tr_nb) + ' transcripts retrieved')

# Transform the list into a DataFrame
import pandas as pd
df = pd.DataFrame(videos, columns=['title', 'url', 'transcript'])
df


## Put that into Pinecone

In [None]:
# Copied from https://docs.pinecone.io/docs/langchain

import tiktoken
import pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter

# OpenAI API key (placeholder value here); passed to OpenAIEmbeddings below.
# NOTE(review): avoid committing a real key in the notebook.
OPENAI_KEY = "XXXX"


# Tokenizer used by tiktoken_len to measure text length in tokens.
tokenizer = tiktoken.get_encoding('cl100k_base')

# create the length function
def tiktoken_len(text):
    """Return the number of tokens the module-level ``tokenizer`` yields for ``text``.

    ``disallowed_special=()`` is passed to ``encode`` so that text which
    happens to contain special-token markers is encoded rather than rejected.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

# Chunking: sizes are measured in tokens (tiktoken_len), and the splitter is
# given separators from coarsest (blank line) to finest (single character).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=50,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

from langchain.embeddings.openai import OpenAIEmbeddings

model_name = 'text-embedding-ada-002'
# Vector size produced by model_name; the Pinecone index must use the same value.
embedding_dimension = 1536  # 1536 dim of text-embedding-ada-002

# Pin the embedding model explicitly instead of relying on the library
# default: the index dimension is tied to this specific model.
embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=OPENAI_KEY,
)
# Show only the model name: printing the whole object may echo the API key
# into the saved notebook output (NOTE(review): depends on langchain version).
print(embed.model)

import pinecone

index_name = 'otsecgpt'
pinecone.init(
    api_key='YYYYYY',
    environment='gcp-starter'
)

if index_name not in pinecone.list_indexes():
    # we create a new index
    pinecone.create_index(
        name=index_name,
        metric='cosine',
        dimension=embedding_dimension
    )

active_indexes = pinecone.list_indexes()
print(active_indexes)


from tqdm.auto import tqdm
from uuid import uuid4

# Checking the data
df

index = pinecone.Index(index_name)

# Maximum number of vectors embedded and upserted per request.
batch_limit = 100


def _upsert_chunks(chunk_texts, chunk_metadatas, max_batch):
    """Embed the chunks and upsert them, at most ``max_batch`` per request.

    ``chunk_texts`` and ``chunk_metadatas`` are parallel lists. Each vector
    gets a fresh random UUID as id, so re-running the cell inserts duplicates
    rather than overwriting earlier vectors.
    """
    for start in range(0, len(chunk_texts), max_batch):
        batch_texts = chunk_texts[start:start + max_batch]
        batch_metadatas = chunk_metadatas[start:start + max_batch]
        ids = [str(uuid4()) for _ in batch_texts]
        embeds = embed.embed_documents(batch_texts)
        index.upsert(vectors=list(zip(ids, embeds, batch_metadatas)))


# Chunks (and their metadata) waiting to be sent
texts = []
metadatas = []

for record in tqdm(videos):
    print(record)
    # first get metadata fields for this record
    metadata = {
        'url': str(record['url']),
        'title': record['title'],
    }
    # now we create chunks from the record text
    record_texts = text_splitter.split_text(record['transcript'])
    # create individual metadata dicts for each chunk
    record_metadatas = [
        {"chunk": j, "text": text, **metadata}
        for j, text in enumerate(record_texts)
    ]
    # append these to current batches
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)
    # One long transcript can push the pending list well past batch_limit,
    # so the helper splits it into requests of at most batch_limit vectors.
    if len(texts) >= batch_limit:
        _upsert_chunks(texts, metadatas, batch_limit)
        texts = []
        metadatas = []

# Send whatever is left after the last record
if texts:
    _upsert_chunks(texts, metadatas, batch_limit)

index.describe_index_stats()


# Parsing the audio from my elearning videos

## Extracting the audio files

In [None]:
import sys
from moviepy.editor import *

# Folder containing the videos
input_folder = 'C:\\Users\\soull\\Downloads\\elearning_videos'
# Output folder
output_folder = 'C:\\Users\\soull\\Downloads\\elearning_audio'

# Make sure the destination exists before writing into it
os.makedirs(output_folder, exist_ok=True)

# Extract the audio track of every file found in input_folder
for videofile in os.listdir(input_folder):
    print(videofile)
    video = VideoFileClip(os.path.join(input_folder, videofile))
    try:
        audio = video.audio
        if audio is None:
            # A clip without an audio track has nothing to export
            print('  no audio track, skipped')
            continue
        # The original file name (extension included) is kept and '.mp3' is
        # appended; the later S3/Transcribe cells rely on the '.mp3' suffix.
        audio.write_audiofile(os.path.join(output_folder, videofile + '.mp3'))
    finally:
        # Release the reader resources held by the clip
        video.close()

## Getting the transcript through AWS

### Upload to S3

In [None]:
# Credentials
# NOTE(review): these are not passed to boto3.resource('s3') below, so the S3
# calls use boto3's default credential resolution. They are only used by the
# Transcribe client created in a later cell. Avoid committing real keys.
access_key = 'ZZZZZ'
secret_key = 'ZZZZZ'

import sys
import boto3

s3_resource = boto3.resource('s3')
bucket = s3_resource.Bucket(name='XXX_bucket_XXX')

# Upload every extracted audio file under the 'input/' prefix
for audiofile in os.listdir(output_folder):
    # Build the path from output_folder (the directory actually listed)
    # instead of repeating the folder literal.
    bucket.upload_file(os.path.join(output_folder, audiofile), Key='input/' + audiofile)
    print('File ' + audiofile + ' uploaded')


### Launch the transcription job using AWS Transcribe

In [None]:
# Code from https://gist.github.com/viethoangtranduong/28a365e6457f35e206779995f488318a & https://towardsdatascience.com/a-quick-tutorial-to-aws-transcribe-with-python-53bbf6605a55

import uuid
transcribe = boto3.client(
    'transcribe',
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
    region_name="us-east-1",
)

# Start one transcription job per audio file found under 'input/' in the bucket
for objects in bucket.objects.filter(Prefix="input/"):
    # Skip the bare 'input/' key (the prefix itself), which is not an audio file
    if objects.key == 'input/':
        continue
    # Job names must be unique; named job_name so the builtin id() is not shadowed
    job_name = str(uuid.uuid1())
    # Result file name: input file name with spaces replaced by underscores
    output_name = str(objects.key).replace('input/', '').replace(' ', '_')
    print(objects.key)
    response = transcribe.start_transcription_job(
        TranscriptionJobName=job_name,
        LanguageCode='en-US',
        MediaFormat='mp3',
        Media={
            'MediaFileUri': 's3://' + bucket.name + '/' + objects.key},
        OutputBucketName=bucket.name,
        OutputKey='output/' + output_name,
    )
    print(output_name)
    print(response)



### Get the results

In [None]:
# Raw content of each result object, one decoded string per transcript
transcripts = []
# Credentials
access_key = 'XXXXX'
secret_key = 'XXXXX'

import sys
import boto3

s3_resource = boto3.resource('s3')
bucket = s3_resource.Bucket(name='XXX_bucket_XXX')

# Download every object under 'output/' whose key contains '.mp3'
for objects in bucket.objects.filter(Prefix="output/"):
    if '.mp3' not in str(objects.key):
        continue
    print(objects.key)
    raw_body = s3_resource.Object('XXX_bucket_XXX', objects.key).get()['Body'].read()
    #pprint(file_content)
    transcripts.append(raw_body.decode('utf-8'))
print(len(transcripts))


In [None]:
transcripts
# Parse the first downloaded result and pull out its transcript text
test = json.loads(transcripts[0])
texte = test['results']['transcripts'][0]['transcript']

## Connect to Pinecone

In [None]:
# Copied from https://docs.pinecone.io/docs/langchain

import tiktoken
import pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter

# OpenAI API key (placeholder value here); passed to OpenAIEmbeddings below.
# NOTE(review): avoid committing a real key in the notebook.
OPENAI_KEY = "YYYYY"


# Tokenizer used by tiktoken_len to measure text length in tokens.
tokenizer = tiktoken.get_encoding('cl100k_base')

# create the length function
def tiktoken_len(text):
    """Count the tokens of ``text`` using the module-level ``tokenizer``.

    The text is encoded with ``disallowed_special=()``, so special-token
    markers occurring in the text do not make ``encode`` raise.
    """
    token_ids = tokenizer.encode(text, disallowed_special=())
    return len(token_ids)

# Chunking for the e-learning transcripts: sizes are measured in tokens
# (tiktoken_len), with separators from coarsest (blank line) to finest.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=150,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

from langchain.embeddings.openai import OpenAIEmbeddings

model_name = 'text-embedding-ada-002'
# Vector size produced by model_name; the Pinecone index must use the same value.
embedding_dimension = 1536  # 1536 dim of text-embedding-ada-002

# Pin the embedding model explicitly instead of relying on the library
# default: the index dimension is tied to this specific model.
embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=OPENAI_KEY,
)
# Show only the model name: printing the whole object may echo the API key
# into the saved notebook output (NOTE(review): depends on langchain version).
print(embed.model)

import pinecone

index_name = 'otsecgpt'
pinecone.init(
    api_key='YYYY',
    environment='gcp-starter'
)

if index_name not in pinecone.list_indexes():
    # we create a new index
    pinecone.create_index(
        name=index_name,
        metric='cosine',
        dimension=embedding_dimension
    )

active_indexes = pinecone.list_indexes()
print(active_indexes)

## Indexing my videos 

In [None]:
from tqdm.auto import tqdm
from uuid import uuid4

# Checking the data


index = pinecone.Index(index_name)

# Maximum number of vectors embedded and upserted per request.
batch_limit = 100


def _upsert_chunks(chunk_texts, chunk_metadatas, max_batch):
    """Embed the chunks and upsert them, at most ``max_batch`` per request.

    ``chunk_texts`` and ``chunk_metadatas`` are parallel lists. Each vector
    gets a fresh random UUID as id, so re-running the cell inserts duplicates
    rather than overwriting earlier vectors.
    """
    for start in range(0, len(chunk_texts), max_batch):
        batch_texts = chunk_texts[start:start + max_batch]
        batch_metadatas = chunk_metadatas[start:start + max_batch]
        ids = [str(uuid4()) for _ in batch_texts]
        embeds = embed.embed_documents(batch_texts)
        index.upsert(vectors=list(zip(ids, embeds, batch_metadatas)))


# Chunks (and their metadata) waiting to be sent
texts = []
metadatas = []

for i, record in enumerate(tqdm(transcripts)):
    print(record)
    test = json.loads(record)
    texte = test['results']['transcripts'][0]['transcript']
    print(texte)
    # first get metadata fields for this record
    # This dict must be built here: without it the chunks would either raise
    # a NameError or silently inherit the metadata left over from another cell.
    # NOTE(review): assumes the Transcribe result JSON carries a top-level
    # 'jobName'; it is a job id, not a human-readable title. Replace it with
    # a title derived from the S3 key if one is needed.
    metadata = {
        'title': test.get('jobName', 'transcript-' + str(i)),
        'type': 'elearning_video',
    }
    # now we create chunks from the record text
    record_texts = text_splitter.split_text(texte)
    # create individual metadata dicts for each chunk
    record_metadatas = [
        {"chunk": j, "text": text, **metadata}
        for j, text in enumerate(record_texts)
    ]
    # append these to current batches
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)
    # One long transcript can push the pending list well past batch_limit,
    # so the helper splits it into requests of at most batch_limit vectors.
    if len(texts) >= batch_limit:
        _upsert_chunks(texts, metadatas, batch_limit)
        texts = []
        metadatas = []

# Send whatever is left after the last record
if texts:
    _upsert_chunks(texts, metadatas, batch_limit)

index.describe_index_stats()

## Indexing via the pinecone client

In [None]:
from tqdm.auto import tqdm
from uuid import uuid4

# Checking the data
df

index = pinecone.Index(index_name)

# Maximum number of vectors embedded and upserted per request.
batch_limit = 100


def _upsert_chunks(chunk_texts, chunk_metadatas, max_batch):
    """Embed the chunks and upsert them, at most ``max_batch`` per request.

    ``chunk_texts`` and ``chunk_metadatas`` are parallel lists. Each vector
    gets a fresh random UUID as id, so re-running the cell inserts duplicates
    rather than overwriting earlier vectors.
    """
    for start in range(0, len(chunk_texts), max_batch):
        batch_texts = chunk_texts[start:start + max_batch]
        batch_metadatas = chunk_metadatas[start:start + max_batch]
        ids = [str(uuid4()) for _ in batch_texts]
        embeds = embed.embed_documents(batch_texts)
        index.upsert(vectors=list(zip(ids, embeds, batch_metadatas)))


# Chunks (and their metadata) waiting to be sent
texts = []
metadatas = []

for record in tqdm(videos):
    print(record)
    # first get metadata fields for this record
    metadata = {
        'url': str(record['url']),
        'title': record['title'],
    }
    # now we create chunks from the record text
    record_texts = text_splitter.split_text(record['transcript'])
    # create individual metadata dicts for each chunk
    record_metadatas = [
        {"chunk": j, "text": text, **metadata}
        for j, text in enumerate(record_texts)
    ]
    # append these to current batches
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)
    # One long transcript can push the pending list well past batch_limit,
    # so the helper splits it into requests of at most batch_limit vectors.
    if len(texts) >= batch_limit:
        _upsert_chunks(texts, metadatas, batch_limit)
        texts = []
        metadatas = []

# Send whatever is left after the last record
if texts:
    _upsert_chunks(texts, metadatas, batch_limit)

index.describe_index_stats()

## Index the content of some Kindle books
*DRM needs to be removed first: https://www.cloudwards.net/remove-drm-from-kindle-books/*

I also did a bit of manual cleanup after exporting the books to txt files.

In [None]:
# Get the content of the folder
import os
path = 'C:\\Users\\soull\\Desktop\\books'
#path = 'C:\\Users\\soull\\Desktop\\books\\test'

# One {'title', 'text'} dict per .txt file found in the folder
books_content = []
for bookfile in os.listdir(path):
    # Only the exported .txt files are books; ignore everything else
    if not bookfile.endswith('.txt'):
        continue
    # The title is the file name without its '.txt' extension
    title = bookfile[:-4]
    print('Title: ' + title)
    with open(path + '\\' + bookfile, 'r', encoding='utf8') as f:
        book = {'title': title, 'text': f.read()}
    books_content.append(book)
print('==> ' + str(len(books_content)) + ' books parsed')

## Process into chunks of text and ingest into Pinecone

In [None]:
from tqdm.auto import tqdm
from uuid import uuid4
import pprint
import time

index = pinecone.Index(index_name)

# Maximum number of chunks embedded and upserted per request.
batch_limit = 32
# Position in books_content to start from.
# NOTE(review): 4 skips the first four books, presumably left over from
# resuming an interrupted run. Set it to 0 to ingest every book.
resume_from = 4
# Seconds to wait after each request.
pause_seconds = 10


def _upsert_book_batch(batch_texts, batch_metadatas):
    """Embed one batch and upsert it, retrying once if the first attempt fails.

    The same ids are reused for the retry, so a partially applied first
    attempt is overwritten instead of duplicated. A second failure propagates.
    """
    ids = [str(uuid4()) for _ in batch_texts]
    try:
        embeds = embed.embed_documents(batch_texts)
        index.upsert(vectors=list(zip(ids, embeds, batch_metadatas)))
    except Exception:
        print('error, trying once again')
        embeds = embed.embed_documents(batch_texts)
        index.upsert(vectors=list(zip(ids, embeds, batch_metadatas)))


def _flush_book_chunks(pending_texts, pending_metadatas):
    """Send all pending chunks in slices of ``batch_limit``, pausing after each."""
    for k in tqdm(range(0, len(pending_texts), batch_limit)):
        # Slice ends are exclusive: [k:k + batch_limit] is exactly one batch.
        # Texts and metadata are sliced from the same parallel lists so each
        # chunk keeps its own metadata.
        _upsert_book_batch(
            pending_texts[k:k + batch_limit],
            pending_metadatas[k:k + batch_limit],
        )
        time.sleep(pause_seconds)


# Chunks (and their metadata) waiting to be sent
texts = []
metadatas = []

for i, record in enumerate(books_content[resume_from:]):
    print(i)
    # first get metadata fields for this record
    metadata = {
        'title': record['title'],
        'type': 'book'
    }
    print(record['title'])
    # now we create chunks from the record text
    record_texts = text_splitter.split_text(record['text'])
    print('Nb de chunks ' + str(len(record_texts)))
    # create individual metadata dicts for each chunk
    record_metadatas = [
        {"chunk": j, "text": text, **metadata}
        for j, text in enumerate(record_texts)
    ]
    # append these to current batches
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)
    # if we have reached the batch_limit we can add texts
    if len(texts) >= batch_limit:
        print("Too long; we're in the loop")
        _flush_book_chunks(texts, metadatas)
        # Reset, otherwise every later book would re-send all earlier chunks
        texts = []
        metadatas = []

# Send the chunks still pending after the last book
if texts:
    _flush_book_chunks(texts, metadatas)

index.describe_index_stats()

# Time to test: ask some questions!

In [None]:
import openai
from langchain import PromptTemplate
from langchain.llms import OpenAI

OPENAI_KEY = "XXXXX"
# A single LLM object, named `llm` so the imported `openai` module is not
# shadowed by a variable of the same name.
llm = OpenAI(
    openai_api_key=OPENAI_KEY,
    model_name="text-davinci-003",
)

question_template = """
You are an expert in Industrial Control Systems cybersecurity.
The answer should be in your own words and should not exceed 300 words. The answer should be in English.
{context}
now based on above context answer the following question:
{question}

Answer:
"""

proper_prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template=question_template
)

question = "How should I configure nmap to perform scan on an industrial control systems environment?"

from langchain.vectorstores import Pinecone
# Metadata field that holds the chunk text
text_field = "text"
# switch back to normal index for langchain
# NOTE(review): the ingestion cells write to 'otsecgpt' but this reads
# 'otsechotline'. Confirm which index is intended.
index = pinecone.Index('otsechotline')
vectorstore = Pinecone(
    index, embed.embed_query, text_field
)
context = vectorstore.similarity_search(
    question,  # our search query
    k=3  # return 3 most relevant docs
)
print(context)
# Put only the chunk text in the prompt, not the repr of a list of Document
# objects (which adds metadata and escaped newlines).
context_text = "\n\n".join(doc.page_content for doc in context)
better_prompt = proper_prompt_template.format(context=context_text, question=question)
# Answer without retrieved context, for comparison with the augmented answer
standard_answer = llm(question)
print("Standard prompt:\n", standard_answer, "\n")
print("Proper prompt:\n", llm(better_prompt), "\n")

# Switching to GPT4

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import AIMessage, HumanMessage, SystemMessage
import pprint
import pinecone
from langchain.vectorstores import Pinecone
OPENAI_KEY = "XXXXX"
# NOTE(review): the section heading says GPT-4, but no model is specified
# here, so ChatOpenAI uses its library default. Pass model_name="gpt-4" if
# that is what is intended.
chat = ChatOpenAI(temperature=0, openai_api_key=OPENAI_KEY)

# Define the question and the text field before they are used, so this cell
# does not depend on values left over from another cell.
question = "What is a VFD?"
# Metadata field that holds the chunk text
text_field = "text"

# NOTE(review): the ingestion cells write to 'otsecgpt' but this reads
# 'otsechotline'. Confirm which index is intended.
index = pinecone.Index('otsechotline')
vectorstore = Pinecone(
    index, embed.embed_query, text_field
)

context = vectorstore.similarity_search(
    question,  # our search query
    k=3  # return 3 most relevant docs
)
# Print whatever came back; there may be fewer than 3 results
for document in context:
    print(document)

# Put only the chunk text in the message, not the repr of a list of Documents
context_text = "\n\n".join(document.page_content for document in context)

messages = [
    SystemMessage(
        content="You are a helpful assistant that explains cybersecurity. All your answers must suggest the most secure way to do things. All the questions you receive are in the context of Industrial Control Systems."
    ),
    SystemMessage(
        content="You can answer based on the following trusted information : " + context_text
    ),
    HumanMessage(
        content=question
    ),
]
pprint.pprint(chat(messages))

