In [91]:
%load_ext autoreload
%autoreload 2

import json
import pandas as pd
from listennotes import podcast_api
from dotenv import load_dotenv
load_dotenv('./.env', override=True)
import os
import numpy as np
from sentence_transformers import SentenceTransformer, util
import torch
from typing import List, Dict, Tuple, Union
from tqdm import tqdm
import time
from math import ceil
from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor
from preprocess_helpers import Splitters
import openai
from retrieval import Retriever
from reranker import ReRanker
from tiktoken_functions import Tokenizer
import tiktoken

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### 1.) Import Podcast Transcript

In [2]:
podcast_folder = '/home/elastic/notebooks/podcast_transcripts/'
podcasts = [os.path.join(podcast_folder, file) for file in os.listdir(podcast_folder) if file.endswith('.txt')]
len(podcasts)

100

In [3]:
def load_podcasts(files: List[str]) -> List[str]:
    transcripts = []
    for file in files:
        with open(file) as f:
            text = f.read().strip()
            transcripts.append(text)
    return transcripts

In [4]:
%%time
transcripts = load_podcasts(podcasts)

CPU times: user 2.73 ms, sys: 4.69 ms, total: 7.42 ms
Wall time: 7.18 ms


### 2a.) Split Text into Sentences - Spacy

In [9]:
from pysbd import Segmenter
import spacy

nlp = spacy.load('en_core_web_md')
biencoder = 'all-MiniLM-L6-v2'

In [43]:
doc = nlp(text)

In [14]:
def split_texts(texts: List[str]) -> List[List[str]]:
    nlp = spacy.load('en_core_web_md')
    all_texts = []
    start = time.perf_counter()
    for text in tqdm(texts, 'Texts'):
        doc = nlp(text)
        sentences = [str(sent).strip() for sent in doc.sents]
        # half = int(ceil(len(sentences)/2))
        # sentences[:half]
        all_texts.append(sentences)
    end = time.perf_counter() - start
    print(f'Total Time: {round(end/60, 2)} minutes')
    return all_texts

In [15]:
text_sentences = split_texts(transcripts)

Texts: 100%|███████████████████████████████████████████████████████████████████████| 100/100 [01:13<00:00,  1.35it/s]

Total Time: 1.23 minutes





In [25]:
pd.DataFrame([len(corpus) for corpus in text_sentences]).describe()

Unnamed: 0,0
count,100.0
mean,313.22
std,115.056032
min,162.0
25%,235.75
50%,267.5
75%,371.0
max,604.0


### 2b.) Split Text into Sentences - SentenceSplitter

In [5]:
%%time
split_sentences = [Splitters().split_into_sentences(text) for text in transcripts]

CPU times: user 513 ms, sys: 8.08 ms, total: 521 ms
Wall time: 520 ms


In [6]:
pd.DataFrame([len(corpus) for corpus in split_sentences]).describe()

Unnamed: 0,0
count,100.0
mean,313.76
std,114.630995
min,162.0
25%,236.75
50%,271.5
75%,371.75
max,604.0


In [7]:
for x in range(0,len(split_sentences),8):
    print(sum([len(sentence.split()) for sentence in split_sentences[0][x:x+8]]))

86
103
128
110
192
100
140
154
197
130
166
127
107


### 2c.) Split Text into Sentences - PyBSD

In [9]:
seg = Segmenter(clean=True)

In [10]:
# %%time
# results = [seg.segment(text) for text in tqdm(transcripts)]

In [11]:
def get_segment(text: str) -> List[str]:
    alist = seg.segment(text)
    return alist[0]

In [12]:
# get_segment(transcripts[0])

### 3. Group Sentences into chunks

In [8]:
def grouper(text_sentences: List[List[str]], sent_chunk_length: int=8) -> List[List[str]]:
    if isinstance(text_sentences[0], str):
        text_sentences = [text_sentences]
    chunks = []
    for corpus in text_sentences:
        for x in range(0,len(corpus),sent_chunk_length):
            achunk = []
            achunk.append(' '.join(corpus[x:x+sent_chunk_length]))
            chunks.append(achunk)
    return [string for alist in chunks for string in alist]
        

In [9]:
test = split_sentences[0][:240]

In [10]:
chunks = grouper(split_sentences)
chunks = grouper(test)

In [11]:
fd = pd.read_parquet('./test_chunks.parquet')
fd

Unnamed: 0,Chunks
0,Hello and welcome to The Intelligence from The...
1,"And not just any crisp, a particular flavor of..."
2,The battle between the FTC and Microsoft rolls...
3,Ms. Kahn isn't just aggressively going after b...
4,"But whatever your view, she is really shaking ..."
5,And so what does that approach look like in pr...
6,"A few weeks ago, I sat down with her to find o..."
7,How are the cases that they're bringing workin...
8,The most notable one occurred last year in Jul...
9,Kahn has argued that it could suppress competi...


In [12]:
df = pd.DataFrame(chunks, columns=['Chunks'])
# df.to_parquet('./test_chunks.parquet')
df.loc[0,'Chunks']

"Hello and welcome to The Intelligence from The Economist. I'm Aure Ogunbiyi. And I'm Jason Palmer. Every weekday we provide a fresh perspective on the events shaping your world. China was one of the last countries to abandon pandemic lockdowns, and investors and analysts alike were waiting anxiously for its recovery. The comeback came, but things aren't going quite the way that many expected. And what's the best possible fate for a potato? Our correspondent argues passionately that the answer is a crisp, a potato chip."

In [66]:
#create docs
docs = df.Chunks.values.tolist()
#create ids
ids = df.index.tolist()
dicts = [{'content': doc, 'name': 'Intelligent Economist'} for doc in docs]

### 4. Encode Chunks as Vectors

### 4a.) SentenceTransformers

In [14]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [15]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [25]:
vectors = model.encode(sentences=chunks, show_progress_bar=True, device='cuda:0')

Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.93it/s]


In [26]:
np.save('./test_vectors.npy', vectors, allow_pickle=False)

In [70]:
db = model.encode(sentences)
query = 'discussion on taking chances in life'

### 4b.) OpenAI Ada Embeddings

In [None]:
openai.api_key = os.environ['OPENAI_API_KEY']
model = "text-embedding-ada-002"

#get cost first
tokenizer = Tokenizer(model_type="cl100k_base", price=0.001)

cost = tokenizer.get_cost(docs)

results = openai.Embedding.create(input=docs, engine=model)
vectors = results['data']
vectors = [vec['embedding'] for vec in vectors]
len(vectors)

### Indexing (Qdrant)

In [36]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

In [40]:
client = QdrantClient('localhost', port=6333)
collection = 'test_collection'

In [69]:
client.recreate_collection(collection_name=collection, vectors_config=VectorParams(size=1536, distance=Distance.COSINE))

True

In [46]:
# model = SentenceTransformer('models/my-128dim-model')

In [71]:
client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='test_collection')])

In [70]:
client.upload_collection(collection_name=collection, vectors=vectors, payload=dicts, ids=ids, batch_size=256)

In [67]:
client.delete_collection(collection)

True

### Search (Qdrant)

In [102]:
retriever = Retriever(model_type=model)
reranker = ReRanker()

In [108]:
query = 'GOP thoughts on antitrust law'

In [109]:
%%time
results = retriever.search(query, collection=collection, limit=25, return_all=False)
sorted_scores = reranker.rerank(results, query)

CPU times: user 40.4 ms, sys: 7.15 ms, total: 47.6 ms
Wall time: 245 ms


In [112]:
import urllib.request
urllib.request.urlretrieve("https://artifacts.opensearch.org/models/ml-models/amazon/gpt/GPT2_xl_sqg/1.0.0/generate.py", "generate.py")

('generate.py', <http.client.HTTPMessage at 0x7fadcaceb1c0>)

In [113]:
ls

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
 [0m[01;34mIRIS[0m/                            [01;34mpodcast_summary_demo[0m/
 [01;34mPC_Metrics[0m/                      [01;34mpodcast_transcripts[0m/
 [01;34m__pycache__[0m/                     podcast_vectors.npy
 bert_similarity.ipynb            preprocess_helpers.py
 calculating_similarities.ipynb   [01;34mqdrant_storage[0m/
 chunks.parquet                  'query_data(4).csv'
 [01;34mdata[0m/                            ray_data_practice.ipynb
 [01;34mdatasets[0m/                        reranker.py
 embeddings_helper.py             retrieval.py
 generate.py                      [01;34mroberta-stsb-cross-encoder[0m/
 install_kernel.sh                [01;34mspace[0m/
 [01;34mmodel

Bad pipe message: %s [b'\xf0{u\xbaQ_y\x16)(']
Bad pipe message: %s [b'\x88\xd3\xc9E\xc9\xf6 M\x96\xac"2{\xb6\x81y\x81g\x02\x19\x03"\xf54\xcc\xa0q\xc7\xa85\x19\xd9A\x8a\xdf\x00\x07\x9f\xbc\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00']
Bad pipe message: %s [b"U7A\xa4\x1f\xdbB\xd6`W\xeb\xbb\xfe\xdd\x13\xeb\xfc\xc9\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x00", b'\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00/\x00\x9a\x00\x99\xc0\x07\xc0\x11\x00\x96\x00\x05\x00\xff\x01\x00\x00j\x00\x00\x00\x0e\x00\x0c\x00\x00']
Bad pipe message: %s [b"\x19\xd0y\xa2\x8f.\xf7\x82v\x07\