In [1]:
import pinecone
import json
from tqdm.notebook import tqdm_notebook
from tqdm import tqdm
import os
import pandas as pd
import datetime
import time


keys_path = '../keys/'
data_path = '../../TG_messages/'

  from tqdm.autonotebook import tqdm


In [2]:
with open(keys_path+'api_keys.json') as f:
  data = json.loads(f.read())

#load openai credentials
openai_key = data['openai_key']

# load pinecone credentials
pine_key = data['pine_key']
pine_env = data['pine_env']

# Prepare data for pinecone

In [3]:
# collect all df's' into json

# set start date
start_date = pd.Timestamp(datetime.datetime.now() - datetime.timedelta(days=30), tz='UTC')

files = os.listdir(data_path)
df_list = []
missed_channels = []

files = os.listdir(data_path)
for file in tqdm_notebook(files):
    if file.endswith('.pkl'):
        # try to load df
        try:
            df_temp = pd.read_pickle(data_path+file)
        except: 
            missed_channels.append(file)
            continue
        # if df empty, skip
        if df_temp.shape[0] == 0:
            continue
        # select data for last 30 days
        df_temp = df_temp[df_temp['date'] >= start_date]
        # id as channel_id + message_id
        df_temp['id'] = df_temp['channel'] + '_' + df_temp['id'].astype(str)
        df_temp = df_temp[['id', 'channel','stance', 'date', 'message', 'views', 'cleaned_message', 'summary', 'embeddings']]
        df_list.append(df_temp)

  0%|          | 0/47 [00:00<?, ?it/s]

In [4]:
missed_channels

[]

In [5]:
df4pine = pd.concat(df_list)
df4pine.shape, df4pine.date.min(), df4pine.date.max()

((40714, 9),
 Timestamp('2023-10-01 00:00:15+0000', tz='UTC'),
 Timestamp('2023-10-17 12:19:13+0000', tz='UTC'))

In [6]:
df4pine.embeddings.isnull().sum() / df4pine.shape[0]

0.24728594586628677

It appears most of news were processed without boost script

## Creating openai embeds
To create embeddings faster need json file as per requs in api_request_parallel_processor.py

In [None]:
# df4pine to json in following format:
# {"model": "text-embedding-ada-002", "input": text}
model = "text-embedding-ada-002"
# convert df4pine to json
json_list = []
for i in tqdm(range(df4pine.shape[0])):
    json_dict = {}
    json_dict["model"] = model
    json_dict["input"] = df4pine.iloc[i]['summary']
    json_dict["id"] = df4pine.iloc[i]['id']
    json_list.append(json_dict)

# save json_list to jsonl
import json
with open('df4pine.jsonl', 'w') as f:
    for item in json_list:
        json.dump(item, f)
        f.write('\n')

Embeds were acquired via script and saved into df4pine_embeds.jsonl

### check what data we have

In [17]:
# open df4pine.jsonl as df
df4pine_jsonl = pd.read_json('df4pine_embeds.jsonl', lines=True)
df4pine_jsonl.shape

(40714, 2)

In [18]:
df4pine_jsonl[0][1]

{'model': 'text-embedding-ada-002',
 'input': 'ХАМАС также заявил о запуске сотни ракет в направлении Ашкелона на юге Израиля. ▪Иранские службы безопасности помогли палестинскому исламистскому движению ХАМАС спланировать атаку на Израиль, сообщает The Wall Street Journal. ▪Власти Израиля после нападения со стороны палестинского движения ХАМАС обратились к США с просьбой помочь с пополнением запасов ракет-перехватчиков для системы противоракетной обороны «Железный купол», бомб малого диаметра и боеприпасов для пулеметов, сообщает The Washington Post.',
 'id': 'rbc_news_82590'}

In [None]:
df4pine_jsonl[1][0].keys()

dict_keys(['object', 'data', 'model', 'usage'])

In [15]:
df4pine_jsonl[1][0]['data'][-1].keys()

dict_keys(['object', 'index', 'embedding'])

## prepare data for pinecone
- date to int
- add new embeds from json
- convert to pinecone format (id, values, metadata)

In [13]:
# convert date to integer (without time)
df4pine['date'] = df4pine['date'].dt.date
df4pine['date'] = df4pine['date'].apply(lambda x: int(time.mktime(x.timetuple())))

### Combine new (full) embeds with df4pine

In [19]:
# create df from df4pine_jsonl with id and embeddings
df_new_embeds = pd.DataFrame()
df_new_embeds['id_new'] = df4pine_jsonl[0].apply(lambda x: x['id'])
df_new_embeds['values'] = df4pine_jsonl[1].apply(lambda x: x['data'][-1]['embedding'])
# add embeddings from df4pine_jsonl to df4pine
df4pine = df4pine.merge(df_new_embeds, left_on='id', right_on='id_new', how='left')

In [20]:
# check if id's & embeddings are same
df4pine.id.equals(df4pine.id_new), df4pine[:1000].embeddings.equals(df4pine[:1000].values)

(True, False)

Embeddings do not match. Because initially were calculated on cleaned message and next time on summaries.

In [21]:
# need only columns: id, embeddings and meta as dictionary of clean_message, summary, stance, channel, date, views
df4pine['metadata'] = df4pine[['cleaned_message', 'summary', 'stance', 'channel', 'date', 'views']].to_dict('records')
df4pine.drop(columns=['id_new', 'embeddings', 'message', 'cleaned_message', 'summary', 'stance', 'channel', 'date', 'views'], inplace=True)

In [22]:
print(df4pine.shape)
df4pine.tail(3)

(40714, 3)


Unnamed: 0,id,values,metadata
40711,readovkanews_67921,"[-0.03213842, -0.011505554000000001, 0.0026594...",{'cleaned_message': 'Владимир Путин провел вст...
40712,readovkanews_67922,"[0.012027255, 0.008897582000000001, -0.0068607...",{'cleaned_message': 'Статью экс-помощника през...
40713,readovkanews_67923,"[-0.027030565000000003, -0.012685874000000001,...",{'cleaned_message': 'В Химках дети мигрантов б...


In [23]:
df4pine.metadata[0]

{'cleaned_message': '17-ого октября в 15:45 в Московском городском суде состоится судебное заседание по поводу апелляции И.И. Стрелкова.',
 'summary': '17-ого октября в 15:45 в Московском городском суде состоится судебное заседание по поводу апелляции И.И. Стрелкова.',
 'stance': 'voenkor',
 'channel': 'strelkovii',
 'date': 1696881600,
 'views': 283345.0}

# upsert to pinecone

In [24]:
pinecone.init(api_key=pine_key, environment=pine_env)
index_name = 'tg-news'

index = pinecone.Index(index_name)

index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.32058,
 'namespaces': {'': {'vector_count': 32058}},
 'total_vector_count': 32058}

In [25]:
# describe index
pinecone.describe_index('tg-news')

IndexDescription(name='tg-news', metric='cosine', replicas=1, dimension=1536.0, shards=1, pods=1, pod_type='starter', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')

In [26]:
bath_size = 100
for i in tqdm_notebook(range(0, df4pine.shape[0], bath_size)):
    index.upsert(vectors=df4pine.iloc[i:i+bath_size].to_dict('records'))

  0%|          | 0/408 [00:00<?, ?it/s]

In [27]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.41813,
 'namespaces': {'': {'vector_count': 41813}},
 'total_vector_count': 41813}

In [93]:
df4pine['id'][0].split('_')

['strelkovii', '6386']

In [96]:
# get last digits from id
df4pine['id'].apply(lambda x: x.split('_')[-1]).sample(100)
df4pine['id'].apply(lambda x: '_'.join(x.split('_')[:-1])).sample(100)

2410        lentadnya
32386         BFMnews
33806      rentv_news
5592         izvestia
28922      zvezdanews
             ...     
34640      rentv_news
894      breakingmash
37198     tass_agency
20051        truekpru
28485      zvezdanews
Name: id, Length: 100, dtype: object