In [1]:
import pinecone
import json
from tqdm.notebook import tqdm_notebook
from tqdm import tqdm
import os
import pandas as pd
import datetime
import time


# Directory that holds api_keys.json (relative to the notebook)
keys_path = '../keys/'
# Directory that is scanned for the per-channel .pkl message frames
data_path = '../../TG_messages/'

  from tqdm.autonotebook import tqdm


In [2]:
# Read all credentials from the shared JSON key file
with open(keys_path + 'api_keys.json') as f:
    data = json.load(f)

# OpenAI credentials
openai_key = data['openai_key']

# Pinecone credentials (API key and environment name)
pine_key, pine_env = data['pine_key'], data['pine_env']

# 1 - Create pinecone DB

## Collect downloaded dataset

In [3]:
# Collect the per-channel message DataFrames (one pickle per channel) into a list.

# Start of the 30-day window as a timezone-aware UTC timestamp.
# (Stamping a naive local datetime with tz='UTC' would shift the window by the
# machine's UTC offset.)
start_date = pd.Timestamp.now(tz='UTC') - pd.Timedelta(days=30)

files = os.listdir(data_path)
df_list = []
missed_channels = []  # pickle files that could not be loaded

pine_columns = ['id', 'channel', 'stance', 'date', 'message', 'views',
                'cleaned_message', 'summary', 'embeddings']

for file in tqdm_notebook(files):
    if not file.endswith('.pkl'):
        continue
    # NOTE: unpickling executes arbitrary code - only load files produced by our own pipeline.
    try:
        df_temp = pd.read_pickle(os.path.join(data_path, file))
    except Exception:
        # remember the file and keep going; KeyboardInterrupt/SystemExit still propagate
        missed_channels.append(file)
        continue
    # if df empty, skip
    if df_temp.shape[0] == 0:
        continue
    # select data for last 30 days; .copy() so the assignment below does not
    # write into a slice of the loaded frame (SettingWithCopyWarning)
    df_temp = df_temp[df_temp['date'] >= start_date].copy()
    # id as channel_id + message_id
    df_temp['id'] = df_temp['channel'] + '_' + df_temp['id'].astype(str)
    df_temp = df_temp[pine_columns]
    df_list.append(df_temp)

  0%|          | 0/47 [00:00<?, ?it/s]

In [4]:
# Files whose pickle failed to load in the collection loop (empty list = everything loaded)
missed_channels

[]

In [5]:
# Stack all channel frames; ignore_index=True gives a fresh 0..n-1 index so that
# labels inherited from the individual channel frames are not repeated.
df4pine = pd.concat(df_list, ignore_index=True)
df4pine.shape, df4pine.date.min(), df4pine.date.max()

((40714, 9),
 Timestamp('2023-10-01 00:00:15+0000', tz='UTC'),
 Timestamp('2023-10-17 12:19:13+0000', tz='UTC'))

In [6]:
# Share of rows that have no stored embedding (mean of the boolean null mask)
df4pine['embeddings'].isnull().mean()

0.24728594586628677

It appears that most of the news items were processed without the boost script.

## Creating openai embeds
To create the embeddings faster, we need a JSON Lines file in the format required by api_request_parallel_processor.py.

In [None]:
# Build one embedding request per row of df4pine, in the format
# {"model": "text-embedding-ada-002", "input": text, "id": id}
model = "text-embedding-ada-002"

# Iterate the two columns directly instead of df4pine.iloc[i][col], which
# materialises a full row Series twice per iteration.
json_list = [
    {"model": model, "input": summary, "id": message_id}
    for summary, message_id in tqdm(
        zip(df4pine['summary'], df4pine['id']), total=df4pine.shape[0]
    )
]

# save json_list as JSON Lines: one request object per line
with open('df4pine.jsonl', 'w') as f:
    for item in json_list:
        json.dump(item, f)
        f.write('\n')

The embeddings were acquired via the script and saved into df4pine_embeds.jsonl.

### check what data we have

In [17]:
# Load df4pine_embeds.jsonl (one JSON value per line) into a DataFrame
df4pine_jsonl = pd.read_json('df4pine_embeds.jsonl', lines=True)
df4pine_jsonl.shape

(40714, 2)

In [18]:
# Column 0 holds the request payload (model, input, id); show the one in row 1
df4pine_jsonl[0][1]

{'model': 'text-embedding-ada-002',
 'input': 'ХАМАС также заявил о запуске сотни ракет в направлении Ашкелона на юге Израиля. ▪Иранские службы безопасности помогли палестинскому исламистскому движению ХАМАС спланировать атаку на Израиль, сообщает The Wall Street Journal. ▪Власти Израиля после нападения со стороны палестинского движения ХАМАС обратились к США с просьбой помочь с пополнением запасов ракет-перехватчиков для системы противоракетной обороны «Железный купол», бомб малого диаметра и боеприпасов для пулеметов, сообщает The Washington Post.',
 'id': 'rbc_news_82590'}

In [None]:
# Column 1 holds the API response; list its top-level keys for row 0
df4pine_jsonl[1][0].keys()

dict_keys(['object', 'data', 'model', 'usage'])

In [15]:
# Keys of the last entry of the response's 'data' list (the entry carrying 'embedding')
df4pine_jsonl[1][0]['data'][-1].keys()

dict_keys(['object', 'index', 'embedding'])

## prepare data for pinecone
- date to int
- add new embeds from json
- convert to pinecone format (id, values, metadata)

In [13]:
# Truncate every timestamp to its calendar date and store it as integer epoch seconds.
# time.mktime interprets the date as midnight in the machine's local timezone.
df4pine['date'] = df4pine['date'].dt.date.apply(
    lambda day: int(time.mktime(day.timetuple()))
)

### Combine all embeds with df4pine

In [19]:
# Pull the message id out of each request (column 0) and the embedding out of
# each response (column 1) of df4pine_jsonl
df_new_embeds = pd.DataFrame({
    'id_new': df4pine_jsonl[0].apply(lambda request: request['id']),
    'values': df4pine_jsonl[1].apply(lambda response: response['data'][-1]['embedding']),
})
# Left-join the new embeddings onto df4pine by message id
df4pine = df4pine.merge(df_new_embeds, how='left', left_on='id', right_on='id_new')

In [20]:
# check if id's & embeddings are same
# NOTE: attribute access `df.values` returns the DataFrame's underlying array, not the
# column named 'values', so the column has to be selected with brackets.
df4pine['id'].equals(df4pine['id_new']), df4pine['embeddings'][:1000].equals(df4pine['values'][:1000])

(True, False)

The embeddings do not match, because they were initially calculated on the cleaned messages and the second time on the summaries.

In [21]:
# Keep only id, values and a per-row metadata dict built from the columns below
metadata_columns = ['cleaned_message', 'summary', 'stance', 'channel', 'date', 'views']
df4pine['metadata'] = df4pine[metadata_columns].to_dict('records')
# Everything folded into metadata (plus the superseded columns) is dropped
df4pine = df4pine.drop(columns=['id_new', 'embeddings', 'message'] + metadata_columns)

In [22]:
# Sanity check: shape and the last rows of the frame that will be upserted
print(df4pine.shape)
df4pine.tail(3)

(40714, 3)


Unnamed: 0,id,values,metadata
40711,readovkanews_67921,"[-0.03213842, -0.011505554000000001, 0.0026594...",{'cleaned_message': 'Владимир Путин провел вст...
40712,readovkanews_67922,"[0.012027255, 0.008897582000000001, -0.0068607...",{'cleaned_message': 'Статью экс-помощника през...
40713,readovkanews_67923,"[-0.027030565000000003, -0.012685874000000001,...",{'cleaned_message': 'В Химках дети мигрантов б...


In [23]:
# Inspect the metadata dict of the row labelled 0
df4pine.metadata[0]

{'cleaned_message': '17-ого октября в 15:45 в Московском городском суде состоится судебное заседание по поводу апелляции И.И. Стрелкова.',
 'summary': '17-ого октября в 15:45 в Московском городском суде состоится судебное заседание по поводу апелляции И.И. Стрелкова.',
 'stance': 'voenkor',
 'channel': 'strelkovii',
 'date': 1696881600,
 'views': 283345.0}

## Upsert to pinecone

In [24]:
# Initialise the pinecone client with the credentials loaded from api_keys.json
pinecone.init(api_key=pine_key, environment=pine_env)
index_name = 'tg-news'

# Handle to the index named above
index = pinecone.Index(index_name)

# Index statistics before the upsert (baseline for the check after it)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.32058,
 'namespaces': {'': {'vector_count': 32058}},
 'total_vector_count': 32058}

In [25]:
# describe index (reuse index_name instead of repeating the literal)
pinecone.describe_index(index_name)

IndexDescription(name='tg-news', metric='cosine', replicas=1, dimension=1536.0, shards=1, pods=1, pod_type='starter', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')

In [26]:
# Upsert df4pine in batches; each row becomes a record dict built from the frame's columns
batch_size = 100
for start in tqdm_notebook(range(0, df4pine.shape[0], batch_size)):
    batch = df4pine.iloc[start:start + batch_size]
    index.upsert(vectors=batch.to_dict('records'))

  0%|          | 0/408 [00:00<?, ?it/s]

In [27]:
# Index statistics after the upsert; compare total_vector_count with the earlier call
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.41813,
 'namespaces': {'': {'vector_count': 41813}},
 'total_vector_count': 41813}

In [93]:
# Example: split one id on '_' to see its channel / message-id parts
df4pine['id'][0].split('_')

['strelkovii', '6386']

In [96]:
# Part after the last '_' of each id.
# NOTE(review): this first expression is not the cell's last line, so its result is not displayed.
df4pine['id'].apply(lambda x: x.split('_')[-1]).sample(100)
# Everything before the last '_' - re-joined with '_' because the first part may itself contain '_'
df4pine['id'].apply(lambda x: '_'.join(x.split('_')[:-1])).sample(100)

2410        lentadnya
32386         BFMnews
33806      rentv_news
5592         izvestia
28922      zvezdanews
             ...     
34640      rentv_news
894      breakingmash
37198     tass_agency
20051        truekpru
28485      zvezdanews
Name: id, Length: 100, dtype: object

# 2 - Updating pinecone

In [51]:
import pandas as pd
import json
import re
import unicodedata
import time

import openai
import pinecone
from telethon import TelegramClient
from tqdm.notebook import tqdm_notebook
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

keys_path = '../keys/'
data_path = '../../TG_messages/'

In [33]:
# Read all credentials from the shared JSON key file
with open(keys_path + 'api_keys.json') as f:
    data = json.load(f)

# Telegram credentials
api_id, api_hash, phone = data['api_id'], data['api_hash'], data['phone']

# OpenAI credentials
openai_key = data['openai_key']

# Pinecone credentials (API key and environment name)
pine_key, pine_env = data['pine_key'], data['pine_env']

Questions
1) Identify which data to download:
- by date
- by id
Either way, we need to store the last date or the last id, so let's store last_id.

2) Remove duplicates in pinecone 
- they should not be there as id is exactly channel + message_id

Steps (per each channel):
- identify last_id (channels.csv)
- download from TG as per last_id
- process messages: cleaning, deduplicating, summary
- create embeds from openai
- transform into pinecone format
- upsert into pinecone
- update last_id in channels.csv
- create session_stats file
- update total_stats file

In [3]:
# channels.csv is ';'-separated; it is the source of channel_name, last_id and stance used below
df_channels = pd.read_csv('channels.csv', sep=';')
df_channels.head(3)

Unnamed: 0,link,channel_name,channel_code,last_id,ignore,media_type,website,content_type,audience size,stance
0,https://t.me/rt_russian,rt_russian,RT,176223,,tv,rt.com,propaganda,105.0,tv
1,https://t.me/ntvnews,ntvnews,NTV,122479,,tv,ntv.ru,propaganda,,tv
2,https://t.me/tvrussia1,tvrussia1,ROS1,24043,,tv,,propaganda,,tv


In [4]:
def clean_text(text):
    """Return `text` (coerced with str()) stripped of emojis, URLs, marks and legal boilerplate.

    Steps, applied in this order:
      1. remove runs of characters from four emoji code-point ranges;
      2. remove every token starting with "http" or "www" up to the next whitespace;
      3. drop all characters whose Unicode category is 'Mn' (non-spacing marks,
         which is how leftover variation selectors are classified);
      4. remove the upper-case "foreign agent" disclaimer line and one hard-coded name.
    """
    emoji_ranges = (
        "\U0001F600-\U0001F64F",  # Emoticons
        "\U0001F300-\U0001F5FF",  # Symbols & Pictographs
        "\U0001F680-\U0001F6FF",  # Transport & Map Symbols
        "\U0001F1E0-\U0001F1FF",  # Flags (iOS)
    )
    cleaned = re.sub("[" + "".join(emoji_ranges) + "]+", '', str(text), flags=re.UNICODE)

    # URLs
    cleaned = re.sub(r"http\S+|www\S+", '', cleaned)

    # Non-spacing marks
    cleaned = ''.join(filter(lambda ch: unicodedata.category(ch) != 'Mn', cleaned))

    # Foreign-agent disclaimer, then the hard-coded name
    cleaned = re.sub(r'[А-ЯЁ18+]{3,}\s[А-ЯЁ()]{5,}[^\n]*ИНОСТРАННОГО АГЕНТА', '', cleaned)
    return cleaned.replace('ПИВОВАРОВА АЛЕКСЕЯ ВЛАДИМИРОВИЧА', '')

In [5]:
# Extractive summary of a news text
def summarize(text, language="russian", sentences_count=2):
    """Return `sentences_count` sentences picked by sumy's LSA summarizer, joined by spaces.

    `language` selects the sumy tokenizer used to parse `text`.
    """
    document = PlaintextParser.from_string(text, Tokenizer(language)).document
    picked_sentences = LsaSummarizer()(document, sentences_count)
    return ' '.join(map(str, picked_sentences))

# TODO: need a more flexible model
# TODO: summaries should stay within 750 characters

In [7]:
def process_new_messages(df, channel, stance):
    """Clean, filter and summarize raw messages of one channel.

    Parameters
    ----------
    df : DataFrame with at least the columns 'id' and 'message'.
    channel, stance : values written to the 'channel' and 'stance' columns of every row.

    Returns
    -------
    A new DataFrame (the input is not modified) with duplicates by 'id' removed,
    messages whose cleaned text is 30 characters or shorter dropped, and the
    columns 'cleaned_message', 'summary', 'channel' and 'stance' added.
    """
    df = df.drop_duplicates(subset=['id']).copy()
    # remove emojis, urls, foreign agent text
    df['cleaned_message'] = df['message'].apply(clean_text)
    # drop empty / too short messages; .copy() so the assignments below go to an
    # independent frame instead of a slice (avoids SettingWithCopyWarning)
    df = df[~df['cleaned_message'].str.len().between(0, 30)].copy()

    def _summary_for(text):
        # 3 sentences above 750 characters, 2 sentences above 500, otherwise keep the text as is
        if len(text) > 750:
            return summarize(text, sentences_count=3)
        if len(text) > 500:
            return summarize(text, sentences_count=2)
        return text

    df['summary'] = df['cleaned_message'].apply(_summary_for)
    # add channel name & stance
    df['channel'] = channel
    df['stance'] = stance
    return df

In [13]:
# Pick one channel row and unpack the three fields used below
i = 0
channel, last_id, stance = df_channels.iloc[i][['channel_name', 'last_id', 'stance']]
# minimum date for TelegramClient
start_date = datetime.datetime(2023, 10, 1)

# function to get new messages from channel
async def get_new_messages(channel, last_id, start_date=None, channel_stance=None):
    """Download the messages of `channel` newer than `last_id` and process them.

    Parameters
    ----------
    channel : channel name passed to TelegramClient.iter_messages.
    last_id : id of the last message already stored; anything that cannot be
        converted to int (None, NaN, text) means "no stored id" and becomes 0.
    start_date : optional datetime passed as `offset_date`; None applies no date bound.
    channel_stance : stance label for the processed rows; when None the
        notebook-level variable `stance` is used, as before.

    Returns
    -------
    The DataFrame returned by process_new_messages, or None when there are no new messages.
    """
    # check if last_id is integer (=set)
    try:
        offset_id = int(last_id)
    except (TypeError, ValueError, OverflowError):
        offset_id = 0

    # COLLECT NEW MESSAGES (reverse=True iterates from oldest to newest)
    async with TelegramClient('session', api_id, api_hash) as client:
        data = [
            message.to_dict()
            async for message in client.iter_messages(
                channel, reverse=True, offset_id=offset_id, offset_date=start_date
            )
        ]

    print(f"Channel: {channel}, N of new messages: {len(data)}")
    # if no new messages, skip
    if len(data) == 0:
        return None

    # create df from collected data and process it
    df = pd.DataFrame(data)
    return process_new_messages(
        df, channel, stance if channel_stance is None else channel_stance
    )

In [27]:
openai.api_key = openai_key

# function for openai embeddings
def get_embeddings(text, model="text-embedding-ada-002"):
    """Request an embedding for `text` from the OpenAI API and return it.

    Returns the 'embedding' of the first entry in the response's 'data' list.
    The log line also reads the notebook-level variable `channel`.
    """
    result = openai.Embedding.create(model=model, input=text)
    vector = result['data'][0]['embedding']
    print(f"Embeddings shape: {len(vector)} for channel: {channel}")
    return vector

In [23]:
i = 0
channel = df_channels.iloc[i]['channel_name']    
last_id = df_channels.iloc[i]['last_id']
stance = df_channels.iloc[i]['stance']
df = await get_new_messages(channel, last_id)
df['embeddings'] = df['summary'].apply(get_embeddings)

Channel: rt_russian, N of new messages: 707


In [None]:
# convert date to integer (without time)
# Order matters: first truncate the timestamps to calendar dates (needs the datetime
# accessor), then turn each date into integer epoch seconds.
df['date'] = df['date'].dt.date
df['date'] = df['date'].apply(lambda x: int(time.mktime(x.timetuple())))

In [40]:
# Scratch version of the batching logic: it only builds the batches and never calls
# index.upsert. The variables of the last iteration stay in the notebook namespace.
index_name='tg-news'
batch_size=10

for i in tqdm_notebook(range(0, df.shape[0], batch_size)):
    # set end position of batch
    i_end = min(i+batch_size, df.shape[0])
    # get batch of IDs, embeds and metadata
    ids_batch = df['id'][i: i_end]
    embeds_batch = df['embeddings'][i: i_end]
    # prep metadata: 'channel', 'stance', 'date', 'message', 'views', 'cleaned_message', 'summary'
    meta_cols = ['channel', 'stance', 'date', 'message', 'views', 'cleaned_message', 'summary']
    meta_batch = df[meta_cols][i: i_end].to_dict(orient='records')
    
    # (id, embedding, metadata) triples; zip is lazy, nothing is sent here
    to_upsert = zip(ids_batch, embeds_batch, meta_batch)

  0%|          | 0/61 [00:00<?, ?it/s]

In [49]:
# Show the metadata records of the last batch built by the loop above
pd.DataFrame(meta_batch)

Unnamed: 0,channel,stance,date,message,views,cleaned_message,summary
0,rt_russian,tv,2023-10-22 15:15:36+00:00,"Три украинские ракеты, летевшие в сторону Крым...",68235,"Три украинские ракеты, летевшие в сторону Крым...","Три украинские ракеты, летевшие в сторону Крым..."
1,rt_russian,tv,2023-10-22 15:22:27+00:00,Волонтёры спасли более 100 собак от ужасных ус...,68211,Волонтёры спасли более 100 собак от ужасных ус...,"Об «Островке надежды», который теперь называет..."
2,rt_russian,tv,2023-10-22 15:26:18+00:00,"ЦАХАЛ заявил, что израильский танк случайно об...",70475,"ЦАХАЛ заявил, что израильский танк случайно об...","ЦАХАЛ заявил, что израильский танк случайно об..."
3,rt_russian,tv,2023-10-22 15:40:01+00:00,"«Помогите нам, дайте нам попасть в Египет. Чег...",62104,"«Помогите нам, дайте нам попасть в Египет. Чег...",Ни одному жителю сектора Газа с иностранным па...
4,rt_russian,tv,2023-10-22 16:00:33+00:00,Госдеп США поручил части американского дипперс...,48859,Госдеп США поручил части американского дипперс...,Госдеп США поручил части американского дипперс...
5,rt_russian,tv,2023-10-22 16:09:05+00:00,"Илон Маск заявил, что даст $1 млрд Википедии, ...",42780,"Илон Маск заявил, что даст $1 млрд Википедии, ...","Илон Маск заявил, что даст $1 млрд Википедии, ..."
6,rt_russian,tv,2023-10-22 16:29:58+00:00,В Москве задержали 31-летнего мужчину по делу ...,16274,В Москве задержали 31-летнего мужчину по делу ...,В Москве задержали 31-летнего мужчину по делу ...
7,rt_russian,tv,2023-10-22 16:34:53+00:00,Египетские пограничники получили легкие ранени...,8139,Египетские пограничники получили легкие ранени...,Египетские пограничники получили легкие ранени...


In [38]:
pinecone.init(api_key=pine_key, environment=pine_env)

def upsert_embeddings_to_pinecone(df, index_name='tg-news', batch_size=10):
    """Upsert every row of `df` into the pinecone index `index_name`, `batch_size` rows per request.

    `df` must contain the columns 'id', 'embeddings' and the metadata columns
    listed in `meta_cols`. Each row is sent as an (id, embedding, metadata) tuple.
    An empty `df` sends nothing.

    NOTE(review): metadata values presumably have to be plain strings/numbers;
    confirm that 'date' has been converted to int before calling this.
    """
    # connect to Pinecone index
    index = pinecone.Index(index_name)
    meta_cols = ['channel', 'stance', 'date', 'message', 'views', 'cleaned_message', 'summary']

    # upsert embeddings in batches
    for start in tqdm_notebook(range(0, df.shape[0], batch_size)):
        batch = df.iloc[start:start + batch_size]
        vectors = list(zip(
            batch['id'],
            batch['embeddings'],
            batch[meta_cols].to_dict(orient='records'),
        ))
        # the upsert has to happen inside the loop, once per batch
        index.upsert(vectors=vectors)
    print(f"Upserted {df.shape[0]} embeddings to Pinecone index {index_name}")

Functions still needed:
1) get & process messages (clean, get summary)
2) get openai embeds
3) upsert

In [17]:
i = 0
channel = df_channels.iloc[i]['channel_name']    
last_id = df_channels.iloc[i]['last_id']
stance = df_channels.iloc[i]['stance']
df = get_messages(channel, last_id)

In [None]:
# Clean, filter and summarize the fetched messages.
# NOTE(review): get_new_messages already calls process_new_messages - confirm whether this second pass is intended.
df = process_new_messages(df, channel, stance)

In [None]:
# iterate over channels.csv and get last message_id for each channel
df_channels = pd.read_csv('channels.csv')
for i in tqdm_notebook(range(df_channels.shape[0])):
    channel = df_channels.iloc[i]['channel']    
    last_id = df_channels.iloc[i]['last_id']
    stance = df_channels.iloc[i]['stance']
    df = get_messages(channel, last_id)