# 2 - Updating pinecone

In [1]:
import pandas as pd
import json
import re
import unicodedata
import time
import datetime
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook

import openai
from openai.embeddings_utils import get_embedding
import pinecone
from telethon import TelegramClient
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

keys_path = '../keys/'
data_path = '../../TG_messages/'

  from tqdm.autonotebook import tqdm


In [2]:
# set to True if you want to save the pickle file (unreliable, probably due to different pandas versions, better to save to csv)
save_pickle = False

with open(keys_path+'api_keys.json') as f:
  data = json.loads(f.read())

# load TG credentials
api_id = data['api_id'] 
api_hash = data['api_hash']
phone = data['phone']

#load openai credentials
openai_key = data['openai_key']

# load pinecone credentials
pine_key = data['pine_key']
pine_env = data['pine_env']

Questions
1) Identify which data to download:
- by date
- by id
Anyway need to store last date or id. So let's keep it last_id.

2) Remove duplicates in pinecone 
- they should not be there as id is exactly channel + message_id

Steps (per each channel):
- identify last_id (channels.csv)
- download from TG as per last_id
- process messages: cleaning, deduplicating, summary
- create embeds from openai
- date format into int
- transform into pinecone format
- upsert into pinecone
- add into main files (pkl) - optional
- iterate over channels
- update last_id in channels.csv
- create session_stats file
- update total_stats file

## Functions

In [3]:
def clean_text(text):
    # Unicode range for emojis
    emoji_pattern = re.compile("["
                               "\U0001F600-\U0001F64F"  # Emoticons
                               "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
                               "\U0001F680-\U0001F6FF"  # Transport & Map Symbols
                               "\U0001F1E0-\U0001F1FF"  # Flags (iOS)
                               "]+", flags=re.UNICODE)
    
    # Remove emojis
    text = emoji_pattern.sub(r'', str(text))
    # Regular expression for URLs
    url_pattern = re.compile(r"http\S+|www\S+")
    # Remove URLs
    text = url_pattern.sub(r'', str(text))
    # remove /n
    text = text.replace('\n', '')
    # Remove any remaining variation selectors
    text = ''.join(char for char in text if unicodedata.category(char) != 'Mn')

    #Remove Foreign Agent text    
    pattern = re.compile(r'[А-ЯЁ18+]{3,}\s[А-ЯЁ()]{5,}[^\n]*ИНОСТРАННОГО АГЕНТА')
    text = pattern.sub('', text)
    name1 = 'ПИВОВАРОВА АЛЕКСЕЯ ВЛАДИМИРОВИЧА'
    text = text.replace(name1, '')

    return text

In [4]:
# summarize the news (select 2 most important sentences)
def summarize(text, language="russian", sentences_count=2):
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, sentences_count)
    return ' '.join([str(sentence) for sentence in summary])

# NEED MORE FLEXIBLE MODEL
# summarize the news - need to keep length upto 750 characters

In [5]:
def process_new_messages(df, channel, stance):
    # add channel name & stance
    df.loc[:, 'channel'] = channel
    df.loc[:, 'stance'] = stance
    df.drop_duplicates(subset=['id'], inplace = True) # remove duplicates
    df.loc[:, 'cleaned_message'] = df['message'].apply(clean_text) #remove emojis, urls, foreign agent text
    df = df[~df.cleaned_message.str.len().between(0, 30)] #remove empty or too short messages
    # summarize cleaned_messages: 2 sentences if length > 750, 3 sentences if length > 1500
    df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)
    return df

In [6]:
#function to get new messages from channel
start_date = datetime.datetime(2023, 10, 1) # minimum date for TelegramClient

async def get_new_messages(channel, last_id, stance, start_date):
    async with TelegramClient('session', api_id, api_hash) as client:
        # COLLECT NEW MESSAGES
        data = [] # for collecting new messages
        # check if last_id is integer (=set)
        try:
            offset_id = int(last_id)
        except:
            offset_id = 0
        async for message in client.iter_messages(channel, reverse=True, offset_id=offset_id, offset_date=start_date):
            data.append(message.to_dict())
        # if no new messages, skip
    print(f"Channel: {channel}, N of new messages: {len(data)}")
    if len(data) == 0:
        return None
    # create df from collected data
    df = pd.DataFrame(data)
    # return df
    return df

In [7]:
# function for openai embeddings
def get_embeddings(df, text_col='summary', model="text-embedding-ada-002"):
    embeddings = []
    for text in df[text_col]:
        try:
            response = openai.Embedding.create(
                input=text,
                model=model
            )
            embeddings.append(response['data'][0]['embedding'])
        except openai.error.APIError as e:
            if e.status_code == 502:
                print("Bad gateway error, retrying...")
                time.sleep(5)
                response = openai.Embedding.create(
                    input=text,
                    model=model
                )
                embeddings.append(response['data'][0]['embedding'])
            else:
                raise e
    df['embeddings'] = embeddings
    print(f"Embeddings for {df.shape[0]} messages collected.")
    return df


In [8]:
def upsert_to_pinecone(df, index, batch_size=100):
    # create df for pinecone
    meta_col = ['cleaned_message', 'summary', 'stance', 'channel', 'date', 'views']
    #rename embeddings to values
    df4pinecone = df[meta_col+['id', 'embeddings']].copy()
    df4pinecone = df4pinecone.rename(columns={'embeddings': 'values'})
    # convert date to integer (as pinecone doesn't support datetime)
    df4pinecone['date'] = df4pinecone['date'].apply(lambda x: int(time.mktime(x.timetuple())))
    # id as channel_id + message_id (to avoid duplication and easier identification)
    df4pinecone['id'] = df4pinecone['channel'] + '_' + df4pinecone['id'].astype(str)
    # convert to pinecone format
    df4pinecone['metadata'] = df4pinecone[meta_col].to_dict('records')
    df4pinecone = df4pinecone[['id', 'values', 'metadata']]
    bath_size = batch_size
    for i in range(0, df4pinecone.shape[0], bath_size):
        index.upsert(vectors=df4pinecone.iloc[i:i+bath_size].to_dict('records'))
    print(f"Upserted {df4pinecone.shape[0]} records. Last id: {df4pinecone.iloc[-1]['id']}")

## Updating channels

In [9]:
# function to save new df to pickle
def save_to_pickle(df, channel):
    try:
        df_old = pd.read_pickle(data_path + channel + '.pkl')
    except:
        df_old = pd.DataFrame()
    if len(df) == 0:
        return
    pd.concat([df_old, df]).to_pickle(data_path + channel + '.pkl')

In [11]:
# init openai
openai.api_key = openai_key
# initialize pinecone
pinecone.init(api_key=pine_key, environment=pine_env)
index_name='tg-news'
pine_index = pinecone.Index(index_name)
# create session_stats
df_channel_stats = pd.DataFrame() # store N of posts per channel per day
session_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") # to name session stats file

# ITERATE OVER CHANNELS (df_channels) TO UPDATE PINCONE INDEX
df_channels = pd.read_csv('channels.csv', sep = ';')
for i, channel, last_id, stance in tqdm_notebook(df_channels[['channel_name', 'last_id', 'stance']].itertuples(), total=df_channels.shape[0]):
    # get & clean new messages
    df = await get_new_messages(channel, last_id, stance, start_date=start_date)
    if df is None:
        continue
    # clean, summarize, add channel name & stance
    df = process_new_messages(df, channel, stance)
    # get embeddings with progress bar
    df = get_embeddings(df, text_col='summary', model="text-embedding-ada-002")
    # upsert to pinecone
    upsert_to_pinecone(df, pine_index)

    # save session stats for channel
    df_channel_stats[channel] = df['date'].dt.date.value_counts()
    df_channel_stats.to_csv(f'../session_stats/channel_stats_{session_time}.csv', sep=';', index=True)

    # update last_id in df_channels
    df_channels.loc[i, 'last_id'] = df['id'].max()
    df_channels.to_csv('channels.csv', index=False, sep=';')
    # save new messages to pickle (strange errors with pickle df, probably due to different pd versions)
    if save_pickle == True:
        save_to_pickle(df, channel)

  0%|          | 0/48 [00:00<?, ?it/s]

Channel: rt_russian, N of new messages: 45


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 36 messages collected.
Upserted 36 records. Last id: rt_russian_177165
Channel: ntvnews, N of new messages: 20


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 18 messages collected.
Upserted 18 records. Last id: ntvnews_122891
Channel: tvrussia1, N of new messages: 16


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 8 messages collected.
Upserted 8 records. Last id: tvrussia1_24335
Channel: bbcrussian, N of new messages: 21
Embeddings for 21 messages collected.
Upserted 21 records. Last id: bbcrussian_54879
Channel: news_1tv, N of new messages: 12
Embeddings for 12 messages collected.
Upserted 12 records. Last id: news_1tv_25006
Channel: redakciya_channel, N of new messages: 6
Embeddings for 6 messages collected.
Upserted 6 records. Last id: redakciya_channel_26301
Channel: meduzalive, N of new messages: 21


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 15 messages collected.
Upserted 15 records. Last id: meduzalive_93944
Channel: mediazzzona, N of new messages: 17


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 6 messages collected.
Upserted 6 records. Last id: mediazzzona_13356
Channel: thebell_io, N of new messages: 5
Embeddings for 5 messages collected.
Upserted 5 records. Last id: thebell_io_24899
Channel: rian_ru, N of new messages: 53


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 48 messages collected.
Upserted 48 records. Last id: rian_ru_219256
Channel: readovkanews, N of new messages: 15
Embeddings for 15 messages collected.
Upserted 15 records. Last id: readovkanews_68291
Channel: novaya_pishet, N of new messages: 5
Embeddings for 5 messages collected.
Upserted 5 records. Last id: novaya_pishet_42325
Channel: rbc_news, N of new messages: 15
Embeddings for 15 messages collected.
Upserted 15 records. Last id: rbc_news_83323
Channel: zvezdanews, N of new messages: 34


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 26 messages collected.
Upserted 26 records. Last id: zvezdanews_132064
Channel: aifonline, N of new messages: 52


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 40 messages collected.
Upserted 40 records. Last id: aifonline_66691
Channel: BFMnews, N of new messages: 4
Embeddings for 4 messages collected.
Upserted 4 records. Last id: BFMnews_36109
Channel: fontankaspb, N of new messages: 16


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 8 messages collected.
Upserted 8 records. Last id: fontankaspb_47166
Channel: forbesrussia, N of new messages: 19
Embeddings for 19 messages collected.
Upserted 19 records. Last id: forbesrussia_58209
Channel: gazetaru, N of new messages: 13


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 12 messages collected.
Upserted 12 records. Last id: gazetaru_24140
Channel: interfaxonline, N of new messages: 17


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 14 messages collected.
Upserted 14 records. Last id: interfaxonline_37475
Channel: izvestia, N of new messages: 60


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 57 messages collected.
Upserted 57 records. Last id: izvestia_148560
Channel: kommersant, N of new messages: 14
Embeddings for 14 messages collected.
Upserted 14 records. Last id: kommersant_57706
Channel: lentadnya, N of new messages: 34


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 20 messages collected.
Upserted 20 records. Last id: lentadnya_94658
Channel: lifenews, N of new messages: 7


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 6 messages collected.
Upserted 6 records. Last id: lifenews_99527
Channel: mk_ru, N of new messages: 37


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 36 messages collected.
Upserted 36 records. Last id: mk_ru_44972
Channel: novaya_europe, N of new messages: 20


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 14 messages collected.
Upserted 14 records. Last id: novaya_europe_25457
Channel: radiosvoboda, N of new messages: 22
Embeddings for 22 messages collected.
Upserted 22 records. Last id: radiosvoboda_49694
Channel: rentv_news, N of new messages: 51


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 50 messages collected.
Upserted 50 records. Last id: rentv_news_114976
Channel: rgrunews, N of new messages: 16
Embeddings for 16 messages collected.
Upserted 16 records. Last id: rgrunews_89215
Channel: riafan, N of new messages: 0
Channel: rusvesnasu, N of new messages: 5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 2 messages collected.
Upserted 2 records. Last id: rusvesnasu_28147
Channel: svpressaru, N of new messages: 8
Embeddings for 8 messages collected.
Upserted 8 records. Last id: svpressaru_20911
Channel: tass_agency, N of new messages: 1038


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 994 messages collected.
Upserted 994 records. Last id: tass_agency_215780
Channel: truekpru, N of new messages: 657


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 456 messages collected.
Upserted 456 records. Last id: truekpru_134239
Channel: uranews, N of new messages: 351


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 321 messages collected.
Upserted 321 records. Last id: uranews_83171
Channel: vedomosti, N of new messages: 339


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 338 messages collected.
Upserted 338 records. Last id: vedomosti_38934
Channel: vestiru24, N of new messages: 836


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 801 messages collected.
Upserted 801 records. Last id: vestiru24_93748
Channel: tsargradtv, N of new messages: 746


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 630 messages collected.
Upserted 630 records. Last id: tsargradtv_60718
Channel: sashakots, N of new messages: 109


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 83 messages collected.
Upserted 83 records. Last id: sashakots_42934
Channel: rybar, N of new messages: 247


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 212 messages collected.
Upserted 212 records. Last id: rybar_53577
Channel: voenacher, N of new messages: 449


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 308 messages collected.
Upserted 308 records. Last id: voenacher_54999
Channel: vysokygovorit, N of new messages: 61


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 56 messages collected.
Upserted 56 records. Last id: vysokygovorit_13239
Channel: SolovievLive, N of new messages: 1286


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 1097 messages collected.
Upserted 1097 records. Last id: SolovievLive_216972
Channel: margaritasimonyan, N of new messages: 15


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 10 messages collected.
Upserted 10 records. Last id: margaritasimonyan_13339
Channel: breakingmash, N of new messages: 184


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 146 messages collected.
Upserted 146 records. Last id: breakingmash_48846
Channel: tvc_ru, N of new messages: 294


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 278 messages collected.
Upserted 278 records. Last id: tvc_ru_59693
Channel: strelkovii, N of new messages: 26


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'summary'] = df['cleaned_message'].apply(lambda x: summarize(x, sentences_count=3) if len(x) > 750 else summarize(x, sentences_count=2) if len(x) > 500 else x)


Embeddings for 15 messages collected.
Upserted 15 records. Last id: strelkovii_6420
Channel: concordgroup_official, N of new messages: 0


In [14]:
pine_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.57829,
 'namespaces': {'': {'vector_count': 57829}},
 'total_vector_count': 57829}

In [None]:
# remove old messages from pinecone (to keep size < 100K)
remove_date = datetime.datetime.now() - datetime.timedelta(days=30)
remove_date = int(time.mktime(remove_date.timetuple())) # date into integer
remove_filter = {'date': {'$lt': remove_date}}
pine_index.delete(
    filter=remove_filter
    )

# does not work due to cpg-starter limitations. 2 options:
# 1. create new index on proper cluster and delete old one
# 2. delete messages by index