# 2 - Updating pinecone

In [1]:
import pandas as pd
import json
import re
import unicodedata
import time
import datetime

import openai
import pinecone
from telethon import TelegramClient
from tqdm.notebook import tqdm_notebook
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

# Directory that holds api_keys.json (Telegram, OpenAI and Pinecone credentials).
keys_path = '../keys/'
# Directory that holds the per-channel message archives (<channel>.pkl).
data_path = '../../TG_messages/'

  from tqdm.autonotebook import tqdm


In [2]:
# Read every credential from a single JSON file kept next to the notebooks.
with open(keys_path + 'api_keys.json') as f:
    data = json.load(f)

# Telegram credentials
api_id, api_hash, phone = data['api_id'], data['api_hash'], data['phone']

# OpenAI credentials
openai_key = data['openai_key']

# Pinecone credentials
pine_key, pine_env = data['pine_key'], data['pine_env']

Questions
1) How to identify which data to download:
- by date
- by id
Either way, the last date or id has to be stored between runs, so let's keep `last_id`.

2) Remove duplicates in Pinecone
- there should not be any, since the vector id is exactly channel + message_id

Steps (for each channel):
- identify last_id (channels.csv)
- download new messages from TG, starting after last_id
- process messages: cleaning, deduplicating, summarizing
- create embeddings with OpenAI
- convert the date to int
- transform into Pinecone format
- upsert into Pinecone
- append to the main files (pkl) - optional
- iterate over channels
- update last_id in channels.csv
- create session_stats file
- update total_stats file

In [3]:
def clean_text(text):
    """Strip emojis, URLs, combining marks and "foreign agent" disclaimers.

    Parameters
    ----------
    text : str or None
        Raw message text. A missing value (``None`` or a float NaN, which is
        how pandas represents an absent message) is treated as an empty
        message. Any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned text. Whitespace left behind by removed fragments is not
        collapsed.
    """
    # A missing message must not become the literal text 'None' / 'nan'.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    # Unicode ranges for emojis
    emoji_pattern = re.compile("["
                               "\U0001F600-\U0001F64F"  # Emoticons
                               "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
                               "\U0001F680-\U0001F6FF"  # Transport & Map Symbols
                               "\U0001F1E0-\U0001F1FF"  # Flags (iOS)
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    # Remove URLs: anything starting with "http" or "www" up to the next whitespace.
    url_pattern = re.compile(r"http\S+|www\S+")
    text = url_pattern.sub(r'', text)

    # Drop non-spacing marks (Unicode category Mn). This covers the emoji
    # variation selectors left over after emoji removal; note that it also
    # removes combining accents such as stress marks.
    text = ''.join(char for char in text if unicodedata.category(char) != 'Mn')

    # Remove the upper-case "foreign agent" disclaimer: from its first
    # capitalised words up to 'ИНОСТРАННОГО АГЕНТА' on the same line.
    pattern = re.compile(r'[А-ЯЁ18+]{3,}\s[А-ЯЁ()]{5,}[^\n]*ИНОСТРАННОГО АГЕНТА')
    text = pattern.sub('', text)
    # The disclaimer is sometimes followed by this name; remove it as well.
    name1 = 'ПИВОВАРОВА АЛЕКСЕЯ ВЛАДИМИРОВИЧА'
    text = text.replace(name1, '')

    return text

In [4]:
# summarize the news (select 2 most important sentences)
def summarize(text, language="russian", sentences_count=2):
    """Return an extractive LSA summary of ``text`` as a single string.

    The text is parsed with a tokenizer for ``language``; the sentences picked
    by sumy's ``LsaSummarizer`` (``sentences_count`` requested) are joined
    with single spaces.
    """
    document = PlaintextParser.from_string(text, Tokenizer(language)).document
    picked_sentences = LsaSummarizer()(document, sentences_count)
    return ' '.join(map(str, picked_sentences))

# TODO: need a more flexible summarization model.
# The summary should be kept to at most 750 characters.

In [5]:
def process_new_messages(df, channel, stance):
    """Clean, filter and summarize freshly downloaded messages of one channel.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw messages; must contain the columns ``id`` and ``message``.
    channel
        Channel name, stored in the ``channel`` column of every row.
    stance
        Stance label, stored in the ``stance`` column of every row.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with duplicate ids removed,
        messages of 30 characters or fewer after cleaning dropped, and the
        columns ``cleaned_message``, ``summary``, ``channel`` and ``stance``
        added.
    """
    def _summary_for(message):
        # > 750 chars -> 3 sentences, 501-750 chars -> 2 sentences,
        # anything shorter is kept unchanged.
        if len(message) > 750:
            return summarize(message, sentences_count=3)
        if len(message) > 500:
            return summarize(message, sentences_count=2)
        return message

    # Work on a copy so the caller's frame is never modified.
    df = df.drop_duplicates(subset=['id']).copy()
    # Remove emojis, URLs and foreign-agent disclaimers.
    df['cleaned_message'] = df['message'].apply(clean_text)
    # Drop empty or too short messages. The explicit .copy() makes the result
    # an independent frame, so the column assignments below do not write to a
    # slice (which would raise SettingWithCopyWarning).
    df = df[~df['cleaned_message'].str.len().between(0, 30)].copy()
    df['summary'] = df['cleaned_message'].apply(_summary_for)
    # Add channel name & stance.
    df['channel'] = channel
    df['stance'] = stance
    return df

In [6]:
#function to get new messages from channel
async def get_new_messages(channel, last_id, stance):
    """Download the messages of ``channel`` newer than ``last_id`` and process them.

    Parameters
    ----------
    channel
        Channel identifier passed to ``TelegramClient.iter_messages``.
    last_id
        Id of the last message already processed. Anything that cannot be
        converted to ``int`` (``None``, NaN, text) means "no checkpoint yet"
        and the download starts from offset 0.
    stance
        Stance label forwarded to ``process_new_messages``.

    Returns
    -------
    pandas.DataFrame or None
        The processed messages, or ``None`` when nothing new was downloaded.
    """
    # Only conversion failures fall back to 0; a bare ``except`` would also
    # hide unrelated errors such as KeyboardInterrupt.
    try:
        offset_id = int(last_id)
    except (TypeError, ValueError, OverflowError):
        offset_id = 0

    # COLLECT NEW MESSAGES
    async with TelegramClient('session', api_id, api_hash) as client:
        # reverse=True: iterate from older to newer messages, starting at offset_id.
        data = [
            message.to_dict()
            async for message in client.iter_messages(channel, reverse=True, offset_id=offset_id)
        ]
    print(f"Channel: {channel}, N of new messages: {len(data)}")

    # Nothing new for this channel: let the caller skip it.
    if not data:
        return None

    # Build a frame from the collected dicts and clean/summarize it.
    return process_new_messages(pd.DataFrame(data), channel, stance)

In [7]:
# function for openai embeddings
def get_embeddings(text, model="text-embedding-ada-002"):
    """Request an OpenAI embedding for ``text`` and return the first result's vector."""
    api_reply = openai.Embedding.create(model=model, input=text)
    first_item = api_reply['data'][0]
    return first_item['embedding']

In [8]:
def upsert_to_pinecone(df, index, batch_size=100):
    """Convert processed messages to Pinecone records and upsert them in batches.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``id``, ``channel``, ``date`` (datetime-like),
        ``values`` (embedding vector), ``cleaned_message``, ``summary``,
        ``stance`` and ``views``. The frame is NOT modified: callers keep
        their datetime ``date`` and original ``id`` columns.
    index
        Object exposing ``upsert(vectors=[...])``.
    batch_size : int
        Maximum number of records sent per ``upsert`` call.

    Returns
    -------
    None
    """
    # Nothing to send; also avoids indexing the last row of an empty frame.
    if df.empty:
        print("Upserted 0 records.")
        return

    # Work on a copy: the conversions below must not leak into the caller's
    # frame, which is still needed with its original date/id columns.
    records = df.copy()
    # Convert date to an integer Unix timestamp (seconds).
    # NOTE(review): time.mktime interprets the time tuple as local time, so
    # the value depends on the machine's timezone; kept as-is so new vectors
    # stay consistent with those already in the index.
    records['date'] = records['date'].apply(lambda x: int(time.mktime(x.timetuple())))
    # Vector id is channel + '_' + message id.
    records['id'] = records['channel'] + '_' + records['id'].astype(str)
    # Convert to pinecone format: id, values, metadata.
    records['metadata'] = records[
        ['cleaned_message', 'summary', 'stance', 'channel', 'date', 'views']
    ].to_dict('records')
    records = records[['id', 'values', 'metadata']]

    for start in range(0, records.shape[0], batch_size):
        index.upsert(vectors=records.iloc[start:start + batch_size].to_dict('records'))
    print(f"Upserted {records.shape[0]} records. Last id: {records.iloc[-1]['id']}")

In [None]:
# init openai
openai.api_key = openai_key
# initialize pinecone
pinecone.init(api_key=pine_key, environment=pine_env)
index_name='tg-news'
index = pinecone.Index(index_name)
# create session_stats
df_channel_stats = pd.DataFrame() # store N of posts per channel per day
session_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") # to name session stats file

# ITERATE OVER CHANNELS (df_channels) TO UPDATE PINCONE INDEX
df_channels = pd.read_csv('/Users/alexbadin/Library/CloudStorage/GoogleDrive-alex.badin@gmail.com/My Drive/Colab Notebooks/Narratives/notebooks_data_ingest/channels.csv', sep = ';')
for i, channel, last_id, stance in tqdm_notebook(df_channels[['channel_name', 'last_id', 'stance']].itertuples(), total=df_channels.shape[0]):
    # get & clean new messages
    df = await get_new_messages(channel, last_id, stance)
    if df is None:
        continue
    # get embeddings
    df.loc[:, 'values'] = df['cleaned_message'].apply(get_embeddings)
    print(f"Embeddings for {df.shape[0]} messages collected.")
    # upsert to pinecone
    upsert_to_pinecone(df, index)

    # save session stats for channel
    df_channel_stats[channel] = df['date'].dt.date.value_counts()
    df_channel_stats.to_csv(f'../session_stats/channel_stats_{session_time}.csv', sep=';', index=True)

    # update last_id in df_channels
    df_channels.loc[i, 'last_id'] = df['id'].max()
    df_channels.to_csv('channels.csv', index=False, sep=';')
    # save df to pickle
    df_old = pd.read_pickle(data_path + channel + '.pkl')
    df_new = pd.concat([df_old, df])
    df_new.to_pickle(data_path + channel + '.pkl')

In [None]:
# Inspect the index after the update (dimension, fullness and vector counts).
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.41813,
 'namespaces': {'': {'vector_count': 41813}},
 'total_vector_count': 41813}