In [60]:
from telethon.sync import TelegramClient
import asyncio
import pandas as pd
import nest_asyncio
from telethon.tl.types import MessageMediaPoll
from telethon.tl.types import MessageReactions
import re
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm 
nest_asyncio.apply()

In [62]:
# Maximum number of channel messages to download.
limit = 4000


def _reaction_label(reaction):
    """Return a dictionary key describing one reaction object.

    Emoji reactions expose an ``emoticon`` attribute, which is returned as-is.
    Other reaction kinds do not have that attribute, so reading it directly
    would raise ``AttributeError``; for those a fallback label is built.
    """
    # Presumably older Telethon layers deliver the reaction as a plain string.
    if isinstance(reaction, str):
        return reaction
    emoticon = getattr(reaction, 'emoticon', None)
    if emoticon is not None:
        return emoticon
    # Custom emoji are identified by a document id rather than a character.
    document_id = getattr(reaction, 'document_id', None)
    if document_id is not None:
        return f'custom_emoji:{document_id}'
    return type(reaction).__name__


async def get_messages():
    """Download up to ``limit`` messages from the 'TheStraitsTimes' channel.

    ``api_id`` and ``api_hash`` are read from the notebook namespace; they are
    not defined in the cells shown here and must be set beforehand.

    Returns:
        pandas.DataFrame with the columns ``message_id``, ``date``, ``text``
        and ``reactions`` (a list of single-entry ``{reaction: count}`` dicts).
        The columns are present even when no message was fetched.
    """
    message_data = []
    async with TelegramClient('anon', api_id, api_hash) as client:
        # The bar total is the requested limit; it will not reach 100% if the
        # channel yields fewer messages than that.
        progress_bar = tqdm(total=limit, desc='Fetching Messages', unit='message')
        try:
            async for message in client.iter_messages('TheStraitsTimes', limit=limit):
                # Collect the per-reaction counts, if the message has any
                reaction_data = []
                if message.reactions and isinstance(message.reactions, MessageReactions):
                    for reaction in message.reactions.results:
                        label = _reaction_label(reaction.reaction)
                        reaction_data.append({label: reaction.count})

                # Each message is a row of data
                message_data.append({
                    'message_id': message.id,
                    'date': message.date,
                    'text': message.text,
                    'reactions': reaction_data,
                })

                progress_bar.update()
        finally:
            # Close the bar even when the download is interrupted by an error
            progress_bar.close()

    # Explicit columns keep the frame usable by later cells when it is empty
    return pd.DataFrame(
        message_data,
        columns=['message_id', 'date', 'text', 'reactions'],
    )

# Running the async function using asyncio.run
# A fresh event loop is created and installed as the current loop before the
# coroutine is started.
# NOTE(review): nest_asyncio.apply() is called in the import cell; presumably
# that is what lets asyncio.run work inside the notebook and makes it use the
# loop installed here - confirm before restructuring these three statements.
new_loop = asyncio.new_event_loop()
asyncio.set_event_loop(new_loop)
# The resulting DataFrame is stored in the notebook-level name `df`.
df = asyncio.run(get_messages())

Fetching Messages: 100%|██████████| 4000/4000 [00:39<00:00, 100.47message/s]


In [63]:
# Remove messages with no text. A missing value (None/NaN) counts as "no text"
# just like the empty string; a plain `!= ''` comparison would keep such rows.
# Telethon documents Message.text as None for service messages.
has_text = df['text'].notna() & (df['text'] != '')
df = df[has_text]

In [64]:
# Drop the older messages that were posted before Straits Times allowed
# reactions; only rows dated after the cutoff are kept.
is_after_cutoff = df['date'] > '2022-07'
df = df.loc[is_after_cutoff]

In [65]:
# Capture everything from the first "http" to the end of that line as the
# message's link, then discard the messages in which no link was found.
url_captures = df['text'].str.extract('(http.*)')
df['url'] = url_captures[0]
df = df[~df['url'].isna()]

In [66]:
# Break every message into the list of its newline-separated segments.
text_segments = df['text'].str.split(pat='\n')
df['main_text'] = text_segments

In [67]:
# Keep only the messages made up of one to five newline-separated segments.
# The segment count is computed once and checked against the accepted values,
# instead of being recomputed for every comparison.
segment_counts = df['main_text'].apply(len)
df = df[segment_counts.isin([1, 2, 3, 4, 5])]

In [68]:
# Glue the non-empty segments of each message back together with single
# spaces and trim the surrounding whitespace of the result.
df['joined_text'] = (
    df['main_text']
    .apply(lambda segments: ' '.join(segment for segment in segments if segment))
    .str.strip()
)

In [69]:
# Preview the first remaining row. Positional access is used because the
# earlier filters keep the original index labels, so the label 0 is only
# present when the very first fetched message survived every filter.
df['joined_text'].iloc[0]

'China reappoints Wang Yi as foreign minister, after removing Qin Gang. Mr Qin had been conspicuously absent from official duties for a month. https://str.sg/iiQA'

In [70]:
def remove_urls(text):
    """Delete every link from ``text`` and trim the result.

    A link is any run of non-whitespace characters that starts with "http".
    Whitespace that surrounded a removed link inside the text is left as-is;
    only leading and trailing whitespace of the whole string is stripped.
    """
    link_pattern = re.compile(r'http\S+')
    without_links = link_pattern.sub('', text)
    return without_links.strip()

In [71]:
# Strip the links out of every joined message text.
df['joined_text'] = df['joined_text'].map(remove_urls)

In [82]:
# Display the frame after the text clean-up steps (notebook rich output).
df

Unnamed: 0,message_id,date,text,reactions,url,main_text,joined_text
0,11313,2023-07-25 11:46:02+00:00,"China reappoints Wang Yi as foreign minister, ...","[{'🤔': 271}, {'😱': 31}, {'👍': 15}, {'👎': 8}, {...",https://str.sg/iiQA,"[China reappoints Wang Yi as foreign minister,...","China reappoints Wang Yi as foreign minister, ..."
1,11312,2023-07-25 11:15:17+00:00,⚠️ Take note if you’re planning to cross into ...,"[{'👎': 140}, {'😱': 41}, {'👍': 36}, {'😢': 6}, {...",https://str.sg/iiQx,[⚠️ Take note if you’re planning to cross into...,⚠️ Take note if you’re planning to cross into ...
2,11311,2023-07-25 09:29:01+00:00,"A ""once in a blue moon"" occasion: There'll be ...","[{'👍': 241}, {'😱': 24}, {'👎': 11}, {'🤔': 9}, {...",https://str.sg/iiMC,"[A ""once in a blue moon"" occasion: There'll be...","A ""once in a blue moon"" occasion: There'll be ..."
3,11310,2023-07-25 08:01:57+00:00,Is the heat getting to you? While air-conditio...,"[{'👍': 159}, {'👎': 46}, {'😱': 21}, {'😢': 11}, ...",https://str.sg/iigC,[Is the heat getting to you? While air-conditi...,Is the heat getting to you? While air-conditio...
4,11309,2023-07-25 07:33:37+00:00,There are now 5 empty seats in Parliament. Do ...,"[{'👎': 288}, {'🤔': 94}, {'😱': 25}, {'👍': 23}, ...",https://str.sg/iiMX,[There are now 5 empty seats in Parliament. Do...,There are now 5 empty seats in Parliament. Do ...
...,...,...,...,...,...,...,...
3010,8262,2022-07-01 09:42:09+00:00,☀️ Singaporeans can expect a warmer and drier ...,"[{'👎': 377}, {'😱': 100}, {'😢': 67}, {'👍': 60},...",https://str.sg/wLLA,[☀️ Singaporeans can expect a warmer and drier...,☀️ Singaporeans can expect a warmer and drier ...
3011,8261,2022-07-01 05:49:19+00:00,"Pump prices have dipped across several brands,...","[{'👍': 301}, {'👎': 35}, {'🤔': 31}, {'😱': 15}, ...",https://str.sg/wLuR,[Pump prices have dipped across several brands...,"Pump prices have dipped across several brands,..."
3012,8260,2022-07-01 04:14:58+00:00,🚂 Any plans this weekend? How about a walk at ...,"[{'👍': 316}, {'🤔': 18}, {'👎': 11}, {'😱': 10}, ...",https://str.sg/wLuM,[🚂 Any plans this weekend? How about a walk at...,🚂 Any plans this weekend? How about a walk at ...
3013,8259,2022-07-01 03:06:01+00:00,"About 950,000 Singaporean households will rece...","[{'👍': 218}, {'👎': 31}, {'😢': 19}, {'🤔': 11}, ...",https://str.sg/wLuG,"[About 950,000 Singaporean households will rec...","About 950,000 Singaporean households will rece..."


In [83]:
# Shift the timestamps from UTC to Singapore time (zone name 'Asia/Singapore'),
# then store the clock time and the calendar date as formatted strings.
# After this cell 'date' holds text, so the cell cannot be re-run on its own output.
sgt_timestamps = df['date'].dt.tz_convert('Asia/Singapore')
df['time'] = sgt_timestamps.dt.strftime('%H:%M')
df['date'] = sgt_timestamps.dt.strftime('%Y-%m-%d')

In [91]:
def extract_shortened_url(url):
    """Return the first str.sg short link found in ``url``.

    The input may carry extra characters around the link; only the
    ``http(s)://str.sg/<word characters>`` part is returned. When no such
    link is present the input is handed back unchanged.
    """
    found = re.compile(r'\bhttps?://str\.sg/\w+\b').search(url)
    return url if found is None else found.group()
    
# Reduce every captured link to its bare str.sg short form where possible.
df['url'] = df['url'].map(extract_shortened_url)

In [96]:
# Drop the rows whose link is exactly 23 or 27 characters long.
# NOTE(review): the reason for these two lengths is not visible in this
# notebook - presumably they identify unwanted link formats; confirm.
url_lengths = df['url'].str.len()
has_excluded_length = url_lengths.isin([23, 27])
df = df[~has_excluded_length]

In [97]:
# Display the frame after the date formatting and link clean-up steps.
df

Unnamed: 0,message_id,date,text,reactions,url,main_text,joined_text,time
0,11313,2023-07-25,"China reappoints Wang Yi as foreign minister, ...","[{'🤔': 271}, {'😱': 31}, {'👍': 15}, {'👎': 8}, {...",https://str.sg/iiQA,"[China reappoints Wang Yi as foreign minister,...","China reappoints Wang Yi as foreign minister, ...",19:46
1,11312,2023-07-25,⚠️ Take note if you’re planning to cross into ...,"[{'👎': 140}, {'😱': 41}, {'👍': 36}, {'😢': 6}, {...",https://str.sg/iiQx,[⚠️ Take note if you’re planning to cross into...,⚠️ Take note if you’re planning to cross into ...,19:15
2,11311,2023-07-25,"A ""once in a blue moon"" occasion: There'll be ...","[{'👍': 241}, {'😱': 24}, {'👎': 11}, {'🤔': 9}, {...",https://str.sg/iiMC,"[A ""once in a blue moon"" occasion: There'll be...","A ""once in a blue moon"" occasion: There'll be ...",17:29
3,11310,2023-07-25,Is the heat getting to you? While air-conditio...,"[{'👍': 159}, {'👎': 46}, {'😱': 21}, {'😢': 11}, ...",https://str.sg/iigC,[Is the heat getting to you? While air-conditi...,Is the heat getting to you? While air-conditio...,16:01
4,11309,2023-07-25,There are now 5 empty seats in Parliament. Do ...,"[{'👎': 288}, {'🤔': 94}, {'😱': 25}, {'👍': 23}, ...",https://str.sg/iiMX,[There are now 5 empty seats in Parliament. Do...,There are now 5 empty seats in Parliament. Do ...,15:33
...,...,...,...,...,...,...,...,...
3010,8262,2022-07-01,☀️ Singaporeans can expect a warmer and drier ...,"[{'👎': 377}, {'😱': 100}, {'😢': 67}, {'👍': 60},...",https://str.sg/wLLA,[☀️ Singaporeans can expect a warmer and drier...,☀️ Singaporeans can expect a warmer and drier ...,17:42
3011,8261,2022-07-01,"Pump prices have dipped across several brands,...","[{'👍': 301}, {'👎': 35}, {'🤔': 31}, {'😱': 15}, ...",https://str.sg/wLuR,[Pump prices have dipped across several brands...,"Pump prices have dipped across several brands,...",13:49
3012,8260,2022-07-01,🚂 Any plans this weekend? How about a walk at ...,"[{'👍': 316}, {'🤔': 18}, {'👎': 11}, {'😱': 10}, ...",https://str.sg/wLuM,[🚂 Any plans this weekend? How about a walk at...,🚂 Any plans this weekend? How about a walk at ...,12:14
3013,8259,2022-07-01,"About 950,000 Singaporean households will rece...","[{'👍': 218}, {'👎': 31}, {'😢': 19}, {'🤔': 11}, ...",https://str.sg/wLuG,"[About 950,000 Singaporean households will rec...","About 950,000 Singaporean households will rece...",11:06


In [98]:
# Save the cleaned messages as an Excel workbook, leaving out the index column.
excel_path = 'Dataset/thestraitstimes.xlsx'
df.to_excel(excel_path, index=False)

In [101]:
import warnings
import urllib3

# Silence urllib3's InsecureRequestWarning for the rest of the session.
insecure_request_warning = urllib3.exceptions.InsecureRequestWarning
warnings.filterwarnings(action="ignore", category=insecure_request_warning)

In [105]:
def _extract_article_text(html):
    """Return the article paragraphs found in ``html`` joined by single spaces."""
    soup = BeautifulSoup(html, 'lxml')
    body_divs = soup.find_all("div", class_="clearfix text-formatted field field--name-field-paragraph-text field--type-text-long field--label-hidden field__item")
    paragraphs = []
    for body_div in body_divs:
        # Let the parser locate the paragraphs instead of running a regex over
        # the serialised markup: this also finds <p> tags that carry
        # attributes and yields text with HTML entities already decoded.
        for paragraph in body_div.find_all('p'):
            # Strip first, then drop empties, so whitespace-only paragraphs
            # do not leave stray gaps in the joined text.
            text = paragraph.get_text().strip()
            if text:
                paragraphs.append(text)
    return ' '.join(paragraphs)


def get_web_text(link, timeout=30):
    """Download an article page and return its body text as a single string.

    Args:
        link: URL of the article (redirecting short links are followed by
            ``requests``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection blocks its worker thread indefinitely.

    Returns:
        The paragraph texts joined by spaces (an empty string when the page
        has no matching paragraphs), or ``None`` when the request or the
        parsing failed; the failure is printed rather than raised.
    """
    try:
        response = requests.get(
            link,
            headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'},
            # NOTE(review): certificate verification is disabled here, which
            # allows tampered responses to go unnoticed - confirm this is
            # still required for the target hosts.
            verify=False,
            timeout=timeout,
        )
        response.raise_for_status()  # Raises an exception for 4xx or 5xx status codes
        return _extract_article_text(response.text)
    except (requests.RequestException, ValueError) as e:
        print(f"Error occurred for URL: {link}")
        print(e)
        return None

In [107]:
from concurrent.futures import ThreadPoolExecutor
def process_links_with_multithreading(links):
    """Fetch the web text of every link using a pool of 12 worker threads.

    Each link is passed to ``get_web_text``. The returned list holds one
    result per link, in the same order as ``links``.
    """
    pool = ThreadPoolExecutor(max_workers=12)
    with pool:
        # Queue one download task per link
        pending = []
        for link in links:
            pending.append(pool.submit(get_web_text, link))

        # Wait for the tasks in submission order and gather their results
        fetched = []
        for task in pending:
            fetched.append(task.result())
    return fetched

In [112]:
if __name__ == "__main__":
    # Fetch every distinct link only once, even if several messages share it
    links_list = df['url'].unique().tolist()

    # Initialize the progress bar
    progress_bar = tqdm(total=len(links_list), desc='Processing URLs')

    def update_progress(future):
        """Advance the progress bar by one step when a download task finishes."""
        progress_bar.update(1)

    try:
        # Process links using multi-threading
        with ThreadPoolExecutor() as executor:
            # Submit tasks and use update_progress as a callback when tasks complete
            futures = []
            for link in links_list:
                future = executor.submit(get_web_text, link)
                future.add_done_callback(update_progress)
                futures.append(future)

            # Collect the results in submission order so they line up with links_list
            results = [future.result() for future in futures]
    finally:
        # Close the progress bar even when one of the tasks raised
        progress_bar.close()

    # Create a dictionary to map URLs to their corresponding web content
    url_to_web_content = dict(zip(links_list, results))

    # Fill the 'web_text' column of the DataFrame with the web content
    df['web_text'] = df['url'].map(url_to_web_content)

Processing URLs:  24%|██▍       | 568/2335 [01:03<02:12, 13.37it/s]

Error occurred for URL: https://str.sg/iUmG
HTTPSConnectionPool(host='prdstaff.straitstimes.com', port=443): Max retries exceeded with url: /singapore/courts-crime/former-singapore-idol-judge-ken-lim-faces-5-more-charges-of-sexual-offences-against-3-women?utm_medium=social&utm_source=telegram&utm_campaign=sttg (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x00000226B6E14520>, 'Connection to prdstaff.straitstimes.com timed out. (connect timeout=None)'))


Processing URLs:  56%|█████▌    | 1302/2335 [02:25<01:44,  9.88it/s]

Error occurred for URL: https://str.sg/wyik
404 Client Error: Not Found for url: https://str.sg/wyik


Processing URLs: 100%|██████████| 2335/2335 [04:21<00:00,  8.92it/s]


In [120]:
# Keep only the rows whose article text could be downloaded.
has_web_text = df['web_text'].notna()
df = df[has_web_text]

In [116]:
# Discard the rows whose downloaded article text is an empty string.
is_nonempty_web_text = df['web_text'].ne('')
df = df.loc[is_nonempty_web_text]

In [None]:
# Write the frame, now including the article text, to the Excel workbook
# without the index column.
final_excel_path = 'Dataset/thestraitstimes.xlsx'
df.to_excel(final_excel_path, index=False)