# Task 1: Data Scraping and Collection

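This notebook inspects the output of the Telegram data scraping pipeline: the layout of the raw data lake, a sample of the scraped messages, the scraper logs, and the downloaded images.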

In [3]:
# Import necessary libraries
import glob
import json
import os
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Set plotting style
# NOTE(review): 'seaborn-v0_8' is presumably only registered in newer matplotlib
# releases (older ones call the style 'seaborn') — confirm against the pinned version.
plt.style.use('seaborn-v0_8')

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
# Display the directory structure created by the scraper.
# Done in pure Python so the cell behaves the same on Linux, macOS and Windows;
# the external `tree` command is not installed everywhere.
def format_tree(root, indent=""):
    """Return the contents of ``root`` as a list of indented text lines.

    Entries are listed in sorted order; directories get a trailing ``/`` and
    their contents follow immediately, indented four spaces deeper.

    Args:
        root: Path of an existing directory to walk.
        indent: Prefix prepended to every line at this level.

    Returns:
        A list of strings, one per file or directory found below ``root``.

    Raises:
        OSError: If ``root`` (or a sub-directory) cannot be listed.
    """
    lines = []
    for name in sorted(os.listdir(root)):
        path = os.path.join(root, name)
        if os.path.isdir(path):
            lines.append(f"{indent}{name}/")
            lines.extend(format_tree(path, indent + "    "))
        else:
            lines.append(f"{indent}{name}")
    return lines


print("Data Lake Structure:")
if os.path.isdir('data/raw'):
    print('data/raw')
    print("\n".join(format_tree('data/raw', indent="    ")))
else:
    # Nothing to show until the scraper has produced output.
    print("data/raw not found - run the scraper first.")

In [None]:
# Load sample data from the data lake
def latest_date_folder(folders):
    """Return the most recent date-partition folder from ``folders``.

    Folders whose base name parses as ``YYYY-MM-DD`` are compared by that
    date, because filesystem timestamps change when a partition is copied,
    checked out or re-scraped. If no name parses as a date, the folder with
    the newest ``os.path.getctime`` is returned instead.
    NOTE(review): assumes partitions are named YYYY-MM-DD — confirm against
    the scraper.

    Args:
        folders: Non-empty list of folder paths.

    Returns:
        The path of the latest folder.

    Raises:
        ValueError: If ``folders`` is empty.
    """
    dated = []
    for folder in folders:
        try:
            partition_date = datetime.strptime(os.path.basename(folder), '%Y-%m-%d')
        except ValueError:
            continue  # not a date-named folder; ignore it for the date comparison
        dated.append((partition_date, folder))
    if dated:
        return max(dated)[1]
    return max(folders, key=os.path.getctime)


# Only directories can be date partitions; skip stray files in the root.
date_folders = [p for p in glob.glob('data/raw/telegram_messages/*') if os.path.isdir(p)]
if date_folders:
    latest_date = latest_date_folder(date_folders)
    print(f"Latest date folder: {latest_date}")

    # Sort so the "first" file is the same one on every run and platform.
    json_files = sorted(glob.glob(os.path.join(latest_date, '*.json')))
    print(f"JSON files found: {json_files}")

    if json_files:
        # Load the first JSON file as a sample
        with open(json_files[0], 'r', encoding='utf-8') as f:
            sample_data = json.load(f)

        print(f"\nSample data loaded. Number of messages: {len(sample_data)}")
        if sample_data:
            print("\nFirst message sample:")
            # ensure_ascii=False keeps non-Latin message text readable instead of \uXXXX escapes.
            print(json.dumps(sample_data[0], indent=2, ensure_ascii=False))
        else:
            print("\nThe sample file contains no messages.")
    else:
        print("No JSON files found in the latest date folder.")
else:
    print("No date folders found under data/raw/telegram_messages - run the scraper first.")

In [None]:
# Show statistics about the scraped data
def channel_message_counts(messages_df):
    """Return the number of messages per channel, or None if unavailable.

    Args:
        messages_df: DataFrame built from the scraped messages.

    Returns:
        ``messages_df['channel_name'].value_counts()``, or ``None`` when the
        frame is empty or has no ``channel_name`` column, so the caller can
        skip the report instead of hitting a KeyError.
    """
    if messages_df.empty or 'channel_name' not in messages_df.columns:
        return None
    return messages_df['channel_name'].value_counts()


if 'sample_data' in globals():
    df = pd.DataFrame(sample_data)
    if df.empty:
        # describe() raises on a frame without columns, so stop here.
        print("\nNo messages in the loaded sample - nothing to summarise.")
    else:
        print("\nData Statistics:")
        print(df.describe())

        channel_counts = channel_message_counts(df)
        if channel_counts is None:
            print("\nNo 'channel_name' column in the sample data - skipping channel distribution.")
        else:
            print("\nChannel Distribution:")
            print(channel_counts)

            # Plot channel distribution
            fig, ax = plt.subplots(figsize=(10, 6))
            channel_counts.plot(kind='bar', ax=ax)
            ax.set_title('Distribution of Messages by Channel')
            ax.set_xlabel('Channel')
            ax.set_ylabel('Number of Messages')
            ax.tick_params(axis='x', rotation=45)
            fig.tight_layout()
            plt.show()
else:
    print("sample_data is not defined - run the data-loading cell first.")

In [None]:
# Check the logs
LOG_TAIL_LINES = 50  # how many trailing log lines to show; keeps the cell output bounded


def read_log_tail(path, max_lines=LOG_TAIL_LINES):
    """Return the last ``max_lines`` lines of a text file as one string.

    Args:
        path: Path of the log file to read.
        max_lines: Number of trailing lines to keep. ``None`` returns the
            whole file; zero or a negative number returns an empty string.

    Returns:
        The selected lines joined together, line endings included.

    Raises:
        OSError: If the file cannot be opened.
    """
    # errors='replace' so a log written in another encoding still displays.
    with open(path, 'r', encoding='utf-8', errors='replace') as f:
        lines = f.readlines()
    if max_lines is None:
        return ''.join(lines)
    if max_lines <= 0:
        return ''
    return ''.join(lines[-max_lines:])


log_files = glob.glob('logs/scrape_*.log')
if log_files:
    # Modification time identifies the log that was written to most recently.
    latest_log = max(log_files, key=os.path.getmtime)
    print(f"\nLast {LOG_TAIL_LINES} lines of the latest log file ({latest_log}):")
    print(read_log_tail(latest_log))
else:
    print("No scrape logs found under logs/.")

In [None]:
# Show image downloads
def count_images_by_channel(images_root):
    """Count the files stored under each channel directory of ``images_root``.

    Args:
        images_root: Directory that holds one sub-directory per channel.

    Returns:
        A dict mapping each channel directory name (sorted) to the number of
        files found anywhere below it. Files are counted recursively, so any
        nested sub-folders are not themselves counted as images. Returns an
        empty dict when ``images_root`` does not exist.
    """
    if not os.path.isdir(images_root):
        return {}
    counts = {}
    for channel in sorted(os.listdir(images_root)):
        channel_dir = os.path.join(images_root, channel)
        if os.path.isdir(channel_dir):
            counts[channel] = sum(len(files) for _, _, files in os.walk(channel_dir))
    return counts


image_counts = count_images_by_channel('data/raw/images')
image_dirs = list(image_counts)
if image_dirs:
    print(f"Channels with downloaded images: {image_dirs}")
    for channel, n_images in image_counts.items():
        print(f"{channel}: {n_images} images downloaded")
else:
    print("No downloaded images found under data/raw/images.")

## Summary

Task 1 successfully implemented a Telegram scraping pipeline that:
1. Extracts messages from specified Ethiopian medical channels
2. Downloads associated images
3. Stores data in a structured data lake format
4. Maintains proper logging

The data is stored in a partitioned structure by date and channel, making it suitable for further processing.