> # Necessary Imports

In [1]:
import pandas as pd
import logging
import os
import sys
sys.path.append('../')

from dotenv import load_dotenv
load_dotenv()

import sys
sys.path.append('../')

from scripts.scrapping import *
from scripts.setting import *
from scripts.cleaning import *

> # Telegram Access keys

In [2]:
api_id = os.getenv('TELEGRAM_API_ID')
api_hash = os.getenv('TELEGRAM_API_HASH')
phone = os.getenv('TELEGRAM_PHONE')

> # Logging

In [3]:
logging.basicConfig(filename='../logs/scraping.log', level=logging.INFO, 
                    format='%(asctime)s:%(levelname)s:%(message)s')

RAW_DATA_PATH = '../docs/raw/'
IMAGE_DATA_PATH = '../docs/images/'

> # Scrapping

In [8]:
import asyncio

async def main():
    await scrapping(logging, api_id, api_hash, RAW_DATA_PATH, IMAGE_DATA_PATH)

await main()

In [9]:
notebook_settings()

>> ## Scrapped Data

In [10]:
data = pd.read_csv('../docs/raw/all_scraped_messages.csv')
data[data['channel_name']=='CheMed123'].head()

Unnamed: 0,channel_name,message_id,date,text,image_path
10,CheMed123,97,2023-02-10 12:23:06,"⚠️**Notice!\n**Dear esteemed customers,\nDue to four-day motorbike movement restrictions, we have limited our services to matchmaking,drug information, and medical consultation only.\n\n**N.B. \n**🔅We will make deliveries for those for whom the medications are urgent.\n🔅You can book deliveries starting from Monday, February 13, 2023",../docs/images/photo_2024-10-15_19-10-15.jpg
11,CheMed123,96,2023-02-02 08:58:52,Mela-One በውስጡ ሆርሞን ያለው ድንገተኛ ወሊድ መቆጣጠርያ ሲሆን ያለመከላከያ የተደረገ የግብረስጋ ግንኙነት ሲኖር በ72 ሰዓታት ወስጥ መወሰድ ይኖርበታል።\n\n📌ከChe-Med ለማዘዝ፡ \nበቴሌግራም ግሩፓችን- t.me/CheMeds\nዌብሳይት- www.chemeds.org\nይዘዙን።,../docs/images/photo_2024-10-15_19-10-16.jpg
12,CheMed123,95,2023-02-01 08:59:37,**አዚትሮማይሲን** በሃኪም መድሃኒት ማዘዣ ከሚታዘዙ አንቲባዮቲኮች አንዱ ሲሆን በርከት ያሉ ባክቴርያዎችን ይገላል።\n\nበቀን አንዴ ለ3 ቀናት ምግብ ከመብላታችን 1 ሰዓት ቀደም ብሎ ወይንም ከበላን ከ 2 ሰዓት በኋላ መወሰድ ይኖርበታል።\n\n📌የሃኪም ማዘዣ ካልዎት \nበቴሌግራም ግሩፓችን-t.me/CheMeds\nዌብሳይት- www.chemeds.org\nይላኩልን።,../docs/images/photo_2024-10-15_19-10-17.jpg
13,CheMed123,94,2023-01-31 09:19:53,**Che-Med Trivia #3\n\n**ምግብና መጠጦች አንዳንድ መድሃኒቶች በደንብ እንዳይሰሩ ሊያደርጉ ይችላሉ። በዚህ ሁኔታ እነዚህን መድሃኒቶች ምግብ ከወሰድን ከ1 ሰዓት እስከ 2 ሰዓት ባለ ጊዜ ውስጥ መውሰድ ያስፈልግዎታል።\n\nከነዚህም ወስጥ **የእንቅርት መድሃኒቶች፣Omeprazole፣Sildenafil(Viagra)፣Captopril እና Ampicillin** ይጠቀሳሉ።\n\nመድሃኒትዎ ከምግብ ጋር ያለውን አወሳሰድ እርግጠኛ ካልሆኑ ሃኪምዎን ወይም ፋርማሲስትዎን ያማክሩ።,../docs/images/photo_2024-10-15_19-10-17 (1).jpg
14,CheMed123,93,2023-01-30 09:45:25,"**Che-Med Trivia #2\n\n**እንደ Ciprofloxacin, Doxycycline, Levothyroxine, Iron supplement ያሉ መድሃኒቶችን ከወተት እና ከወተት ተዋፅዖዎች ጋር እንዲሁም ከእንቁላል ጋር መውሰድ መድሃኒቶቹ በሰውነታችን የሚፈለገውን ስራ እንዳይሰሩ ያደርጋቸዋል።\n\nይህም በወተት ተዋጽኦዎች ውስጥ ያለው ካልሲየም እነዚህ መድሃኒቶች በሰውታችን ወስጥ እንዳይዋህዱ ጣልቃ ስለሚገባ ነው።",../docs/images/photo_2024-10-15_19-10-18.jpg


> # Cleaning the Data

In [11]:
cleaning(data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['text'].fillna('No text', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['image_path'].fillna('No image', inplace=True)


>> ## Cleaned Data

In [12]:
cleaned_data = pd.read_csv('../docs/cleaned/all_scraped_messages_cleaned.csv')
cleaned_data.head()

Unnamed: 0,channel_name,message_id,date,text,image_path
0,DoctorsET,864,2023-12-18 17:04:02,"https://youtu.be/5DBoEm-8kmA?si=LDLuEecNfULJVDzL 👈👈\n\nበቀን አንዴ ብቻ የሚባለው የቢዝነስ አማካሪ \n\nበ 10,000 ብር ብቻ የተጀመረ ቢዝነስ እስከ ሚሊየኖች ያየገው የፕራግማ ኢንቨስትመንት መስራች መርድ ብስራት \n\nብላክ ማርኬት ዋጋ ማነው ሚተምነው ?\n\nኢትዮጵያ ውስጥ ቢዝነስ ለመጀመር ጥሩ ሰአት ነው ወይ ??\n\n3 ቢዝነስ ስንጀምር የምንሰራው ስህተቶች !\n\nአሁኑኑ ይመልከተቱ 👇👇\n\nhttps://youtu.be/5DBoEm-8kmA?si=LDLuEecNfULJVDzL",No image
1,DoctorsET,863,2023-11-03 16:14:39,"ዶክተርስ ኢትዮጵያ በ አዲስ አቀራረብ በ ቴሌቪዥን ፕሮግራሙን ለመጀመር ከፋተኛ ዝግጅት በማድረግ ላይ ይገኛል ።\n\nዩሃንስ ግርማ ጆኒ ( በ ቲክቶክ ከ 500,000) በላይ ተከታይ ያገኘው ከ አሜሪካ ወደ ኢትዮጵያ \nገንዳ ውስጥ ከመተኛት እስከ ከፋተኛ ተከፋይ በኢትዮጵያ ( 1 take production ) በመባል የሚታወቀው ዩሃንስ ግርማ ( ጆኒ) \n\nከዲፕሬሽን እስከ ታዋቂነትን ያተረፈው ጆኒ \n\nአሁኑኑ ይመልከቱ 👇\n\nhttps://youtu.be/gwVN5eJQpko?si=xARsSxIEdZtE91GY",No image
2,DoctorsET,862,2023-10-02 16:37:39,ሞት በስኳር \n\nለልጆቻችን የምናሲዘው ምሳቃ ሳናቀው እድሚያቸውን ይቀንሰው ይሆን \n\nየታሸገ ትክክለኛ ጁስ ኢትዮጵያ ውስጥ የለም...!\n\nሁሉም ወላጅ ሊያየው የሚገባ\n\nየታሸጉ ጁሶች መጠጣት እራስን ለከፍተኛ ጉዳት እስከ ሞት ድረስ ማጋለጥ ነው !\n\nለሁሉም ኢኮኖሚ ጤናንም የሚጠብቅ አመጋገብ የቱ ነው??\n\nhttps://youtu.be/oHiSRrNF7I0?si=Absgm414YSt_kjNq,No image
3,DoctorsET,861,2023-09-16 07:54:32,ከ HIV የተፈወሰ ሰው አጋጥሟችሁ ያቃል ? ፈውስ እና ህክምና ?\n\nሙሉ ቪዲዮውን ተመልክተው ሃሳብ አስተያየቶን ያካፍሉን 👇👇👇👇\n\nhttps://youtu.be/tTeErZxIh_Q?si=jKHyfWcC3sfXbC8L,No image
4,DoctorsET,860,2023-09-01 16:16:15,በቅርብ ጊዜ በሃገራችን ላይ እየተስተዋለ ያለ የተመሳሳይ ፆታ ( Homosexuality ) ግብረ ሰዶም በተለያዩ ቦታዎች ላይ እየታየ ይገኛል\n\nይህም አጥብቀን ምንቃወመው ጉዳይ ሲሆን \n\n- ይህ ችግር በህምና ይለያል ወይ ?\n- ቅድመ ምልክት አለው ወይ ?\n\nወላጆች ልጆቻችሁን እንዴት መጠበቅ ትችላለቹ ህክምናስ አለው ወይ ??\n\nበምንመገበው ምግብ ሊቀይሩየን ይችላሉ ወይ ? \n\nአሁኑኑ ገብተው ሙሉ ቪዲዮውን ይመልከቱ !\n\nhttps://youtu.be/0k65P5ouw7s?si=qaUgo75bUa3AMQxD,No image


>> ## Store Cleaned Data

In [13]:
def store_cleaned_data(data, channel_name):
    cleaned_file_path = os.path.join(RAW_DATA_PATH, f'cleaned_{channel_name}.csv')
    data.to_csv(cleaned_file_path, index=False)
    logging.info(f'Stored cleaned data to {cleaned_file_path}')


In [14]:
import torch
import os
import cv2
import logging

# Setup logging
logging.basicConfig(filename='../logs/yolo_detection.log', level=logging.INFO,
                    format='%(asctime)s:%(levelname)s:%(message)s')

model = torch.hub.load('ultralytics/yolov5', 'yolov5s') 

IMAGE_DIR = '../docs/images/'
OUTPUT_DIR = '../docs/yolo_output/'

os.makedirs(OUTPUT_DIR, exist_ok=True)

for img_file in os.listdir(IMAGE_DIR):
    img_path = os.path.join(IMAGE_DIR, img_file)
    
    if img_file.endswith(('.jpg', '.jpeg', '.png')):
        img = cv2.imread(img_path)
        
        results = model(img)
        
        results.save(OUTPUT_DIR)
        
        logging.info(f'Processed {img_file} and saved results to {OUTPUT_DIR}')
        print(f'Detected objects in {img_file}')



Using cache found in /home/aman/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2024-10-15 Python-3.10.12 torch-2.4.1+cu121 CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 
  with amp.autocast(autocast):
Saved 1 image to [1mruns/detect/exp2213[0m
  with amp.autocast(autocast):
Saved 1 image to [1mruns/detect/exp2214[0m
  with amp.autocast(autocast):


Detected objects in photo_2024-10-15_19-10-17.jpg
Detected objects in photo_2024-10-15_19-10-19.jpg


Saved 1 image to [1mruns/detect/exp2215[0m
  with amp.autocast(autocast):
Saved 1 image to [1mruns/detect/exp2216[0m
  with amp.autocast(autocast):


Detected objects in photo_2024-10-15_19-10-27.jpg
Detected objects in photo_2024-10-15_19-10-19 (1).jpg


Saved 1 image to [1mruns/detect/exp2217[0m
  with amp.autocast(autocast):
Saved 1 image to [1mruns/detect/exp2218[0m


Detected objects in photo_2024-10-15_19-10-29.jpg
Detected objects in photo_2024-10-15_19-10-19 (2).jpg


  with amp.autocast(autocast):
Saved 1 image to [1mruns/detect/exp2219[0m
  with amp.autocast(autocast):
Saved 1 image to [1mruns/detect/exp2220[0m
  with amp.autocast(autocast):


Detected objects in photo_2024-10-15_19-10-30.jpg
Detected objects in photo_2024-10-15_19-10-15.jpg


Saved 1 image to [1mruns/detect/exp2221[0m
  with amp.autocast(autocast):


Detected objects in photo_2024-10-15_19-10-28 (1).jpg


Saved 1 image to [1mruns/detect/exp2222[0m
  with amp.autocast(autocast):


Detected objects in photo_2024-10-15_19-10-25.jpg


Saved 1 image to [1mruns/detect/exp2223[0m
  with amp.autocast(autocast):
Saved 1 image to [1mruns/detect/exp2224[0m


Detected objects in photo_2024-10-15_19-10-23.jpg
Detected objects in photo_2024-10-15_19-10-28.jpg


  with amp.autocast(autocast):
Saved 1 image to [1mruns/detect/exp2225[0m
  with amp.autocast(autocast):
Saved 1 image to [1mruns/detect/exp2226[0m


Detected objects in photo_2024-10-15_19-10-20.jpg
Detected objects in photo_2024-10-15_19-10-26.jpg


  with amp.autocast(autocast):
Saved 1 image to [1mruns/detect/exp2227[0m
  with amp.autocast(autocast):


Detected objects in photo_2024-10-15_19-10-18.jpg


Saved 1 image to [1mruns/detect/exp2228[0m
  with amp.autocast(autocast):


Detected objects in photo_2024-10-15_19-10-29 (2).jpg


Saved 1 image to [1mruns/detect/exp2229[0m
  with amp.autocast(autocast):
Saved 1 image to [1mruns/detect/exp2230[0m
  with amp.autocast(autocast):


Detected objects in photo_2024-10-15_19-10-17 (1).jpg
Detected objects in photo_2024-10-15_19-10-29 (1).jpg


Saved 1 image to [1mruns/detect/exp2231[0m


Detected objects in photo_2024-10-15_19-10-16.jpg


In [16]:
IMAGE_DATA_PATH = "../docs/images/"
LABELS_PATH = "../docs/images/"
def load_labeled_data(all_messages_df):
    labeled_data = []
    
    # Loop through all files in the image directory to find labeled data
    for file_name in os.listdir(LABELS_PATH):
        if file_name.endswith('.txt'):  # Process label files
            image_file = file_name.replace('.txt', '.jpg')
            label_file = os.path.join(LABELS_PATH, file_name)
            image_file_path = os.path.join(LABELS_PATH, image_file)
            
            # Ensure the image exists
            if os.path.exists(image_file_path):
                # Read the image using OpenCV to get image size
                image = cv2.imread(image_file_path)
                image_height, image_width, _ = image.shape  # Extract image dimensions
                
                # Read label data from the .txt file
                with open(label_file, 'r') as f:
                    for line in f.readlines():
                        label_data = line.strip().split()
                        class_id, bbox_center_x, bbox_center_y, bbox_width, bbox_height = map(float, label_data)
                        
                        # Convert YOLO's normalized bbox data to actual pixel values
                        bbox_center_x = bbox_center_x * image_width
                        bbox_center_y = bbox_center_y * image_height
                        bbox_width = bbox_width * image_width
                        bbox_height = bbox_height * image_height
                        
                        # Find matching image in the original scraped DataFrame
                        message_metadata = all_messages_df[all_messages_df['image_path'] == image_file].iloc[0]
                        
                        # Append all relevant information to the labeled_data list
                        labeled_data.append({
                            'channel_name': message_metadata['channel_name'],
                            'message_id': message_metadata['message_id'],
                            'date': message_metadata['date'],
                            'text': message_metadata['text'],
                            'image_path': image_file,
                            'image_width': image_width,
                            'image_height': image_height,
                            'class_id': class_id,
                            'bbox_center_x': bbox_center_x,
                            'bbox_center_y': bbox_center_y,
                            'bbox_width': bbox_width,
                            'bbox_height': bbox_height
                        })

    # Convert the labeled data into a DataFrame for easy analysis
    df_labeled_data = pd.DataFrame(labeled_data)
    return df_labeled_data

all_messages_df = pd.read_csv('../docs/raw/all_scraped_messages.csv')
# Example usage (assuming 'all_messages_df' is the DataFrame that holds scraped message data):
df_labeled_data = load_labeled_data(all_messages_df)

# Display the labeled DataFrame with all attributes
print(df_labeled_data.head())

NameError: name 'LABELS_PATH' is not defined