In [None]:
from bs4 import BeautifulSoup as bs
import requests
import logging
from collections import defaultdict
import pandas as pd
import html
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

# Set up logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'DNT': '1',  # Do Not Track request header
    'Cache-Control': 'no-cache',
    'Pragma': 'no-cache'
}

def call_xml(lnk: str, retries: int = 3) -> bs:
    attempt = 0
    while attempt < retries:
        try:
            source = requests.get(lnk,headers=headers, timeout=10)
            source.encoding = 'utf-8'
            if source.status_code == 200:
                logging.info(f"Successfully fetched XML from {lnk}")
                return bs(source.text, 'xml')
            else:
                logging.error(f"Error fetching XML from {lnk}: Status Code {source.status_code}")
                return None
        except requests.exceptions.RequestException as e:
            logging.error(f"Error fetching XML from {lnk}: {e}")
            attempt += 1
            if attempt < retries:
                logging.info(f"Retrying... ({attempt}/{retries})")
                time.sleep(2)  # Wait for 2 seconds before retrying
    return None


def process_items(items: list) -> pd.DataFrame:
    data = defaultdict(list)

    for item in items:
        try:
            title = html.unescape(item.find('Title').text)
            link = item.find('Link').text
            category = item.find('CategoryName').text
            pubdate = item.find('PublishDate').text
            existing_tags = item.find('Tags').text
            article_id = item.find('Guid').text
            image_url = item.find('ThumbImage').text


            # article_id = re.findall(r'(\d+)\.html', link)
            # article_id = article_id[0] if article_id else "unknown"

            # category_elem, article_id_elem = extract_info(link)
            # weblink_elem, description, existing_tags = fetch_additional_data(category_elem, article_id)
            content_elem = call_xml(link).find('Content').text
            clean_content = re.sub(r"<blockquote.*?>.*?<\/blockquote>|<script.*?>.*?<\/script>|<a.*?>.*?<\/a>|<.*?>|(&[^;]+;)", "", html.unescape(content_elem)).replace('\\', " ")
            content = re.sub(r'[\s]{2,}',' ',clean_content)
            

            data['PublishDate'].append(pubdate)
            data['Title'].append(title)
            data['WebURL'].append(link)
            data['ImageURL'].append(image_url)
            # data['XMLURL'].append(weblink_elem)
            # data['Summary'].append(description)
            data['Content'].append(content)
            data['CategoryName'].append(category)
            data['ExistingTags'].append(existing_tags)
            data['ArticleID'].append(article_id)

        except Exception as e:
            logging.error(f"Error processing item: {e}")

    return pd.DataFrame(data)

def fetch_and_process_page(page_number):
    main_link = f"https://rss.oneindia.com/scripts/cms/newsFeed.php?type=dh-feed&sub_type=all&site=www.oneindia.com&limit=30&page={page_number}"
    root = call_xml(main_link)
    if root:
        items = root.find_all('Item')
        if items:
            return process_items(items)
    else:
        logging.error(f"Failed to fetch the XML feed for page {page_number}")
    return pd.DataFrame()

if __name__ == "__main__":
    all_data_df = pd.DataFrame()
    max_workers = 10 

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(fetch_and_process_page, i) for i in range(14000)] #1181
        for future in as_completed(futures):
            data_df = future.result()
            if not data_df.empty:
                all_data_df = pd.concat([all_data_df, data_df], ignore_index=True)

            if len(all_data_df) % 3000 == 0:  # Save every 3000 rows
                all_data_df.to_csv('english_image_dataset.csv', index=False, encoding='utf-8')
                logging.info(f"Saved DataFrame to 'english_image_dataset.csv' with {len(all_data_df)} rows")

    # Final save
    all_data_df.to_csv('english_image_dataset.csv', index=False, encoding='utf-8')
    logging.info(f"Final DataFrame saved to 'english_image_dataset.csv' with {len(all_data_df)} rows")

2024-11-19 15:19:16,044 - INFO - Successfully fetched XML from https://rss.oneindia.com/scripts/cms/newsFeed.php?type=dh-feed&sub_type=all&site=www.oneindia.com&limit=30&page=3
2024-11-19 15:19:16,123 - INFO - Successfully fetched XML from https://rss.oneindia.com/scripts/cms/newsFeed.php?type=dh-feed&sub_type=all&site=www.oneindia.com&limit=30&page=6
2024-11-19 15:19:16,143 - INFO - Successfully fetched XML from https://rss.oneindia.com/scripts/cms/newsFeed.php?type=dh-feed&sub_type=all&site=www.oneindia.com&limit=30&page=1
2024-11-19 15:19:16,144 - INFO - Successfully fetched XML from https://rss.oneindia.com/scripts/cms/newsFeed.php?type=dh-feed&sub_type=all&site=www.oneindia.com&limit=30&page=4
2024-11-19 15:19:16,146 - INFO - Successfully fetched XML from https://rss.oneindia.com/scripts/cms/newsFeed.php?type=dh-feed&sub_type=all&site=www.oneindia.com&limit=30&page=2
2024-11-19 15:19:16,153 - INFO - Successfully fetched XML from https://rss.oneindia.com/scripts/cms/newsFeed.php?ty

## Test_data_extraction


In [64]:
from bs4 import BeautifulSoup as bs
import requests
import logging
from collections import defaultdict
import pandas as pd
import html
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

# Set up logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'DNT': '1',  # Do Not Track request header
    'Cache-Control': 'no-cache',
    'Pragma': 'no-cache'
}

def call_xml(lnk: str, retries: int = 3) -> bs:
    attempt = 0
    while attempt < retries:
        try:
            source = requests.get(lnk,headers=headers, timeout=10)
            source.encoding = 'utf-8'
            if source.status_code == 200:
                logging.info(f"Successfully fetched XML from {lnk}")
                return bs(source.text, 'xml')
            else:
                logging.error(f"Error fetching XML from {lnk}: Status Code {source.status_code}")
                return None
        except requests.exceptions.RequestException as e:
            logging.error(f"Error fetching XML from {lnk}: {e}")
            attempt += 1
            if attempt < retries:
                logging.info(f"Retrying... ({attempt}/{retries})")
                time.sleep(2)  # Wait for 2 seconds before retrying
    return None


def process_items(items: list) -> pd.DataFrame:
    data = defaultdict(list)

    for item in items:
        try:
            title = html.unescape(item.find('Title').text)
            link = item.find('Link').text
            category = item.find('CategoryName').text
            pubdate = item.find('PublishDate').text
            existing_tags = item.find('Tags').text
            article_id = item.find('Guid').text

            # article_id = re.findall(r'(\d+)\.html', link)
            # article_id = article_id[0] if article_id else "unknown"

            # category_elem, article_id_elem = extract_info(link)
            # weblink_elem, description, existing_tags = fetch_additional_data(category_elem, article_id)
            content_elem = call_xml(link).find('Content').text
            clean_content = re.sub(r"<blockquote.*?>.*?<\/blockquote>|<script.*?>.*?<\/script>|<a.*?>.*?<\/a>|<.*?>|(&[^;]+;)", "", html.unescape(content_elem)).replace('\\', " ")
            content = re.sub(r'[\s]{2,}',' ',clean_content)

            data['PublishDate'].append(pubdate)
            data['Title'].append(title)
            data['WebURL'].append(link)
            # data['XMLURL'].append(weblink_elem)
            # data['Summary'].append(description)
            data['Content'].append(content)
            data['CategoryName'].append(category)
            data['ExistingTags'].append(existing_tags)
            data['ArticleID'].append(article_id)

        except Exception as e:
            logging.error(f"Error processing item: {e}")

    return pd.DataFrame(data)

def fetch_and_process_page(page_number):
    main_link = f"https://rss.oneindia.com/scripts/cms/newsFeed.php?type=dh-feed&sub_type=all&site=telugu.nativeplanet.com&limit=30&page={page_number}"
    root = call_xml(main_link)
    if root:
        items = root.find_all('Item')
        if items:
            return process_items(items)
    else:
        logging.error(f"Failed to fetch the XML feed for page {page_number}")
    return pd.DataFrame()

if __name__ == "__main__":
    all_data_df = pd.DataFrame()
    max_workers = 10 

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(fetch_and_process_page, i) for i in range(1,5)]
        for future in as_completed(futures):
            data_df = future.result()
            if not data_df.empty:
                all_data_df = pd.concat([all_data_df, data_df], ignore_index=True)

            if len(all_data_df) % 3000 == 0:  # Save every 3000 rows
                all_data_df.to_csv('C:/Users/Greynium/Desktop/Important Documents/datacollection/test data/NP/telugu_test_dataset.csv', index=False, encoding='utf-8')
                logging.info(f"Saved DataFrame to 'datacollection/test data/NP/telugu_test_dataset.csv' with {len(all_data_df)} rows")

    # Final save
    all_data_df.to_csv('C:/Users/Greynium/Desktop/Important Documents/datacollection/test data/NP/telugu_test_dataset.csv', index=False, encoding='utf-8')
    logging.info(f"Final DataFrame saved to 'datacollection/test data/NP/telugu_test_dataset.csv' with {len(all_data_df)} rows")

2024-12-09 12:37:48,528 - INFO - Successfully fetched XML from https://rss.oneindia.com/scripts/cms/newsFeed.php?type=dh-feed&sub_type=all&site=telugu.nativeplanet.com&limit=30&page=2
2024-12-09 12:37:48,578 - INFO - Successfully fetched XML from https://rss.oneindia.com/scripts/cms/newsFeed.php?type=dh-feed&sub_type=all&site=telugu.nativeplanet.com&limit=30&page=3
2024-12-09 12:37:48,583 - INFO - Successfully fetched XML from https://rss.oneindia.com/scripts/cms/newsFeed.php?type=dh-feed&sub_type=all&site=telugu.nativeplanet.com&limit=30&page=1
2024-12-09 12:37:48,585 - INFO - Successfully fetched XML from https://rss.oneindia.com/scripts/cms/newsFeed.php?type=dh-feed&sub_type=all&site=telugu.nativeplanet.com&limit=30&page=4
2024-12-09 12:37:49,319 - ERROR - Error fetching XML from https://rss.oneindia.com/scripts/cms/newsFeed.php?type=dh-feed&subtype=article&site=telugu.nativeplanet.com&path=news&id=6371: Status Code 500
2024-12-09 12:37:49,326 - ERROR - Error processing item: 'NoneT

## 5010 dataset

In [1]:
from bs4 import BeautifulSoup as bs
import requests
import logging
from collections import defaultdict
import pandas as pd
import html
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

# Set up logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'DNT': '1',  # Do Not Track request header
    'Cache-Control': 'no-cache',
    'Pragma': 'no-cache'
}

def call_xml(lnk: str, retries: int = 3) -> bs:
    attempt = 0
    while attempt < retries:
        try:
            source = requests.get(lnk,headers=headers, timeout=10)
            source.encoding = 'utf-8'
            if source.status_code == 200:
                logging.info(f"Successfully fetched XML from {lnk}")
                return bs(source.text, 'xml')
            else:
                logging.error(f"Error fetching XML from {lnk}: Status Code {source.status_code}")
                return None
        except requests.exceptions.RequestException as e:
            logging.error(f"Error fetching XML from {lnk}: {e}")
            attempt += 1
            if attempt < retries:
                logging.info(f"Retrying... ({attempt}/{retries})")
                time.sleep(2)  # Wait for 2 seconds before retrying
    return None


def process_items(items: list) -> pd.DataFrame:
    data = defaultdict(list)

    for item in items:
        try:
            title = html.unescape(item.find('Title').text)
            link = item.find('Link').text
            category = item.find('CategoryName').text
            pubdate = item.find('PublishDate').text
            existing_tags = item.find('Tags').text
            article_id = item.find('Guid').text

            # article_id = re.findall(r'(\d+)\.html', link)
            # article_id = article_id[0] if article_id else "unknown"

            # category_elem, article_id_elem = extract_info(link)
            # weblink_elem, description, existing_tags = fetch_additional_data(category_elem, article_id)
            content_elem = call_xml(link).find('Content').text
            clean_content = re.sub(r"<blockquote.*?>.*?<\/blockquote>|<script.*?>.*?<\/script>|<a.*?>.*?<\/a>|<.*?>|(&[^;]+;)", "", html.unescape(content_elem)).replace('\\', " ")
            content = re.sub(r'[\s]{2,}',' ',clean_content)
            

            data['PublishDate'].append(pubdate)
            data['Title'].append(title)
            data['WebURL'].append(link)
            # data['XMLURL'].append(weblink_elem)
            # data['Summary'].append(description)
            data['Content'].append(content)
            data['CategoryName'].append(category)
            data['ExistingTags'].append(existing_tags)
            data['ArticleID'].append(article_id)

        except Exception as e:
            logging.error(f"Error processing item: {e}")

    return pd.DataFrame(data)

def fetch_and_process_page(page_number):
    main_link = f"https://rss.oneindia.com/scripts/cms/newsFeed.php?type=dh-feed&sub_type=all&site=kannada.drivespark.com&limit=30&page={page_number}"
    root = call_xml(main_link)
    if root:
        items = root.find_all('Item')
        if items:
            return process_items(items)
    else:
        logging.error(f"Failed to fetch the XML feed for page {page_number}")
    return pd.DataFrame()

if __name__ == "__main__":
    all_data_df = pd.DataFrame()
    max_workers = 10 

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(fetch_and_process_page, i) for i in range(2,335)]#168 52
        for future in as_completed(futures):
            data_df = future.result()
            if not data_df.empty:
                all_data_df = pd.concat([all_data_df, data_df], ignore_index=True)

            if len(all_data_df) % 3000 == 0:  # Save every 3000 rows
                all_data_df.to_csv('./domains_data/Drivespark/kannada_drivespark_dataset.csv', index=False, encoding='utf-8')
                logging.info(f"Saved DataFrame to './domains_data/Drivespark/kannada_drivespark_dataset.csv' with {len(all_data_df)} rows")
    # Final save
    all_data_df.to_csv('./domains_data/Drivespark/kannada_drivespark_dataset.csv', index=False, encoding='utf-8')
    logging.info(f"Final DataFrame saved to './domains_data/Drivespark/kannada_drivespark_dataset.csv' with {len(all_data_df)} rows")

2024-11-14 09:34:54,549 - INFO - Successfully fetched XML from https://rss.oneindia.com/scripts/cms/newsFeed.php?type=dh-feed&sub_type=all&site=kannada.drivespark.com&limit=30&page=2
2024-11-14 09:34:54,587 - INFO - Successfully fetched XML from https://rss.oneindia.com/scripts/cms/newsFeed.php?type=dh-feed&sub_type=all&site=kannada.drivespark.com&limit=30&page=4
2024-11-14 09:34:54,604 - INFO - Successfully fetched XML from https://rss.oneindia.com/scripts/cms/newsFeed.php?type=dh-feed&sub_type=all&site=kannada.drivespark.com&limit=30&page=5
2024-11-14 09:34:54,669 - INFO - Successfully fetched XML from https://rss.oneindia.com/scripts/cms/newsFeed.php?type=dh-feed&sub_type=all&site=kannada.drivespark.com&limit=30&page=6
2024-11-14 09:34:54,721 - INFO - Successfully fetched XML from https://rss.oneindia.com/scripts/cms/newsFeed.php?type=dh-feed&sub_type=all&site=kannada.drivespark.com&limit=30&page=8
2024-11-14 09:34:54,723 - INFO - Successfully fetched XML from https://rss.oneindia.c

In [16]:
import re

match = re.search(r'site=(\w+)',"https://rss.oneindia.com/scripts/cms/newsFeed.php?type=dh-feed&sub_type=all&site=www.oneindia.com&limit=30&page=11")

if match.group(1)=='www':
    language = 'english'
else:
    language = match.group(1)

language

'english'

In [11]:
sentence = "this is a dog.  there is additional space."
sentence = sentence.replace('  ',' ')
print(sentence)


this is a dog. there is additional space.
