# This  script will scrape ajkaal.in a west indian news papper site.

In [1]:
import requests
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import pandas as pd
import requests
from urllib.parse import urljoin
import time
from datetime import datetime
import re
from PIL import Image
from io import BytesIO

In [2]:
# this is a internal function which will return soup file
def get_soup(url):
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    try:
        with webdriver.Chrome(options=chrome_options) as driver:
            driver.get(url)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            return soup
    except Exception as e:
        print(f"An error occurred: {e}")

In [3]:
# this function will try request file first. if doesn't work then selenium
def get_soup_page(url):
    try:
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')            
    except:
        soup = get_soup(url)
    return soup

In [4]:
bengali_to_arabic = {
    '০': '0',
    '১': '1',
    '২': '2',
    '৩': '3',
    '৪': '4',
    '৫': '5',
    '৬': '6',
    '৭': '7',
    '৮': '8',
    '৯': '9',
}

# Mapping for Bengali month names to English month names
bengali_month_to_english = {
    'জানুয়ারী': 'January',
    'ফেব্রুয়ারী': 'February',
    'মার্চ': 'March',
    'এপ্রিল': 'April',
    'মে': 'May',
    'জুন': 'June',
    'জুলাই': 'July',
    'আগস্ট': 'August',
    'সেপ্টেম্বর': 'September',
    'অক্টোবর': 'October',
    'নভেম্বর': 'November',
    'ডিসেম্বর': 'December',
}

def extract_date_time(text):
    # Regular expression pattern to match Bengali numerals
    bengali_pattern = re.compile(r'[০-৯]+')

    # Replace Bengali numerals with Arabic numerals
    text = bengali_pattern.sub(lambda x: ''.join([bengali_to_arabic[char] for char in x.group()]), text)

    # Regular expression pattern to match date and time information
    date_time_pattern = re.compile(r'(\d{1,2}\s*[জানুয়ারীফেব্রুয়ারীমার্চএপ্রিলমেজুনজুলাইআগস্টসেপ্টেম্বরঅক্টোবরনভেম্বরডিসেম্বর\s]+\s*\d{4}\s*\d{1,2}\s*:\s*\d{1,2})')

    # Find matches in the text
    match = date_time_pattern.search(text)

    if match:
        # Extract the matched date and time string
        date_time_str = match.group(1)

        # Clean up extra spaces
        date_time_str = ' '.join(date_time_str.split())

        # Map Bengali month names to English month names
        for bengali_month, english_month in bengali_month_to_english.items():
            date_time_str = date_time_str.replace(bengali_month, english_month)

        # Adjust the format for parsing
        formatted_date_time = datetime.strptime(date_time_str, "%d %B %Y %H : %M")
        formatted_date_time = formatted_date_time.strftime("%Y-%m-%d %H:%M")
        return formatted_date_time

    else:
        return "Not found"

In [5]:
url = "https://www.aajkaal.in"

In [6]:
soup = get_soup_page(url)
cat = soup.find_all("li", class_="nav-item")

In [7]:
catagory_links = []
catagory_links.append("/")
for c in cat:
    for tmp in c.find_all("a"):
        if "/news/" in tmp.get("href"):
            catagory_links.append(tmp.get("href"))
    

In [8]:
catagory_links

['/',
 '/news/1/kolkata',
 '/news/2/state',
 '/news/17/north_bengal',
 '/news/18/south_bengal',
 '/news/3/indianews',
 '/news/15/video',
 '/news/4/international',
 '/news/5/business',
 '/news/6/entertainment',
 '/news/7/sports',
 '/news/8/lifestyle',
 '/news/9/helth',
 '/news/10/education',
 '/news/11/opinion']

## getting all the catagory links done

In [31]:
main_Data = {} # Scraped Data

base_url = url
for cat in catagory_links: #out of main 6 category
    current_url = base_url+cat
    print(current_url)
    current_soup = get_soup_page(current_url)
    current_data = current_soup.find("div", class_="big__news")
    
    for c_data in current_data.find_all("a"): # all link in a single category.
        if "/story/" in c_data.get("href"):
            c_url = c_data.get("href")
            print(f"Extracting: : {c_url}")
            try:
                c_current_soup = get_soup_page(base_url+c_url)
                
                papar_name = "Aajkaal.in"
                source_link = base_url+c_url
                title = c_current_soup.find("div", class_="article__1--details").h1.text
                catagory_name = c_current_soup.find("div", class_="article__1--details").h5.a.text
                content = c_current_soup.find("div", class_="row mobile__width--90").p.text
                
                try:
                    writer, time = c_current_soup.find("div", class_="row mobile__width--90").h6.text.split("|")
                    time = extract_date_time(time)
                except:
                    print("writer and time could not seperate")
                    print("DEFAULT ACTION: Same value on both item")
                    writer = c_current_soup.find("div", class_="row mobile__width--90").h6.text
                    time = c_current_soup.find("div", class_="row mobile__width--90").h6.text
                
                try:
                    image = c_current_soup.find("div", class_="article__1 news__page").img.get("src")
                    respond = requests.get(url+image)
                    img = Image.open(BytesIO(respond.content))
                    image_path = "aajkaal.in/"+image.split("/")[-1]
                    img.save(image_path)
                except:
                    print("image not found")
                    print("DEFUALT ACTION")
                    image = "not found"
                    
                main_Data[title] = [papar_name, source_link, catagory_name, writer, time,  content, image_path]
                
            except:
                print("url not found, ", base_url+c_url)

https://www.aajkaal.in/
Extracting: : /story/5645/__earlier_i_was_the_hero_of_the_screen_now_the_hero_of_the_field__said_ferdous_ahmed_to_aajkaal_in
Extracting: : /story/5645/__earlier_i_was_the_hero_of_the_screen_now_the_hero_of_the_field__said_ferdous_ahmed_to_aajkaal_in
Extracting: : /story/5637/_19_women_and_14_minority_candidates_won_the_election_in_bangladesh
Extracting: : /story/5636/_47th_international_kolkata_book_fair_press_meet
image not found
DEFUALT ACTION
Extracting: : /story/5635/_who-wore-what-at-golden-globes-2024
Extracting: : /story/5633/akshay_kumar_supporting_bengal_warriors
image not found
DEFUALT ACTION
Extracting: : /story/5632/cm_launched_yogeshree_project
image not found
DEFUALT ACTION
Extracting: : /story/5631/dg_rajiv_kumar_on_sandeshkali
image not found
DEFUALT ACTION
Extracting: : /story/5623/kolkata_book_fair_will_be_hosted_in_bangladesh
Extracting: : /story/5622/bangladesh_book_fair
image not found
DEFUALT ACTION
Extracting: : /story/5620/adhir_ranjan_ch

Extracting: : /story/5547/minakshi_mukherjee_sent_buddhadeb_039_s_message_at_the_end_of_the_brigade_rally
Extracting: : /story/5543/salim-meenakshi_attack_the_center_and_state_together_from_the_brigade
Extracting: : /story/5534/sukanta_majumder_wrote_a_letter_to_the_governor_expressing_concern_about_the_incident_of_sandeshkhali
Extracting: : /story/5510/huge_crowd_in_kolkata_for_dyfi_brigade
Extracting: : /story/5477/_ira_khan-nupur_shikhare_indulged_in_exercise_even_before_marriage
Extracting: : /story/5475/injured_ed_officials_statement_record_
Extracting: : /story/5469/_14-day_ed_custody_of_shankar_adhya
Extracting: : /story/5461/_10_thousand_crores_in_ration_corruption_claims_ed
Extracting: : /story/5461/_10_thousand_crores_in_ration_corruption_claims_ed
Extracting: : /story/5443/admit_card_will_be_available_at_the_camp_office_
Extracting: : /story/5437/kolkata_airport_achieve_this_feat
Extracting: : /story/5365/some_trains_are_rerouted_some_trains_are_cancelled
Extracting: : /stor

Extracting: : /story/4750/royal_bengal_tiger_at_high_hill_area
Extracting: : /story/4735/manoj_malviya_appointed_as_state_police_advisory
Extracting: : /story/4728/peacock_died_due_to_car_accident
Extracting: : /story/4724/rajeev_kumar_is_new_dg_of_west_bengal_police
Extracting: : /story/4683/shootout_at_malda_youth_hospitalised
Extracting: : /story/4671/three_more_holidays_in_bengal_ni_act
Extracting: : /story/4624/chandranath_das_brings_himalayan_range_in_one_frame
Extracting: : /story/4588/aitc_youth_protest_march
Extracting: : /story/4578/decoity_in_malda_jewellery_shop
Extracting: : /story/4506/primary_tet_2023_exam
Extracting: : /story/4468/buxa_tiger_reserve_
Extracting: : /story/4348/_8_covid_affected_found_in_state_
Extracting: : /story/4184/bear_seen_at_dooars
Extracting: : /story/4175/techno_india_group_and_bengal_chamber_organised_inter_school_futsol_tournament
Extracting: : /story/4119/leopard_rescued_in_birpara_block_tea_garden
Extracting: : /story/4098/primary_teachers_m

Extracting: : /story/5511/teary_jet_airways_founder_naresh_goyal_
Extracting: : /story/5509/ashok_gehlot_sachin_pilot_in_congress_039_election_committee
Extracting: : /story/5508/leopard_kills_3-year-old_in_tamil_nadu_village_
Extracting: : /story/5499/_26_girls_missing_from_bhopal_shelter_home
Extracting: : /story/5466/aditya_l1_reaches_halo_orbit_near_sun
Extracting: : /story/5463/maharashtra_covid_task_force_orders_holiday_returnees_to_isolate_for_5_days
Extracting: : /story/5451/uddhav_thackeray_says_what_he_will_do
Extracting: : /story/5449/north_india_including_delhi_to_face_severe_cold
Extracting: : /story/5447/mahadev_app_cash_courier_stands_by_allegations_against_bhupesh_baghel
Extracting: : /story/5445/ambati_rayudu_quits_jagan_reddy_039_s_party_week_after_joining
Extracting: : /story/5440/_774_fresh_covid_cases_reported_in_india_today_
Extracting: : /story/5436/pm_modi_writes_to_japanese_counterpart_offers_assistance
Extracting: : /story/5435/has_full_faith_in_use_of_evms:_p

Extracting: : /story/5567/dawood_ibrahim_s_birthplace_was_sold_at_auction
Extracting: : /story/5566/bangladesh_election_sheikh_hasina_wins_for_the_fifth_time
Extracting: : /story/5564/israel_released_photo_of_hamas_military_chief_deif
Extracting: : /story/5562/bangladesh_election_boycotted_by_opposition_bnp_paltan_office_still_locked_
Extracting: : /story/5559/maldives_suspends_3_ministers_
Extracting: : /story/5555/bangladesh_counts_votes_after_election_
Extracting: : /story/5553/bangladesh_elections_2024_update
Extracting: : /story/5548/flight_crew_died_just_before_take_off
Extracting: : /story/5538/protesters_in_tel_aviv_outside_netanyahus_home_call_for_election_
Extracting: : /story/5533/president_joe_biden_warns_of_donald_trumps_threat_to_united_states
Extracting: : /story/5524/ukraine_says_russian_missile_attack_kills_11_
Extracting: : /story/5518/bangladesh_election_update
Extracting: : /story/5514/missile_attack_on_israel_039_s_air_base
Extracting: : /story/5506/sheikh_hasina_o

Extracting: : /story/5517/india_u-19_cricket_team_in_final_
Extracting: : /story/5500/why_bcci_is_taking_so_much_time_to_declare_squad_for_afghanistan_series_
Extracting: : /story/5498/mohun_bagan_avenue_to_be_inaugurated_in_jalpaiguri_on_11th_february_
Extracting: : /story/5474/bengal_post_409_runs_bowlers_take_three_wicket_to_put_bengal_on_top
Extracting: : /story/5474/bengal_post_409_runs_bowlers_take_three_wicket_to_put_bengal_on_top
Extracting: : /story/5472/the_champion_of_african_nations_cup_will_get_7_million_dollars
Extracting: : /story/5468/sunil_gavaskar_wants_to_see_virat_kohli_and_rohit_sharma_in_t20_world_cup_
Extracting: : /story/5455/two_bihar_ranji_trophy_teams_turn_up_on_ground_start_delayed_
Extracting: : /story/5431/brazilian_legend_mario_zagallo_dies_at_the_age_of_92
Extracting: : /story/5428/ind_woman_beat_aus_woman_in_9_wickets
Extracting: : /story/5420/australia_moved_to_the_top_spot_in_icc_test_rankings
Extracting: : /story/5419/competition_for_the_icc_best_cri

Extracting: : /story/4643/_5_tips_to_maintain_perfect_balance_between_festival_time_and_quality__039_us_039_time_
Extracting: : /story/4642/yoga_tips_to_elevate_mood_and_fight_seasonal_affective_disorder_
Extracting: : /story/4614/are_non-alcoholic_drinks_setting_the_party_trend_
Extracting: : /story/4612/to_boost_your_immunity_keep_these_superfood_on_your_diet
Extracting: : /story/4608/_chowman_begins_its_oriental_duck_festival’23
https://www.aajkaal.in/news/9/helth
Extracting: : /story/327/news
Extracting: : /story/327/news
Extracting: : /story/323/news
Extracting: : /story/320/news
Extracting: : /story/318/news
Extracting: : /story/313/news
Extracting: : /story/311/news
Extracting: : /story/309/news
Extracting: : /story/306/news
Extracting: : /story/306/news
Extracting: : /story/304/news
Extracting: : /story/302/news
Extracting: : /story/301/news
Extracting: : /story/299/news
Extracting: : /story/296/news
https://www.aajkaal.in/news/10/education
Extracting: : /story/5588/hospital_ma

In [32]:
main_Data

{'EXCLUSIVE: আগে পর্দার নায়ক ছিলাম, এবার মাঠের নায়ক... আজকাল ডট ইনকে বললেন ফেরদৌস': ['Aajkaal.in',
  'https://www.aajkaal.in/story/5645/__earlier_i_was_the_hero_of_the_screen_now_the_hero_of_the_field__said_ferdous_ahmed_to_aajkaal_in',
  'বিনোদন',
  'Reporter: তপশ্রী গুপ্ত ',
  '2024-01-09 00:43',
  'কালো কাচের টেবিল। আশেপাশে শুধু পুষ্পস্তবক। কালো রিভলভিং চেয়ারে সাদা পোশাক, সাদা শাল গায়ে বাংলাদেশের আওয়ামি লিগের নবনির্বাচিত সাংসদ ফেরদৌস আহমেদ। রাজনীতির ময়দানে তিনিই কি আগামীর লম্বা রেসের ঘোড়া? কেনই বা ২৫ বছরের বিনোদন কেরিয়ার ছেড়ে পর্দার নায়ক মাঠের নায়ক হলেন? উত্তর খুঁজতে ঢাকায় তাঁর মুখোমুখি তপশ্রী গুপ্তপ্রশ্ন: বিনোদন দুনিয়ায় দীর্ঘ ২৫ বছর, হঠাৎ কেন রাজনীতিতে?ফেরদৌস: ব্যাপারটা কিন্তু হঠাৎ হয়নি। যদিও আমার কেরিয়ারে হঠাৎ করে অনেক কিছু ঘটে গিয়েছে। যেমন, বাসু চট্টোপাধ্যায় পরিচালিত ‘হঠাৎ বৃষ্টি’ ছবি (হাল্কা হাসি)। ছাত্রাবস্থাতেই আমি কিন্তু রাজনীতির সঙ্গে যুক্ত ছিলাম। অবশ্যই সাংস্কৃতিক দিক থেকে। তখন হয়তো এরকম স্বপ্ন ছিল না। তখন স্বপ্ন ছিল নায়ক হব, অভিনেতা হব। ২০০১-এ প্রথম জাতীয