### Importing necessary libraries

In [1]:
from time import sleep
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import re
import numpy as np
from tqdm import tqdm
from random import randint
import unicodedata

### Establishing connection with chromedriver

In [2]:
chromedriver_path = r'C:\Users\anwesh.gupta\Downloads\chromedriver_win32\chromedriver.exe'

In [3]:
driver = webdriver.Chrome(executable_path=chromedriver_path)
sleep(2)

### Taking input of data to be scraped

In [4]:
sources = []
destinations = []
print("enter -1 when done.")
print("-"*10)
while True:
    sources.append(input("From which city?\n"))
    if "-1" in sources: 
        sources.pop(-1)
        break
    destinations.append(input("Where to?\n"))
    if "-1" in destinations: 
        sources.pop(-1)
        destinations.pop(-1)
        break
    print("-"*10)

print("\nRoutes:")
for i in range(len(sources)):
    print(f"{sources[i]} => {destinations[i]}")

enter -1 when done.
----------
From which city?
BOM
Where to?
DEL
----------
From which city?
-1

Routes:
BOM => DEL


In [5]:
start_date = np.datetime64(input('Start Date, use YYYY-MM-DD format only '))
end_date = np.datetime64(input('End Date, use YYYY-MM-DD format only '))
days = end_date - start_date
num_days = days.item().days

Start Date, use YYYY-MM-DD format only 2022-01-01
End Date, use YYYY-MM-DD format only 2022-02-19


### defining functions to scrap several data columns 

In [6]:
def get_airlines(soup):
    airline = []
    airlines = soup.find_all('span',class_='codeshares-airline-names',text=True)
    for i in airlines:
        airline.append(i.text)
    return airline
    
def get_total_stops(soup):
    stops_list = []
    stops = soup.find_all('div',class_='section stops')

    for i in stops:
        for j in i.find_all('span',class_='stops-text'):
               stops_list.append(j.text)
    return stops_list

def get_duration(soup):
    duration_list = []
    duration = soup.find_all('div' , class_='section duration allow-multi-modal-icons')
    for i in duration:
        for j in i.find_all('div',class_='top'):
            duration_list.append(j.text)
    return duration_list

def get_dep_time(soup):
    deptime_list=[]
    deptime= soup.find_all('div', class_='section times')
    
    for i in deptime:
        for j in i.find_all('span', class_='depart-time base-time'):
            deptime_list.append(j.text)
    return deptime_list
        
def get_arr_time(soup):
    arrtime_list=[]
    arrtime= soup.find_all('div', class_='section times')
    
    for i in arrtime:
        for j in i.find_all('span', class_='arrival-time base-time'):
            arrtime_list.append(j.text)
    return arrtime_list

def get_price(soup):
    prices = []
    price = soup.find_all('div',class_='col-price result-column js-no-dtog')

    for i in price:
        for j in i.find_all('span', class_='price-text'):
            prices.append(j.text)
         
    for k in range(len(prices)):                                              # for k in range(len(prices)):
            prices[k]=unicodedata.normalize("NFKD",prices[k])                     #    prices[k].replace('₹','')
    return prices

### Appending all columns to one dataframe and saving it

In [7]:
for i in range(len(sources)):
    column_names = ["Airline", "Source", "Destination", "Dep time", "Arr time","Duration" ,"Total stops", "Date","Price"]
    df = pd.DataFrame(columns = column_names)
    for j in tqdm(range(num_days+1)):
        
        if j % 10 == 0:         #captcha err
            driver.quit()
            driver = webdriver.Chrome(chromedriver_path)
            
        url = f"https://www.kayak.co.in/flights/{sources[i]}-{destinations[i]}/{start_date+j}"
        driver.get(url)
        sleep(15)
        
        
        
        try:
            show_more_button = driver.find_element_by_xpath('//a[@class = "moreButton"]') #caution: open driver on screen 
        except:
            
            
            input("solve captcha then hit enter")
            
        while True:
            try:
                show_more_button.click()
                driver.implicitly_wait(15)       #sleep(randint(5,7))
            except:
                break
                
        sleep(5)       # click more done
    
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        airlines = get_airlines(soup)
        total_stops = get_total_stops(soup)
        dep_time= get_dep_time(soup)
        arr_time= get_arr_time(soup)
        duration = get_duration(soup)
        prices = get_price(soup)
        df = df.append(pd.DataFrame({
            'Airline': airlines,
            'Date' : start_date+j,
            'Dep time' : dep_time,
            'Arr time' : arr_time,
            'Duration': duration,
            'Total stops' : total_stops,
            'Price' : prices
        }))
        
        
    df['Source'] = sources[i]
    df['Destination'] = destinations[i]
    df = df.replace('₹','', regex=True)
    df = df.replace('\n','', regex=True)
    df = df.replace(' ','', regex=True)              #should hv used df['Price']
    df = df.replace(',','', regex=True)
    df = df.reset_index(drop = True)
    df["Price"]=df["Price"].astype(int)
    
    df.to_csv(f'{sources[i]}_{destinations[i]}.csv',index=False)
    print(f"Succesfully saved {sources[i]} => {destinations[i]} route as {sources[i]}_{destinations[i]}.csv ")
    
driver.quit()

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [54:15<00:00, 65.10s/it]


Succesfully saved BOM => DEL route as BOM_DEL.csv 


In [11]:
df

Unnamed: 0,Airline,Source,Destination,Dep time,Arr time,Duration,Total stops,Date,Price
0,Vistara,BOM,DEL,20:55,23:00,2h05m,direct,2022-01-01,6244
1,Vistara,BOM,DEL,12:25,14:30,2h05m,direct,2022-01-01,6244
2,Vistara,BOM,DEL,06:00,08:05,2h05m,direct,2022-01-01,6244
3,Vistara,BOM,DEL,19:45,21:55,2h10m,direct,2022-01-01,6244
4,Vistara,BOM,DEL,11:55,14:05,2h10m,direct,2022-01-01,6244
...,...,...,...,...,...,...,...,...,...
4898,SpiceJetAlaskaSeaplanes,BOM,DEL,09:15,15:00,5h45m,1stop,2022-02-19,30061
4899,SpiceJet,BOM,DEL,15:30,22:10,6h40m,1stop,2022-02-19,31796
4900,SpiceJet,BOM,DEL,11:20,17:35,6h15m,1stop,2022-02-19,31879
4901,SpiceJetAlaskaSeaplanes,BOM,DEL,14:55,21:15,6h20m,1stop,2022-02-19,33259
