In [5]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [15]:
def scrape_listings(url, marke):
    # Open the url
    r = requests.get(url)
    # Parse the html
    soup = BeautifulSoup(r.text, 'html.parser')
    
    # Find all div-elements with class 
    listings = soup.find_all('div', {'class': 'ListItem_wrapper__J_a_C'})
    # Iterate over the listing
    rows = []
    for element in listings:
        row = {}
        # Find the a-element (model of car)
        headers = element.find('a', {'class': 'ListItem_title__znV2I ListItem_title_new_design__lYiAv Link_link__pjU1l'})
        row['url'] = 'https://www.autoscout24.de' + headers.get('href')
        model = headers.find('h2')
        span_elements = model.find_all('span')
        for span_element in span_elements:
            span_element.decompose() 
        row['Marke'] = marke
        row['Model'] = model.text.replace(marke, "")
        
        # Find all details about car
        details = element.find('div', {'class': 'VehicleDetailTable_container__mUUbY'})
        for detail in details:
            text = detail.text
            if(text.endswith('km')):
                row['Kilometerstand'] = text.split(' ')[0].replace('.', '')
            elif(text == "Automatik" or text == "Schaltgetriebe"):
                row['Antrieb'] = text
            elif(len(text.split('/')) == 2):
                row['Baujahr'] = text.split('/')[1]
            elif(text.endswith('PS)')):
                row['Motorleistung'] = text.split(' ')[0]
    
        # Find price of car
        price = element.find('p', {'class': 'Price_price__WZayw PriceAndSeals_current_price__XscDn'})
        if(price == None):
            continue
        first_index_find = price.text.find('€') + 2
        last_index_find = price.text.find(',')
        row['Preis'] = price.text[first_index_find : last_index_find]
        
        rows.append(row)
        
    return rows

In [7]:
def scrape_details(df):
    all_details = []  # Tüm detayları depolamak için bir liste
    
    # Iterate over the rows of df
    for i in range(len(df)):
        # Open the URL
        r = requests.get(df['url'].iloc[i])
        # Parse the HTML
        soup = BeautifulSoup(r.text, 'html.parser')
        # Find the div-element with class 'DetailPage_slicesContainer__wHHae false'
        sections = soup.find_all('section')
        details = {}
        for section in sections:
            detail = {}
            details_grid = section.find('div', {'class': 'DetailsSection_childrenSection__NQLD7'})
            if details_grid == None:
                continue
            # Bilgileri ekrana bastır
            dl_elements = details_grid.find_all('dl', {'class': 'DataGrid_defaultDlStyle__969Qm'})
            if(len(dl_elements) == 0):
                continue
            for dl in dl_elements:
                dt_elements = dl.find_all('dt', {'class': 'DataGrid_defaultDtStyle__yzRR_'})
                dd_elements = dl.find_all('dd', {'class': 'DataGrid_defaultDdStyle__29SKf'})
                for dt, dd in zip(dt_elements, dd_elements):
                    if(dt.text == "Baujahr" or dt.text == "Kilometerstand"):
                        continue
                    detail[dt.text] = dd.text
                
            details.update(detail)
        all_details.append(details)
    details_df = pd.DataFrame(all_details)
    df = pd.concat([df, details_df], axis=1)
    return df

## Data Preprocessing

In [8]:
def drop_features(df):
    feature_list = ['url', 'Marke', 'Model', 'Kilometerstand', 'Preis', 'Motorleistung',  
                'Antrieb', 'Baujahr', 'Sitzplätze', 'Türen', 'Hubraum', 'Außenfarbe']
    drop_list = [col for col in df.columns if col not in feature_list]
    df.drop(drop_list, inplace=True, axis=1)
    return df

In [10]:
def preprocessing(df):
    df["Preis"] = df["Preis"].astype(str).str.replace('.', '').astype('float')
    df["Hubraum"] = df["Hubraum"].astype(str).str.replace(' cm³', '').astype('float')
    df["Hubraum"] = df["Hubraum"].astype(str).str.replace('.', '')
    return df

### Audi Cars Scraping

In [56]:
audi_urls = [
    "https://www.autoscout24.de/lst/audi?atype=C&cy=D&damaged_listing=exclude&desc=0&fregfrom=2010&fregto=2023&ocs_listing=include&powertype=kw&search_id=sphlth5ci4&sort=standard&source=homepage_search-mask&ustate=N%2CU%21",
    "https://www.autoscout24.de/lst/audi?atype=C&cy=D&damaged_listing=exclude&desc=0&fregfrom=2010&fregto=2023&ocs_listing=include&page=2&powertype=kw&search_id=sphlth5ci4&sort=standard&source=listpage_pagination&ustate=N%2CU%21",
    "https://www.autoscout24.de/lst/audi?atype=C&cy=D&damaged_listing=exclude&desc=0&fregfrom=2010&fregto=2023&ocs_listing=include&page=3&powertype=kw&search_id=sphlth5ci4&sort=standard&source=listpage_pagination&ustate=N%2CU%21",
    "https://www.autoscout24.de/lst/audi?atype=C&cy=D&damaged_listing=exclude&desc=0&fregfrom=2010&fregto=2023&ocs_listing=include&page=4&powertype=kw&search_id=sphlth5ci4&sort=standard&source=listpage_pagination&ustate=N%2CU%21",
    "https://www.autoscout24.de/lst/audi?atype=C&cy=D&damaged_listing=exclude&desc=0&fregfrom=2010&fregto=2023&ocs_listing=include&page=4&powertype=kw&search_id=sphlth5ci4&sort=standard&source=listpage_pagination&ustate=N%2CU%21",
    "https://www.autoscout24.de/lst/audi?atype=C&cy=D&damaged_listing=exclude&desc=0&fregfrom=2010&fregto=2023&ocs_listing=include&page=5&powertype=kw&search_id=sphlth5ci4&sort=standard&source=listpage_pagination&ustate=N%2CU%21"
            ]

In [64]:
def scraping_audi(url_list):
    merged_df = pd.DataFrame()
    for url in url_list:
        df = pd.DataFrame(scrape_listings(url), "Audi")
        merged_df = pd.concat([merged_df, df], axis=0, ignore_index=True)
    audi_details_df = scrape_details(merged_df)    
    audi_details_df = drop_features(audi_details_df)
    audi_details_df = preprocessing(audi_details_df)
    audi_details_df.to_csv("audi.csv")
    return audi_details_df

In [65]:
all_audi_cars = scraping_audi(audi_urls)

In [66]:
all_audi_cars.tail()

Unnamed: 0,url,Marke,Model,Kilometerstand,Antrieb,Baujahr,Motorleistung,Preis,Sitzplätze,Türen,Hubraum,Außenfarbe
106,https://www.autoscout24.de/angebote/audi-a3-li...,Audi,A3,6000,Automatik,2022,140,48980.0,5,5,1984,Weiß
107,https://www.autoscout24.de/angebote/audi-q5-40...,Audi,Q5,124750,Automatik,2019,140,28989.0,5,5,1968,Schwarz
108,https://www.autoscout24.de/angebote/audi-a5-sp...,Audi,A5,67000,Automatik,2019,140,28300.0,5,4,1984,Weiß
109,https://www.autoscout24.de/angebote/audi-a3-1-...,Audi,A3,124200,Schaltgetriebe,2015,81,11950.0,5,3,1197,Schwarz
110,https://www.autoscout24.de/angebote/audi-s3-2-...,Audi,S3,87913,Automatik,km),221,23360.0,5,3,1984,Grau


### Volkswagen Cars Scraping

In [11]:
vw_urls = [
    "https://www.autoscout24.de/lst/volkswagen?atype=C&cy=D&desc=0&fregfrom=2010&ocs_listing=include&sort=standard&source=homepage_search-mask&ustate=N%2CU",
    "https://www.autoscout24.de/lst/volkswagen?atype=C&cy=D&desc=0&fregfrom=2010&ocs_listing=include&page=2&search_id=16n7t8os3h3&sort=standard&source=listpage_pagination&ustate=N%2CU",
    "https://www.autoscout24.de/lst/volkswagen?atype=C&cy=D&desc=0&fregfrom=2010&ocs_listing=include&page=3&search_id=16n7t8os3h3&sort=standard&source=listpage_pagination&ustate=N%2CU",
    "https://www.autoscout24.de/lst/volkswagen?atype=C&cy=D&desc=0&fregfrom=2010&ocs_listing=include&page=4&search_id=16n7t8os3h3&sort=standard&source=listpage_pagination&ustate=N%2CU",
    "https://www.autoscout24.de/lst/volkswagen?atype=C&cy=D&desc=0&fregfrom=2010&ocs_listing=include&page=5&search_id=16n7t8os3h3&sort=standard&source=listpage_pagination&ustate=N%2CU",
    "https://www.autoscout24.de/lst/volkswagen?atype=C&cy=D&desc=0&fregfrom=2010&ocs_listing=include&page=6&search_id=16n7t8os3h3&sort=standard&source=listpage_pagination&ustate=N%2CU"
]

In [18]:
def scraping_vw(url_list):
    merged_df = pd.DataFrame()
    for url in url_list:
        df = pd.DataFrame(scrape_listings(url, "Volkswagen"))
        merged_df = pd.concat([merged_df, df], axis=0, ignore_index=True)
    vw_details_df = scrape_details(merged_df)    
    vw_details_df = drop_features(vw_details_df)
    vw_details_df = preprocessing(vw_details_df)
    vw_details_df.to_csv("volkswagen.csv")
    return vw_details_df

In [19]:
all_vw_cars = scraping_vw(vw_urls)

In [20]:
all_vw_cars.head()

Unnamed: 0,url,Marke,Model,Kilometerstand,Antrieb,Baujahr,Motorleistung,Preis,Sitzplätze,Türen,Hubraum,Außenfarbe
0,https://www.autoscout24.de/angebote/volkswagen...,Volkswagen,Beetle,135970,Schaltgetriebe,2014,77,9850.0,4,3,1197,Schwarz
1,https://www.autoscout24.de/angebote/volkswagen...,Volkswagen,CC,144589,Automatik,2014,118,10440.0,5,4,139,Braun
2,https://www.autoscout24.de/angebote/volkswagen...,Volkswagen,Passat Alltrack,292578,Schaltgetriebe,2014,103,7980.0,5,5,1968,Blau
3,https://www.autoscout24.de/angebote/volkswagen...,Volkswagen,Golf,68820,Schaltgetriebe,2012,63,8999.0,5,4,1197,Blau
4,https://www.autoscout24.de/angebote/volkswagen...,Volkswagen,Golf Variant,117019,Schaltgetriebe,2010,77,7991.0,5,5,1197,Blau
