# Scrapping first website

**Beautiful Soup** is a Python library for pulling data out of HTML and XML files.

In [38]:
import pandas as pd
from random import randint
from time import sleep
import os.path

#from google.colab import drive
#drive.mount('/content/drive')

brand_file_exists = os.path.exists('../datasets/cars-data/brands.csv')
model1_file_exists = os.path.exists('../datasets/cars-data/models-1.csv')
model2_file_exists = os.path.exists('../datasets/cars-data/models-2.csv')
modelType_file_exists = os.path.exists('../datasets/cars-data/modelTypes.csv')
rawData_file_exists = os.path.exists('../datasets/cars-data/raw-data.csv')

scrapData = 0 # enable=1 or disable=0 to scrap the data

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
import requests
from bs4 import BeautifulSoup

Source = 'cars-data.com'
CarDataURL = "https://www.cars-data.com"
page = requests.get(CarDataURL)

soup = BeautifulSoup(page.content, "html.parser")

In [23]:
def beautiful_soup(url):
    page = requests.get(url)
    return BeautifulSoup(page.content, "html.parser")

- '\A\s+|\s+\Z' -> '' will act like strip() removing all leading and trailing whitespace:
  - \A\s+ - matches 1 or more whitespace symbols at the start of the string
  - | - or
  - \s+\Z - matches 1 or more whitespace symbols at the end of the string
- '\n' -> ' ' will replace any newline with a space.

In [24]:
def clean_data(df_text):
    return df_text.replace({ r'\A\s+|\s+\Z': '', '\n' : ' '}, regex=True, inplace=True)

**Function to verify a valid URL**

In [25]:
from urllib.parse import urlparse

def is_url(url):
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False

In [26]:
results = soup.find("div", class_="footerbrands")
print(results.prettify())


<div class="row footerbrands">
 <a class="a_footer" href="/en/abarth" title="Abarth">
  Abarth
 </a>
 <a class="a_footer" href="/en/aiways" title="Aiways">
  Aiways
 </a>
 <a class="a_footer" href="/en/alfa-romeo" title="Alfa Romeo">
  Alfa Romeo
 </a>
 <a class="a_footer" href="/en/alpine" title="Alpine">
  Alpine
 </a>
 <a class="a_footer" href="/en/asia-motors" title="Asia Motors">
  Asia Motors
 </a>
 <a class="a_footer" href="/en/aston-martin" title="Aston Martin">
  Aston Martin
 </a>
 <a class="a_footer" href="/en/audi" title="Audi">
  Audi
 </a>
 <a class="a_footer" href="/en/austin" title="Austin">
  Austin
 </a>
 <a class="a_footer" href="/en/autobianchi" title="Autobianchi">
  Autobianchi
 </a>
 <a class="a_footer" href="/en/bentley" title="Bentley">
  Bentley
 </a>
 <a class="a_footer" href="/en/bmw" title="BMW">
  BMW
 </a>
 <a class="a_footer" href="/en/bugatti" title="Bugatti">
  Bugatti
 </a>
 <a class="a_footer" href="/en/buick" title="Buick">
  Buick
 </a>
 <a class="

# 1. Fetch all the brands

In [27]:
if(brand_file_exists == True and scrapData == 0):
    CarsData_brands_df = pd.read_csv ('../datasets/cars-data/brands.csv')
else:
    CarsData_brands_df = pd.DataFrame(columns=['make','InnerLink'])

    brands = results.find_all("a", class_="a_footer")
    for i, brand in enumerate(brands):
        inner_url = brand['href'] if (is_url(brand['href']) == True) else CarDataURL+brand['href']

        CarsData_brands_df.loc[i, ['make']] = brand.text.strip()
        CarsData_brands_df.loc[i, ['InnerLink']] = inner_url



In [28]:
display(CarsData_brands_df)

# save brands to CSV file after scrapping
if(scrapData == 1):
    CarsData_brands_df.to_csv('../datasets/cars-data/brands.csv', encoding='utf-8', index=False)

Unnamed: 0,make,InnerLink
0,Abarth,https://www.cars-data.com/en/abarth
1,Aiways,https://www.cars-data.com/en/aiways
2,Alfa Romeo,https://www.cars-data.com/en/alfa-romeo
3,Alpine,https://www.cars-data.com/en/alpine
4,Asia Motors,https://www.cars-data.com/en/asia-motors
...,...,...
89,Triumph,https://www.cars-data.com/en/triumph
90,TVR,https://www.cars-data.com/en/tvr
91,Volkswagen,https://www.cars-data.com/en/volkswagen
92,Volvo,https://www.cars-data.com/en/volvo


In [30]:
if(model1_file_exists == True and scrapData == 0):
    CarsData_models_df = pd.read_csv ('../datasets/cars-data/models-1.csv')
else:
    CarsData_models_df = pd.DataFrame(columns=['make','model','InnerLink'])

    for b, CD_brand in CarsData_brands_df.iterrows():
        print(CD_brand['make'], CD_brand['InnerLink'])
        m_soup = beautiful_soup(CD_brand['InnerLink'])
        results = m_soup.find("section", class_="models")

        models = results.find_all("a")
        for i, model in enumerate(models):
            inner_url = model['href'] if (is_url(model['href']) == True) else CarDataURL+model['href']

            CarsData_models_df.loc[b, ['make']] = CD_brand['make']
            CarsData_models_df.loc[b, ['model']] = model.text.strip()
            CarsData_models_df.loc[b, ['InnerLink']] = inner_url

        sleep(randint(3,7))

Abarth https://www.cars-data.com/en/abarth
Aiways https://www.cars-data.com/en/aiways
Alfa Romeo https://www.cars-data.com/en/alfa-romeo
Alpine https://www.cars-data.com/en/alpine
Asia Motors https://www.cars-data.com/en/asia-motors
Aston Martin https://www.cars-data.com/en/aston-martin
Audi https://www.cars-data.com/en/audi
Austin https://www.cars-data.com/en/austin
Autobianchi https://www.cars-data.com/en/autobianchi
Bentley https://www.cars-data.com/en/bentley
BMW https://www.cars-data.com/en/bmw
Bugatti https://www.cars-data.com/en/bugatti
Buick https://www.cars-data.com/en/buick
Cadillac https://www.cars-data.com/en/cadillac
Carver https://www.cars-data.com/en/carver
Chevrolet https://www.cars-data.com/en/chevrolet
Chrysler https://www.cars-data.com/en/chrysler
Citroen https://www.cars-data.com/en/citroen
Corvette https://www.cars-data.com/en/corvette
Cupra https://www.cars-data.com/en/cupra
Dacia https://www.cars-data.com/en/dacia
Daewoo https://www.cars-data.com/en/daewoo
Daihat

In [33]:
clean_data(CarsData_models_df)
CarsData_models_df = CarsData_models_df[CarsData_models_df.model != '']
display(CarsData_models_df)

# save models to CSV file after scrapping
if(scrapData == 1):
    CarsData_models_df.to_csv('../datasets/cars-data/models-1.csv', encoding='utf-8', index=False)

Unnamed: 0,make,model,InnerLink
0,Abarth,Abarth 595 Cabrio,https://www.cars-data.com/en/abarth/595-cabrio
1,Aiways,Aiways U5,https://www.cars-data.com/en/aiways/u5
2,Alfa Romeo,Alfa Romeo 4C Spider,https://www.cars-data.com/en/alfa-romeo/4c-spider
3,Alpine,Alpine A110,https://www.cars-data.com/en/alpine/a110
4,Asia Motors,Asia Motors Rocsta,https://www.cars-data.com/en/asia-motors/rocsta
...,...,...,...
89,Triumph,Triumph TR7 Coupe,https://www.cars-data.com/en/triumph/tr7-coupe
90,TVR,TVR Chimaera,https://www.cars-data.com/en/tvr/chimaera
91,Volkswagen,Volkswagen Caddy Cargo,https://www.cars-data.com/en/volkswagen/caddy-...
92,Volvo,Volvo XC60,https://www.cars-data.com/en/volvo/xc60


In [35]:
if(model2_file_exists == True and scrapData == 0):
    CarsData_md_df = pd.read_csv ('../datasets/cars-data/models-2.csv')
else:
    CarsData_md_df = pd.DataFrame(columns=['make','model','ModelLink'])

    for m, CD_model_m in CarsData_models_df.iterrows():
        mm_soup = beautiful_soup(CD_model_m['InnerLink'])
        m_results = mm_soup.find_all("section", class_="models")

        m_models = m_results[0].find_all("a")
        for i, m_model in enumerate(m_models):
            inner_url = m_model['href'] if (is_url(m_model['href']) == True) else CarDataURL+m_model['href']
            print(inner_url)

            CarsData_md_df.loc[m, ['make']] = CD_model_m['make']
            CarsData_md_df.loc[m, ['model']] = m_model['title']
            CarsData_md_df.loc[m, ['ModelLink']] = inner_url

        sleep(randint(3,7))

https://www.cars-data.com/en/abarth
https://www.cars-data.com/en/abarth-595-cabrio-2016/3562
https://www.cars-data.com/en/aiways
https://www.cars-data.com/en/aiways-u5-2020/4733
https://www.cars-data.com/en/alfa-romeo
https://www.cars-data.com/en/alfa-romeo-4c-spider-2015/3254
https://www.cars-data.com/en/alpine
https://www.cars-data.com/en/alpine-a110-2018/4517
https://www.cars-data.com/en/asia-motors
https://www.cars-data.com/en/asia-motors-rocsta-1993/3025
https://www.cars-data.com/en/aston-martin
https://www.cars-data.com/en/aston-martin-db11-coupe-2016/3563
https://www.cars-data.com/en/audi
https://www.cars-data.com/en/audi-rs5-sportback-2020/4625
https://www.cars-data.com/en/audi-rs5-sportback-2019/4128
https://www.cars-data.com/en/austin
https://www.cars-data.com/en/austin-maestro-1983/3026
https://www.cars-data.com/en/autobianchi
https://www.cars-data.com/en/autobianchi-a112-1980/3035
https://www.cars-data.com/en/bentley
https://www.cars-data.com/en/bentley-continental-gt-conve

In [36]:
display(CarsData_md_df)

# save models to CSV file after scrapping
if(scrapData == 1):
    CarsData_md_df.to_csv('../datasets/cars-data/models-2.csv', encoding='utf-8', index=False)

Unnamed: 0,make,model,ModelLink
0,Abarth,Abarth 595 Cabrio,https://www.cars-data.com/en/abarth-595-cabrio...
1,Aiways,Aiways U5,https://www.cars-data.com/en/aiways-u5-2020/4733
2,Alfa Romeo,Alfa Romeo 4C Spider,https://www.cars-data.com/en/alfa-romeo-4c-spi...
3,Alpine,Alpine A110,https://www.cars-data.com/en/alpine-a110-2018/...
4,Asia Motors,Asia Motors Rocsta,https://www.cars-data.com/en/asia-motors-rocst...
...,...,...,...
89,Triumph,Triumph TR7 Coupe,https://www.cars-data.com/en/triumph-tr7-coupe...
90,TVR,TVR Chimaera,https://www.cars-data.com/en/tvr-chimaera-1994...
91,Volkswagen,Volkswagen Caddy Cargo,https://www.cars-data.com/en/volkswagen-caddy-...
92,Volvo,Volvo XC60,https://www.cars-data.com/en/volvo-xc60-2008/2968


In [42]:
scrapData = 0
if(modelType_file_exists == True and scrapData == 0):
    CD_mt_df = pd.read_csv ('../datasets/cars-data/modelTypes.csv')
else:
    CD_mt_df = pd.DataFrame(columns=['make','model','ModelType','ModelTypeLink'])
    j=0
    for t, CD_model_t in CarsData_md_df.iterrows():
        mt_soup = beautiful_soup(CD_model_t['ModelLink'])
        mt_results = mt_soup.find_all("section", class_="types")
        mt_results = mt_results[0].find("div", class_="col-8")

        mt_models = mt_results.find_all("a", class_=False) # get <a> tags which has no class attributes
        for i, mt_model in enumerate(mt_models):
            inner_url = mt_model['href'] if (is_url(mt_model['href']) == True) else CarDataURL+mt_model['href']

            CD_mt_df.loc[j, ['make']] = CD_model_t['make']
            CD_mt_df.loc[j, ['model']] = CD_model_t['model']
            CD_mt_df.loc[j, ['ModelType']] = mt_model['title']
            CD_mt_df.loc[j, ['ModelTypeLink']] = inner_url
            j=j+1


        sleep(randint(3,7))

In [43]:
display(CD_mt_df)

# save model types to CSV file after scrapping
if(scrapData == 1):
    CD_mt_df.to_csv('../datasets/cars-data/modelTypes.csv', encoding='utf-8', index=False)

Unnamed: 0,make,model,ModelType,ModelTypeLink
0,Abarth,Abarth 595 Cabrio,2016 Abarth 595 Cabrio 1.4 T-Jet 145 specs,https://www.cars-data.com/en/abarth-595-cabrio...
1,Abarth,Abarth 595 Cabrio,2016 Abarth 595 Cabrio 1.4 T-Jet 145 specs,https://www.cars-data.com/en/abarth-595-cabrio...
2,Abarth,Abarth 595 Cabrio,2016 Abarth 595 Cabrio 1.4 T-Jet 165 Turismo s...,https://www.cars-data.com/en/abarth-595-cabrio...
3,Abarth,Abarth 595 Cabrio,2016 Abarth 595 Cabrio 1.4 T-Jet 165 Turismo s...,https://www.cars-data.com/en/abarth-595-cabrio...
4,Abarth,Abarth 595 Cabrio,2016 Abarth 595 Cabrio 1.4 T-Jet 180 Competizi...,https://www.cars-data.com/en/abarth-595-cabrio...
...,...,...,...,...
497,Volvo,Volvo XC60,2008 Volvo XC60 D5 AWD Momentum specs,https://www.cars-data.com/en/volvo-xc60-d5-awd...
498,Volvo,Volvo XC60,2008 Volvo XC60 D5 AWD Summum specs,https://www.cars-data.com/en/volvo-xc60-d5-awd...
499,Volvo,Volvo XC60,2008 Volvo XC60 D5 AWD Summum specs,https://www.cars-data.com/en/volvo-xc60-d5-awd...
500,Yugo,Yugo 45/55,1983 Yugo 45 E specs,https://www.cars-data.com/en/yugo-45-e-specs/5...


In [72]:
scrapData == 1
if(rawData_file_exists == True and scrapData == 0):
    CD_ms_df = pd.read_csv ('../datasets/cars-data/raw-data.csv')
else:
    all_cols = ['make','model','generation','year_from','year_to','series','trim',
                'body_type','load_height_mm','number_of_seats','length_mm','width_mm','height_mm',
                'wheelbase_mm','front_track_mm','rear_track_mm','curb_weight_kg','wheel_size_r14',
                'ground_clearance_mm','trailer_load_with_brakes_kg','payload_kg','back_track_width_mm',
                'front_track_width_mm','clearance_mm','full_weight_kg','front_rear_axle_load_kg',
                'max_trunk_capacity_l','cargo_compartment_length_width_height_mm',
                'cargo_volume_m3','minimum_trunk_capacity_l','maximum_torque_n_m','injection_type',
                'overhead_camshaft','cylinder_layout','number_of_cylinders','compression_ratio',
                'engine_type','valves_per_cylinder','boost_type','cylinder_bore_mm','stroke_cycle_mm',
                'engine_placement','cylinder_bore_and_stroke_cycle_mm','turnover_of_maximum_torque_rpm',
                'max_power_kw','presence_of_intercooler','capacity_cm3','engine_hp','engine_hp_rpm',
                'drive_wheels','bore_stroke_ratio','number_of_gears','turning_circle_m','transmission',
                'mixed_fuel_consumption_per_100_km_l','range_km','emission_standards',
                'fuel_tank_capacity_l','acceleration_0_100_km/h_s','max_speed_km_per_h',
                'city_fuel_per_100km_l','CO2_emissions_g/km','fuel_grade','highway_fuel_per_100km_l',
                'back_suspension','rear_brakes','front_brakes','front_suspension','steering_type',
                'car_class','country_of_origin','number_of_doors','safety_assessment','rating_name',
                'battery_capacity_KW_per_h','electric_range_km','charging_time_h']
    # CD_ms_df = pd.DataFrame(columns=['Year','Brand','Model','ModelType','Engine','FuelType','Power(kW)',
    #                              'DriveWheel', 'TopSpeed', 'EngineCapacity', 'Source'])
    
    CD_ms_df = pd.DataFrame(columns=all_cols)
    sj=0
    for s, CD_model_s in CD_mt_df.iterrows():

        ms_soup = beautiful_soup(CD_model_s['ModelTypeLink'])
        ms_results = ms_soup.find_all("table")

        #display(ms_results)
        print(CD_model_s['ModelTypeLink'])
        for t, table in enumerate(ms_results):
            spec_rows = table.find_all("tr", class_=True)
            for i, spec_row in enumerate(spec_rows):
                spec = spec_row.find_all("td", class_=True)

                CD_ms_df.loc[sj, ['make']] = CD_model_s['make']
                CD_ms_df.loc[sj, ['model']] = CD_model_s['model']
                #CD_ms_df.loc[sj, ['ModelType']] = CD_model_s['ModelType']
                
                generation =  spec[1].text.strip() if (spec[0].text.strip() == 'Engine/motor Type:') else ''
                if(generation != ''):
                    CD_ms_df.loc[sj, ['generation']] =  generation

                year_from = CD_model_s['ModelType'].split()
                CD_ms_df.loc[sj, ['year_from']] = year_from[0]

                year_to = CD_model_s['ModelType'].split()
                CD_ms_df.loc[sj+t, ['year_to']] = year_from[0]
                
                trim =  spec[1].text.strip() if (spec[0].text.strip() == 'Trim:') else ''
                if(trim != ''):
                    CD_ms_df.loc[sj, ['trim']] =  trim

                body_type =  spec[1].text.strip() if (spec[0].text.strip() == 'Body Type:') else ''
                if(body_type != ''):
                    CD_ms_df.loc[sj, ['body_type']] =  body_type

                load_height_mm =  spec[1].text.strip() if (spec[0].text.strip() == 'Height (mm):') else ''
                if(load_height_mm != ''):
                    CD_ms_df.loc[sj, ['load_height_mm']] =  load_height_mm

                number_of_seats =  spec[1].text.strip() if (spec[0].text.strip() == 'Number Of Seats:') else ''
                if(number_of_seats != ''):
                    CD_ms_df.loc[sj, ['number_of_seats']] =  number_of_seats

                length_mm =  spec[1].text.strip() if (spec[0].text.strip() == 'Length:') else ''
                if(length_mm != ''):
                    CD_ms_df.loc[sj, ['length_mm']] =  length_mm

                width_mm =  spec[1].text.strip() if (spec[0].text.strip() == 'Width:') else ''
                if(width_mm != ''):
                    CD_ms_df.loc[sj, ['width_mm']] =  width_mm

                height_mm =  spec[1].text.strip() if (spec[0].text.strip() == 'Height:') else ''
                if(height_mm != ''):
                    CD_ms_df.loc[sj, ['height_mm']] =  height_mm

                wheelbase_mm =  spec[1].text.strip() if (spec[0].text.strip() == 'Wheel Base:') else ''
                if(wheelbase_mm != ''):
                    CD_ms_df.loc[sj, ['wheelbase_mm']] =  wheelbase_mm

                front_track_mm =  spec[1].text.strip() if (spec[0].text.strip() == 'Front Tire Size:') else ''
                if(front_track_mm != ''):
                    CD_ms_df.loc[sj, ['front_track_mm']] =  front_track_mm

                rear_track_mm =  spec[1].text.strip() if (spec[0].text.strip() == 'Rear Tire Size:') else ''
                if(rear_track_mm != ''):
                    CD_ms_df.loc[sj, ['rear_track_mm']] =  rear_track_mm

                curb_weight_kg =  spec[1].text.strip() if (spec[0].text.strip() == 'Curb Weight:') else ''
                if(curb_weight_kg != ''):
                    CD_ms_df.loc[sj, ['curb_weight_kg']] =  curb_weight_kg

                wheel_size_r14 =  spec[1].text.strip() if (spec[0].text.strip() == 'Wheel Size:') else ''
                if(wheel_size_r14 != ''):
                    CD_ms_df.loc[sj, ['wheel_size_r14']] =  wheel_size_r14

                ground_clearance_mm =  spec[1].text.strip() if (spec[0].text.strip() == 'Ground Clearance:') else ''
                if(ground_clearance_mm != ''):
                    CD_ms_df.loc[sj, ['ground_clearance_mm']] =  ground_clearance_mm

                trailer_load_with_brakes_kg =  spec[1].text.strip() if (spec[0].text.strip() == 'Trailer with brakes (Kg):') else ''
                if(trailer_load_with_brakes_kg != ''):
                    CD_ms_df.loc[sj, ['trailer_load_with_brakes_kg']] =  trailer_load_with_brakes_kg

                payload_kg =  spec[1].text.strip() if (spec[0].text.strip() == 'Payload (Kg):') else ''
                if(payload_kg != ''):
                    CD_ms_df.loc[sj, ['payload_kg']] =  payload_kg

                front_track_width_mm =  spec[1].text.strip() if (spec[0].text.strip() == 'Front Track width (mm)') else ''
                if(front_track_width_mm != ''):
                    CD_ms_df.loc[sj, ['front_track_width_mm']] =  front_track_width_mm

                clearance_mm =  spec[1].text.strip() if (spec[0].text.strip() == 'Clearance (mm):') else ''
                if(clearance_mm != ''):
                    CD_ms_df.loc[sj, ['clearance_mm']] =  clearance_mm

                full_weight_kg =  spec[1].text.strip() if (spec[0].text.strip() == 'Full weight (Kg):') else ''
                if(full_weight_kg != ''):
                    CD_ms_df.loc[sj, ['full_weight_kg']] =  full_weight_kg

                front_rear_axle_load_kg =  spec[1].text.strip() if (spec[0].text.strip() == 'Front Rear axle load (kg):') else ''
                if(front_rear_axle_load_kg != ''):
                    CD_ms_df.loc[sj, ['front_rear_axle_load_kg']] =  front_rear_axle_load_kg

                max_trunk_capacity_l =  spec[1].text.strip() if (spec[0].text.strip() == 'Max trunk capacity (L):') else ''
                if(max_trunk_capacity_l != ''):
                    CD_ms_df.loc[sj, ['max_trunk_capacity_l']] =  max_trunk_capacity_l

                cargo_compartment_length_width_height_mm =  spec[1].text.strip() if (spec[0].text.strip() == 'Cargo Compartment (mm):') else ''
                if(cargo_compartment_length_width_height_mm != ''):
                    CD_ms_df.loc[sj, ['cargo_compartment_length_width_height_mm']] =  cargo_compartment_length_width_height_mm

                cargo_volume_m3 =  spec[1].text.strip() if (spec[0].text.strip() == 'Cargo volume (m3):') else ''
                if(cargo_volume_m3 != ''):
                    CD_ms_df.loc[sj, ['cargo_volume_m3']] =  cargo_volume_m3


                minimum_trunk_capacity_l =  spec[1].text.strip() if (spec[0].text.strip() == 'Min Trunk Capacity (L):') else ''
                if(minimum_trunk_capacity_l != ''):
                    CD_ms_df.loc[sj, ['minimum_trunk_capacity_l']] =  minimum_trunk_capacity_l


                maximum_torque_n_m =  spec[1].text.strip() if (spec[0].text.strip() == 'Max torque(nm):') else ''
                if(maximum_torque_n_m != ''):
                    CD_ms_df.loc[sj, ['maximum_torque_n_m']] =  maximum_torque_n_m


                injection_type =  spec[1].text.strip() if (spec[0].text.strip() == 'Injection type:') else ''
                if(injection_type != ''):
                    CD_ms_df.loc[sj, ['injection_type']] =  injection_type


                overhead_camshaft =  spec[1].text.strip() if (spec[0].text.strip() == 'Overhead camshaft:') else ''
                if(overhead_camshaft != ''):
                    CD_ms_df.loc[sj, ['overhead_camshaft']] =  overhead_camshaft


                cylinder_layout =  spec[1].text.strip() if (spec[0].text.strip() == 'Cylinder layout:') else ''
                if(cylinder_layout != ''):
                    CD_ms_df.loc[sj, ['cylinder_layout']] =  cylinder_layout


                number_of_cylinders =  spec[1].text.strip() if (spec[0].text.strip() == 'No. of cylinder:') else ''
                if(number_of_cylinders != ''):
                    CD_ms_df.loc[sj, ['number_of_cylinders']] =  number_of_cylinders


                compression_ratio =  spec[1].text.strip() if (spec[0].text.strip() == 'Compression ratio:') else ''
                if(compression_ratio != ''):
                    CD_ms_df.loc[sj, ['compression_ratio']] =  compression_ratio
                
                engine_type =  spec[1].text.strip() if (spec[0].text.strip() == 'Engine/motor Type:') else ''
                if(engine_type != ''):
                    CD_ms_df.loc[sj, ['engine_type']] =  engine_type

                valves_per_cylinder =  spec[1].text.strip() if (spec[0].text.strip() == 'Valves Per Cylinder:') else ''
                if(valves_per_cylinder != ''):
                    CD_ms_df.loc[sj, ['valves_per_cylinder']] =  valves_per_cylinder

                boost_type =  spec[1].text.strip() if (spec[0].text.strip() == 'Boost type:') else ''
                if(boost_type != ''):
                    CD_ms_df.loc[sj, ['boost_type']] =  boost_type

                cylinder_bore_mm =  spec[1].text.strip() if (spec[0].text.strip() == 'Cylinder bore (mm):') else ''
                if(cylinder_bore_mm != ''):
                    CD_ms_df.loc[sj, ['cylinder_bore_mm']] =  cylinder_bore_mm

                stroke_cycle_mm =  spec[1].text.strip() if (spec[0].text.strip() == 'Stroke cycle (mm):') else ''
                if(stroke_cycle_mm != ''):
                    CD_ms_df.loc[sj, ['stroke_cycle_mm']] =  stroke_cycle_mm

                engine_placement =  spec[1].text.strip() if (spec[0].text.strip() == 'Engine placement:') else ''
                if(engine_placement != ''):
                    CD_ms_df.loc[sj, ['engine_placement']] =  engine_placement

                cylinder_bore_and_stroke_cycle_mm =  spec[1].text.strip() if (spec[0].text.strip() == 'Bore X Stroke:') else ''
                if(cylinder_bore_and_stroke_cycle_mm != ''):
                    CD_ms_df.loc[sj, ['cylinder_bore_and_stroke_cycle_mm']] =  cylinder_bore_and_stroke_cycle_mm

                turnover_of_maximum_torque_rpm =  spec[1].text.strip() if (spec[0].text.strip() == 'Max Torque Rpm:') else ''
                if(turnover_of_maximum_torque_rpm != ''):
                    CD_ms_df.loc[sj, ['turnover_of_maximum_torque_rpm']] =  turnover_of_maximum_torque_rpm

                
                max_power_kw = spec[1].text.strip() if (spec[0].text.strip() == 'Total Max. Power (kW):') else ''
                if(max_power_kw != ''):
                    CD_ms_df.loc[sj, ['max_power_kw']] = max_power_kw

                presence_of_intercooler =  spec[1].text.strip() if (spec[0].text.strip() == 'Turbo:') else ''
                if(presence_of_intercooler != ''):
                    CD_ms_df.loc[sj, ['presence_of_intercooler']] =  presence_of_intercooler

                capacity_cm3 =  spec[1].text.strip() if (spec[0].text.strip() == 'Capacity (cm3):') else ''
                if(capacity_cm3 != ''):
                    CD_ms_df.loc[sj, ['capacity_cm3']] =  capacity_cm3

                
                engine_hp =  spec[1].text.strip() if (spec[0].text.strip() == 'Power (hp):') else ''
                if(engine_hp != ''):
                    CD_ms_df.loc[sj, ['engine_hp']] = engine_hp

                engine_hp_rpm =  spec[1].text.strip() if (spec[0].text.strip() == 'Max. Power Rpm:') else ''
                if(engine_hp_rpm != ''):
                    CD_ms_df.loc[sj, ['engine_hp_rpm']] =  engine_hp_rpm
                    
                drive_wheels = spec[1].text.strip() if (spec[0].text.strip() == 'Drive Wheel :') else ''
                if(drive_wheels != ''):
                    CD_ms_df.loc[sj, ['drive_wheels']] = drive_wheels

                bore_stroke_ratio =  spec[1].text.strip() if (spec[0].text.strip() == 'Bore X Stroke:') else ''
                if(bore_stroke_ratio != ''):
                    CD_ms_df.loc[sj, ['bore_stroke_ratio']] =  bore_stroke_ratio

                number_of_gears =  spec[1].text.strip() if (spec[0].text.strip() == 'No of gears:') else ''
                if(number_of_gears != ''):
                    CD_ms_df.loc[sj, ['number_of_gears']] =  number_of_gears

                turning_circle_m =  spec[1].text.strip() if (spec[0].text.strip() == 'Turning Circle :') else ''
                if(turning_circle_m != ''):
                    CD_ms_df.loc[sj, ['turning_circle_m']] =  turning_circle_m

                transmission =  spec[1].text.strip() if (spec[0].text.strip() == 'Transmission:') else ''
                if(transmission != ''):
                    CD_ms_df.loc[sj, ['transmission']] =  transmission

                mixed_fuel_consumption_per_100_km_l =  spec[1].text.strip() if (spec[0].text.strip() == 'Practice Consumption Monitor:') else ''
                if(mixed_fuel_consumption_per_100_km_l != ''):
                    CD_ms_df.loc[sj, ['mixed_fuel_consumption_per_100_km_l']] =  mixed_fuel_consumption_per_100_km_l

                range_km =  spec[1].text.strip() if (spec[0].text.strip() == 'Range:') else ''
                if(range_km != ''):
                    CD_ms_df.loc[sj, ['range_km']] =  range_km

                emission_standards =  spec[1].text.strip() if (spec[0].text.strip() == 'Co2 Emissions:') else ''
                if(emission_standards != ''):
                    CD_ms_df.loc[sj, ['emission_standards']] =  emission_standards

                fuel_tank_capacity_l =  spec[1].text.strip() if (spec[0].text.strip() == 'Fuel Tank Capacity:') else ''
                if(fuel_tank_capacity_l != ''):
                    CD_ms_df.loc[sj, ['fuel_tank_capacity_l']] =  fuel_tank_capacity_l

                acceleration_0_100_km =  spec[1].text.strip() if (spec[0].text.strip() == 'Acceleration 0-100 Km / H:') else ''
                if(acceleration_0_100_km != ''):
                    CD_ms_df.loc[sj, ['acceleration_0_100_km/h_s']] =  acceleration_0_100_km

                
                max_speed_km_per_h =  spec[1].text.strip() if (spec[0].text.strip() == 'Top Speed:') else ''
                if(max_speed_km_per_h != ''):
                    CD_ms_df.loc[sj, ['max_speed_km_per_h']] = max_speed_km_per_h

                city_fuel_per_100km_l =  spec[1].text.strip() if (spec[0].text.strip() == 'Urban Consumption:') else ''
                if(city_fuel_per_100km_l != ''):
                    CD_ms_df.loc[sj, ['city_fuel_per_100km_l']] =  city_fuel_per_100km_l

                CO2_emissions_g =  spec[1].text.strip() if (spec[0].text.strip() == 'Co2 Emissions:') else ''
                if(CO2_emissions_g != ''):
                    CD_ms_df.loc[sj, ['CO2_emissions_g/km']] =  CO2_emissions_g

                fuel_grade =  spec[1].text.strip() if (spec[0].text.strip() == 'Fuel Grade:') else ''
                if(fuel_grade != ''):
                    CD_ms_df.loc[sj, ['fuel_grade']] =  fuel_grade

                highway_fuel_per_100km_l =  spec[1].text.strip() if (spec[0].text.strip() == 'Combined Consumption:') else ''
                if(highway_fuel_per_100km_l != ''):
                    CD_ms_df.loc[sj, ['highway_fuel_per_100km_l']] =  highway_fuel_per_100km_l

                back_suspension =  spec[1].text.strip() if (spec[0].text.strip() == 'Rear Suspension:') else ''
                if(back_suspension != ''):
                    CD_ms_df.loc[sj, ['back_suspension']] =  back_suspension

                rear_brakes =  spec[1].text.strip() if (spec[0].text.strip() == 'Rear Brakes:') else ''
                if(rear_brakes != ''):
                    CD_ms_df.loc[sj, ['rear_brakes']] =  rear_brakes

                front_brakes =  spec[1].text.strip() if (spec[0].text.strip() == 'Front Brakes:') else ''
                if(front_brakes != ''):
                    CD_ms_df.loc[sj, ['front_brakes']] =  front_brakes

                front_suspension =  spec[1].text.strip() if (spec[0].text.strip() == 'Front Suspension:') else ''
                if(front_suspension != ''):
                    CD_ms_df.loc[sj, ['front_suspension']] =  front_suspension

                steering_type =  spec[1].text.strip() if (spec[0].text.strip() == 'Steering Type:') else ''
                if(steering_type != ''):
                    CD_ms_df.loc[sj, ['steering_type']] =  steering_type

                car_class =  spec[1].text.strip() if (spec[0].text.strip() == 'Car class:') else ''
                if(car_class != ''):
                    CD_ms_df.loc[sj, ['car_class']] =  car_class

                country_of_origin =  spec[1].text.strip() if (spec[0].text.strip() == 'Country of origin:') else ''
                if(country_of_origin != ''):
                    CD_ms_df.loc[sj, ['country_of_origin']] =  country_of_origin

                number_of_doors =  spec[1].text.strip() if (spec[0].text.strip() == 'Body Type:') else ''
                if(number_of_doors != ''):
                    number_of_doors = number_of_doors.split(',')
                    CD_ms_df.loc[sj, ['number_of_doors']] =  number_of_doors[0]

                safety_assessment =  spec[1].text.strip() if (spec[0].text.strip() == 'Safety Assessment:') else ''
                if(safety_assessment != ''):
                    CD_ms_df.loc[sj, ['safety_assessment']] =  safety_assessment

                rating_name =  spec[1].text.strip() if (spec[0].text.strip() == 'Rating:') else ''
                if(rating_name != ''):
                    CD_ms_df.loc[sj, ['rating_name']] =  rating_name

                battery_capacity_KW_per_h =  spec[1].text.strip() if (spec[0].text.strip() == 'Battery Range:') else ''
                if(battery_capacity_KW_per_h != ''):
                    CD_ms_df.loc[sj, ['battery_capacity_KW_per_h']] =  battery_capacity_KW_per_h

                electric_range_km =  spec[1].text.strip() if (spec[0].text.strip() == 'Electric range (km):') else ''
                if(electric_range_km != ''):
                    CD_ms_df.loc[sj, ['electric_range_km']] =  electric_range_km

                charging_time_h =  spec[1].text.strip() if (spec[0].text.strip() == 'Charging time (h):') else ''
                if(charging_time_h != ''):
                    CD_ms_df.loc[sj, ['charging_time_h']] =  charging_time_h


        sleep(randint(3,7))
        sj=sj+1


https://www.cars-data.com/en/abarth-595-cabrio-1-4-t-jet-145-specs/73534
https://www.cars-data.com/en/abarth-595-cabrio-1-4-t-jet-145-specs/73535
https://www.cars-data.com/en/abarth-595-cabrio-1-4-t-jet-165-turismo-specs/73536
https://www.cars-data.com/en/abarth-595-cabrio-1-4-t-jet-165-turismo-specs/73537
https://www.cars-data.com/en/abarth-595-cabrio-1-4-t-jet-180-competizione-specs/73538
https://www.cars-data.com/en/abarth-595-cabrio-1-4-t-jet-180-competizione-specs/73539
https://www.cars-data.com/en/aiways-u5-standaard-specs/84688
https://www.cars-data.com/en/aiways-u5-showroom-specs/84689
https://www.cars-data.com/en/aiways-u5-premium-specs/84690
https://www.cars-data.com/en/alfa-romeo-4c-spider-1-750-tbi-specs/66221
https://www.cars-data.com/en/alpine-a110-pure-specs/85884
https://www.cars-data.com/en/alpine-a110-legende-specs/85885
https://www.cars-data.com/en/alpine-a110-premiere-edition-specs/85886
https://www.cars-data.com/en/asia-motors-rocsta-1-8-softtop-specs/68781
https:/

In [73]:
#CD_ms_df = CD_ms_df.dropna()
display(CD_ms_df)

# save model types to CSV file after scrapping
if(scrapData == 1):
    CD_ms_df.to_csv('../datasets/cars-data/raw-data.csv', encoding='utf-8', index=False)

Unnamed: 0,make,model,generation,year_from,year_to,series,trim,body_type,load_height_mm,number_of_seats,...,front_suspension,steering_type,car_class,country_of_origin,number_of_doors,safety_assessment,rating_name,battery_capacity_KW_per_h,electric_range_km,charging_time_h
0,Abarth,Abarth 595 Cabrio,fuel engine,2016,2016,,,"3-doors, convertible",,,...,coil springs,,,,3-doors,,,,,
1,Abarth,Abarth 595 Cabrio,fuel engine,2016,2016,,,"3-doors, convertible",,,...,coil springs,,,,3-doors,,,,,
2,Abarth,Abarth 595 Cabrio,fuel engine,2016,2016,,,"3-doors, convertible",,,...,coil springs,,,,3-doors,,,,,
3,Abarth,Abarth 595 Cabrio,fuel engine,2016,2016,,,"3-doors, convertible",,,...,coil springs,,,,3-doors,,,,,
4,Abarth,Abarth 595 Cabrio,fuel engine,2016,2016,,,"3-doors, convertible",,,...,coil springs,,,,3-doors,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
516,,,,,1983,,,,,,...,,,,,,,,,,
517,,,,,1983,,,,,,...,,,,,,,,,,
518,,,,,1983,,,,,,...,,,,,,,,,,
519,,,,,1983,,,,,,...,,,,,,,,,,
