In [1]:
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests
import json
import datetime
import time
import re
from azure.storage.filedatalake import DataLakeServiceClient
import os
from dotenv import load_dotenv
import io

ModuleNotFoundError: No module named 'pandas'

In [3]:
def page_number(start, end, base_url):
    """Expand a URL template into one concrete URL per page.

    Args:
        start: First page number (inclusive).
        end: Last page number (inclusive).
        base_url: URL containing the literal placeholder ``page_number=X``.

    Returns:
        A list of URLs, one per page in ``start..end``, in ascending order.
        The list is empty when ``end < start``. A template without the
        placeholder is returned unchanged for every page.
    """
    placeholder = 'page_number=X'
    return [
        base_url.replace(placeholder, f'page_number={page}')
        for page in range(start, end + 1)
    ]

In [4]:
def extract_url(text):
    """Return the first carlist.my listing URL found in ``text``.

    The pattern requires ``https://www.carlist.my/<segment>/<rest>`` and stops
    at the next whitespace or end of string; a single trailing full stop
    directly before that boundary is not included in the result.

    Args:
        text: Free text that may contain a listing URL.

    Returns:
        The matched URL as a string, or ``None`` when nothing matches.
    """
    found = re.search(
        r'(https://www\.carlist\.my/[\w-]+/[^\s]+?)\.?(?=\s|$)',
        text,
    )
    if found is None:
        return None
    return found.group(1)

In [5]:
def extract_json_file(indv_url, timeout=30):
    """Fetch one listing page and flatten its JSON-LD block into a dict.

    The first ``<script type="application/ld+json">`` element of the page is
    parsed. Only a non-empty JSON list is handled; its first element is read.

    Args:
        indv_url: URL of the individual listing page. A falsy value (``None``
            or ``''``) short-circuits to an empty result without any request.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A dict with the keys ``model``, ``title``, ``year``, ``color``,
        ``mileage``, ``price``, ``seller``, ``location``, ``state``, ``url``
        and ``image`` on success. An empty dict when the URL is missing, the
        request fails, the page has no usable JSON-LD, or a required nested
        key is absent. This function never raises; failures are printed.
    """
    article1_data = {}

    # Nothing to fetch when no listing URL was supplied.
    if not indv_url:
        return article1_data

    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"}

    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        webpage = requests.get(url=indv_url, headers=headers, timeout=timeout)
        soup2 = bs(webpage.text, 'html.parser')
        article1 = soup2.find('script', {'type': 'application/ld+json'})

        if not (article1 and article1.string):
            return article1_data

        try:
            json_data = json.loads(article1.string)
        except json.JSONDecodeError as e:
            print(f'Error parsing JSON: {e}')
            return article1_data

        if isinstance(json_data, list) and len(json_data) > 0:
            try:
                json_data = json_data[0]
                # The dict literal is fully built before update() runs, so a
                # missing nested key leaves article1_data empty, not partial.
                article1_data.update({
                    'model': json_data.get('model'),
                    'title': json_data.get('name'),
                    'year': json_data.get('vehicleModelDate'),
                    'color': json_data.get('color'),
                    'mileage': json_data['mileageFromOdometer'].get('value'),
                    'price': json_data['offers'].get('price'),
                    'seller': json_data['offers']['seller'].get('@type'),
                    'location': json_data['offers']['seller']['homeLocation']['address'].get('addressLocality'),
                    'state': json_data['offers']['seller']['homeLocation']['address'].get('addressRegion'),
                    'url': indv_url,
                    'image': json_data.get('image')
                })
            except (KeyError, TypeError) as e:
                print(f'Error accessing JSON: {e}')

        return article1_data

    except Exception as e:
        # Bind the exception: the previous bare ``except:`` formatted an
        # undefined ``e`` and raised NameError instead of reporting the error.
        print(f'Error in extract_json_file: {e}')
        return article1_data

In [8]:
def main_extraction(page_url, timeout=30):
    """Scrape one search-results page and return a record per listing.

    Every ``<article>`` element carrying the class ``js--listing`` or
    ``article--details`` is inspected. The listing URL is pulled from its
    ``data-default-line-text`` attribute, the listing page is scraped with
    ``extract_json_file``, and the result is enriched with attributes read
    from the ``<article>`` element itself.

    Args:
        page_url: URL of the search-results page.
        timeout: Seconds to wait for the results-page response.

    Returns:
        A list of dicts, one per listing that yielded data. Each dict holds
        whatever ``extract_json_file`` returned plus ``listing_id``,
        ``installment``, ``variant`` and ``transmission``. The list is empty
        when the page has no matching articles.

    Raises:
        Whatever ``requests.get`` raises for the results page itself; errors
        on individual listings are printed and that listing is skipped.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"}
    page = requests.get(url=page_url, headers=headers, timeout=timeout)
    soup = bs(page.text, 'html.parser')

    # find_all is the supported spelling; findAll is a deprecated alias.
    articles = soup.find_all('article', {
        'class': ['js--listing', 'article--details']
    })

    json_results = []

    for article in articles:
        try:
            listing_id = article.get('data-listing-id', 'unknown')
            default_text = article.get('data-default-line-text', '')
            url = extract_url(default_text)
            if url is None:
                # No listing link in the article text, so nothing to scrape.
                print(f"Skipping article {listing_id}: no listing URL found")
                continue

            article_data = extract_json_file(url)
            # Check BEFORE adding the article attributes: once those keys are
            # set the dict is never empty and the check cannot filter anything.
            if not article_data:
                print(f"Skipping article {listing_id}: no listing data extracted")
                continue

            article_data['listing_id'] = article.get('data-listing-id')
            article_data['installment'] = article.get('data-installment')
            article_data['variant'] = article.get('data-variant', '')
            article_data['transmission'] = article.get('data-transmission', '')
            json_results.append(article_data)

        except Exception as e:
            print(f"Error processing article {article.get('data-listing-id', 'unknown')}: {str(e)}")
            continue  # Skip this article and continue with next one

    return json_results

# Manual check: scrape a single results page (Toyota Vios, page 133) and show
# the returned records as this cell's output.
main_extraction('https://www.carlist.my/cars-for-sale/toyota/vios/malaysia?page_number=133&page_size=25&sort=modification_date_search.desc')

[{'model': 'Vios',
  'title': '2005 Toyota Vios 1.5 G Sedan',
  'year': 2005,
  'color': '-',
  'mileage': 172500,
  'price': 16500,
  'seller': 'Person',
  'location': 'Johor Bahru',
  'state': 'Johor',
  'url': 'https://www.carlist.my/used-cars/2005-toyota-vios-1-5-g-sedan/12345365',
  'image': ['https://img1.icarcdn.com/56354321/thumb-l_used-car-carlist-toyota-vios-g-sedan-malaysia_000056354321_e744938c_a8f6_488b_aab6_1660bf0af975.jpg.webp?smia=xTM',
   'https://img1.icarcdn.com/56354321/thumb-l_used-car-carlist-toyota-vios-g-sedan-malaysia_000056354321_4543bab6_e787_42bd_821a_27a7b34ab067.jpg.webp?smia=xTM',
   'https://img1.icarcdn.com/56354321/thumb-l_used-car-carlist-toyota-vios-g-sedan-malaysia_000056354321_7fd3dc82_35e3_4601_a779_2fc2d5e4aa63.jpg.webp?smia=xTM',
   'https://img1.icarcdn.com/56354321/thumb-l_used-car-carlist-toyota-vios-g-sedan-malaysia_000056354321_3f1a9e14_28c1_4844_8f42_6ad353b1b23e.jpg.webp?smia=xTM',
   'https://img1.icarcdn.com/56354321/thumb-l_used-car-c

In [None]:
def upload_to_datalake(csv_buffer, file_prefix='vios'):
    """Upload the contents of an in-memory buffer to Azure Data Lake as a CSV.

    Connection details are read from the environment after ``load_dotenv()``:
    ``AZURE_DATA_LAKE_CONNECTION_STRING``, ``AZURE_CONTAINER_NAME`` and
    ``AZURE_DIRECTORY_PATH``. The file is named
    ``<file_prefix>_<YYYYmmddHHMMSS>.csv``.

    Args:
        csv_buffer: Object exposing ``getvalue()`` (for example
            ``io.StringIO``) whose value is uploaded.
        file_prefix: Leading part of the generated file name.

    Returns:
        None. This is best-effort: any failure, including missing
        configuration, is printed with its traceback rather than raised.
    """
    try:
        # Load environment variables from .env file
        load_dotenv()

        connection_string = os.getenv('AZURE_DATA_LAKE_CONNECTION_STRING')
        if connection_string is None:
            raise ValueError("Connection string not found")

        # Validate the remaining settings up front; otherwise a missing value
        # only surfaces later as an AttributeError on ``None.strip``.
        container_name = os.getenv('AZURE_CONTAINER_NAME')
        if container_name is None:
            raise ValueError("Container name not found (AZURE_CONTAINER_NAME)")

        directory_path = os.getenv('AZURE_DIRECTORY_PATH')
        if directory_path is None:
            raise ValueError("Directory path not found (AZURE_DIRECTORY_PATH)")

        # Create a DataLakeServiceClient
        service_client = DataLakeServiceClient.from_connection_string(connection_string)

        # Ensure paths don't have leading/trailing slashes
        directory_path = directory_path.strip('/')
        container_name = container_name.strip()

        # Get file system client (container)
        file_system_client = service_client.get_file_system_client(file_system=container_name)

        # Get directory client
        directory_client = file_system_client.get_directory_client(directory_path)

        # Timestamped name so successive uploads do not overwrite each other
        file_name = f"{file_prefix}_{pd.Timestamp.now().strftime('%Y%m%d%H%M%S')}.csv"

        # Debug print
        print(f"Attempting to upload file: {file_name}")

        # Try to create the directory; a failure here is only reported,
        # since the directory may already exist.
        try:
            directory_client.create_directory()
        except Exception as e:
            print(f"Directory might already exist: {str(e)}")

        # Get file client
        file_client = directory_client.get_file_client(file_name)

        # Upload the CSV data from the buffer
        file_client.upload_data(csv_buffer.getvalue(), overwrite=True)

        print(f"File uploaded successfully to Azure Data Lake: {directory_path}/{file_name}")

    except Exception as e:
        print(f"Error uploading to Data Lake: {str(e)}")
        # Print full error traceback for debugging
        import traceback
        print(traceback.format_exc())
    

In [30]:
# Search-result URL templates, one per car model. 'page_number=X' is the
# placeholder that page_number() replaces with a concrete page number.
base_url = [
    'https://www.carlist.my/cars-for-sale/perodua/myvi/malaysia?page_number=X&page_size=25&sort=modification_date_search.desc',
    'https://www.carlist.my/cars-for-sale/honda/city/malaysia?page_number=X&page_size=25&sort=modification_date_search.desc',
    'https://www.carlist.my/cars-for-sale/toyota/vios/malaysia?page_number=X&page_size=25&sort=modification_date_search.desc',
]

for x in base_url:
    all_json_data = []
    page_list = page_number(1, 2, x)

    for index, item in enumerate(page_list):
        # Handle errors per page: one failing page must not discard the
        # remaining pages of the same model.
        try:
            results = main_extraction(item)
        except Exception as e:
            print(f"Error in main processing (page {index + 1}): {str(e)}")
            continue

        if results:
            all_json_data.extend(results)
            print(f"Added {len(results)} entries from page {index + 1}")
        else:
            print(f"Processed item {index + 1}: No entries found")

    # Save all collected JSON data for this model to a timestamped file
    if all_json_data:
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f'car_listings_{timestamp}.json'
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(all_json_data, f, indent=2, ensure_ascii=False)
            print(f"Saved {len(all_json_data)} entries to {filename}")


Added 25 entries from page 1
Added 25 entries from page 2
Saved 50 entries to car_listings_20250117_232408.json
Added 25 entries from page 1
Added 25 entries from page 2
Saved 50 entries to car_listings_20250117_232429.json
Added 25 entries from page 1
Added 25 entries from page 2
Saved 50 entries to car_listings_20250117_232442.json
