In [None]:
pip install requests beautifulsoup4




In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import re

def get_soup(url, timeout=10):
    """Download ``url`` and parse the response body as HTML.

    A failed request is retried with exponential back-off: the function
    sleeps 1, 2, 4 and 8 seconds between its five attempts.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before an attempt is treated
            as failed. Without a timeout ``requests.get`` may block
            indefinitely on a stalled connection and hang the whole scrape.

    Returns:
        A ``BeautifulSoup`` document built with the ``html.parser`` backend,
        or ``None`` when every attempt failed (the last error is printed).
    """
    retries = 5
    backoff_factor = 1

    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout)
            # Treat HTTP 4xx/5xx answers as failures so they are retried too.
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
        except requests.exceptions.RequestException as e:
            if attempt < retries - 1:
                time.sleep(backoff_factor * (2 ** attempt))
            else:
                print(f"Failed to retrieve {url}: {e}")

    return None

def clean_text(text):
    """Collapse each run of whitespace in ``text`` to one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

def extract_numeric_value(text):
    """Return the first integer found in ``text``, or ``None`` if there is none.

    Comma thousands separators are treated as part of the number, so
    ``"1,250,000"`` yields ``1250000`` rather than just the leading ``1``.

    Args:
        text: String to search. ``None`` or an empty string yields ``None``.

    Returns:
        The first number in ``text`` as an ``int``, or ``None``.
    """
    if not text:
        return None

    # A digit followed by any mix of digits and comma separators.
    match = re.search(r'\d[\d,]*', text)
    if match is None:
        return None

    return int(match.group().replace(',', ''))

def _safe_text(node):
    """Return the whitespace-normalised text of ``node``, or '' when ``node`` is None."""
    return clean_text(node.text) if node is not None else ''


def _find_in(parent, *args, **kwargs):
    """Run ``parent.find(...)``, returning None when ``parent`` itself is None."""
    return parent.find(*args, **kwargs) if parent is not None else None


def preprocess_car_data(car):
    """Turn one listing element into a flat dictionary of car attributes.

    Every lookup tolerates a missing element: text fields fall back to an
    empty string and numeric fields to 0, so a single incomplete listing
    cannot abort the scrape with an AttributeError/IndexError/KeyError.

    Args:
        car: Parsed listing element (a ``div.newCarListUnit_wrap`` node).

    Returns:
        dict with the keys ``title``, ``car_model_name``, ``car_color``,
        ``car_mileage``, ``location``, ``date``, ``price`` and ``image``.
        ``car_mileage`` and ``price`` are integers, with 0 standing in for a
        value that could not be found.
    """
    title = _safe_text(_find_in(car.find('div', class_='newCarListUnit_header'), 'a'))

    # The meta links are looked up once: the second one is read as the model
    # name and the last one as the location.
    meta_block = car.find('div', class_='newCarListUnit_metaTags')
    if meta_block is not None:
        meta_links = meta_block.find_all('span', class_='newCarListUnit_metaLink')
    else:
        meta_links = []
    car_model_name = _safe_text(meta_links[1]) if len(meta_links) > 1 else ''
    location = _safe_text(meta_links[-1]) if meta_links else ''

    car_color = _safe_text(car.find('span', class_='newCarListUnit_metaTag mob_hidden'))

    # The mileage is the first meta tag containing the Arabic "km" abbreviation.
    car_mileage = None
    for tag in car.find_all('span', class_='newCarListUnit_metaTag'):
        if "كم" in tag.text:
            car_mileage = extract_numeric_value(clean_text(tag.text))
            break

    date = _safe_text(_find_in(car.find('div', class_='otherData_Date'), 'span'))

    price_text = _safe_text(_find_in(car.find('div', class_='main_price'), 'a'))
    price = extract_numeric_value(price_text)

    # The image URL is read from the lazy-loading attribute.
    image_node = car.find('img', class_='lazy')
    image = image_node.get('data-original', '') if image_node is not None else ''

    return {
        'title': title,
        'car_model_name': car_model_name,
        'car_color': car_color,
        'car_mileage': car_mileage if car_mileage is not None else 0,
        'location': location,
        'date': date,
        'price': price if price is not None else 0,
        'image': image
    }

def scrape_car_data(base_url, num_pages, max_records=1000):
    """Walk the paginated listing and collect one record per car.

    Args:
        base_url: Listing root; pages are requested as ``{base_url}/page/{n}``.
        num_pages: Highest page number to request (pages start at 1).
        max_records: Upper bound on the number of records returned. Paging
            stops once this many records have been collected. ``None``
            removes the cap.

    Returns:
        A list of at most ``max_records`` dictionaries as produced by
        ``preprocess_car_data``. Pages that cannot be fetched and listings
        that cannot be parsed are skipped.
    """
    car_data = []

    for page in range(1, num_pages + 1):
        url = f"{base_url}/page/{page}"
        soup = get_soup(url)

        if soup is None:
            # The download failed after all retries; carry on with the next page.
            continue

        for car in soup.find_all('div', class_='newCarListUnit_wrap'):
            try:
                car_data.append(preprocess_car_data(car))
            except (AttributeError, IndexError, KeyError, TypeError) as exc:
                # A listing with unexpected markup must not discard the
                # records collected so far.
                print(f"Skipping malformed listing on {url}: {exc}")

        if max_records is not None and len(car_data) >= max_records:
            break

    # A slice bound of None keeps the whole list.
    return car_data[:max_records]

def save_to_csv(data, filename):
    """Write the car records in ``data`` to ``filename`` as CSV.

    The file is encoded as ``utf-8-sig`` and starts with a header row of the
    eight record fields. A confirmation line is printed when writing is done.

    Args:
        data: Iterable of dictionaries keyed by the eight column names.
        filename: Path of the CSV file to create or overwrite.
    """
    columns = ['title', 'car_model_name', 'car_color', 'car_mileage', 'location', 'date', 'price', 'image']

    with open(filename, mode='w', newline='', encoding='utf-8-sig') as handle:
        csv_writer = csv.DictWriter(handle, fieldnames=columns)
        csv_writer.writeheader()
        for record in data:
            csv_writer.writerow(record)

    print(f'Data has been written to {filename}')

def download_csv(base_url, num_pages, filename):
    """Scrape up to ``num_pages`` listing pages under ``base_url`` and save them to ``filename``."""
    save_to_csv(scrape_car_data(base_url, num_pages), filename)

# Scrape settings: output file, number of listing pages to walk, listing root.
csv_file = 'car_data.csv'
num_pages = 50
base_url = 'https://eg.hatla2ee.com/ar/car/mercedes'

# Run the scrape and write the collected records to csv_file.
download_csv(base_url=base_url, num_pages=num_pages, filename=csv_file)


In [34]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Read the scraped listings. The scraping cell writes 'car_data.csv' relative
# to the working directory, so the same relative path is used here instead of
# an absolute, Colab-only '/content/...' location.
file_path = 'car_data.csv'
car_data = pd.read_csv(file_path)

def convert_to_numeric(value):
    """Parse a mileage or price cell into an integer.

    Strings have comma separators and the Arabic unit words for kilometres
    and pounds removed before parsing. A cell that is empty or only the
    ``-`` placeholder becomes 0. Non-string values are returned unchanged.

    Only a lone ``-`` is treated as a placeholder. Replacing every hyphen
    with ``0`` would silently turn a value such as ``"10-20"`` into 10020.

    Args:
        value: Raw cell value (string, number or NaN).

    Returns:
        ``int`` for string input, otherwise ``value`` itself.

    Raises:
        ValueError: If the cleaned string is not a valid integer.
    """
    if not isinstance(value, str):
        return value

    cleaned = value.replace(',', '').replace('كم', '').replace('جنيه', '').strip()
    if cleaned in ('', '-'):
        return 0
    return int(cleaned)

# Make sure both numeric columns really hold numbers.
car_data['car_mileage'] = car_data['car_mileage'].apply(convert_to_numeric)
car_data['price'] = car_data['price'].apply(convert_to_numeric)

# The model year is taken as the first four-digit number in the title.
# expand=False makes str.extract return a Series instead of a one-column frame.
car_data['model_year'] = car_data['title'].str.extract(r'(\d{4})', expand=False).astype(float)

# Rows without a recognisable year cannot be used.
car_data = car_data.dropna(subset=['model_year'])

# The scraper stores 0 when a listing has no readable price. Those rows are
# placeholders, not real targets, so they are excluded from training.
car_data = car_data[car_data['price'] > 0]

# One-hot encode the categorical columns; drop_first removes one level per
# column so the dummies are not perfectly collinear.
car_data_encoded = pd.get_dummies(car_data, columns=['car_model_name', 'car_color', 'location'], drop_first=True)

# Free text, date and image URL are not used as features.
car_data_encoded = car_data_encoded.drop(columns=['title', 'date', 'image'])

# Feature matrix and regression target.
X = car_data_encoded.drop(columns=['price'])
y = car_data_encoded['price']

# Hold out 20% of the listings for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Everything that is not a numeric column is an already-encoded dummy column.
numerical_features = ['car_mileage', 'model_year']
categorical_features = [col for col in X_train.columns if col not in numerical_features]

# Scale the numeric columns; pass the dummy columns through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', 'passthrough', categorical_features)])

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', RandomForestRegressor(random_state=42))])

# Search space for the forest. max_features uses 1.0 (all features) instead of
# 'auto': for regressors 'auto' meant "all features", and the string was
# removed in scikit-learn 1.3, where it now fails parameter validation.
param_distributions = {
    'model__n_estimators': [50, 100, 200],
    'model__max_features': [1.0, 'sqrt'],
    'model__max_depth': [10, 20, None],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__bootstrap': [True, False]
}

# 50 random configurations, each scored with 5-fold cross-validation.
random_search = RandomizedSearchCV(pipeline, param_distributions, n_iter=50, cv=5,
                                   verbose=2, random_state=42, n_jobs=-1)


random_search.fit(X_train, y_train)

# Evaluate the best configuration on the held-out listings.
y_pred = random_search.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R-squared (R²) Score:", r2)


# Build a one-row frame with exactly the training columns, all set to 0.
# Creating it fully populated keeps every column numeric from the start.
sample_input = pd.DataFrame(0, index=[0], columns=X_train.columns)

sample_input['car_mileage'] = 50000
sample_input['model_year'] = 2020

# Switch on the dummy columns describing the example car. A name that is not
# a training column is either the level removed by drop_first or a category
# that never occurred in the data. Assigning it would only add a column the
# model ignores, so it is reported instead.
for dummy_column in ('car_model_name_230', 'car_color_أسود', 'location_القاهرة'):
    if dummy_column in sample_input.columns:
        sample_input[dummy_column] = 1
    else:
        print(f"Warning: '{dummy_column}' is not a training feature; the baseline category is used instead.")


predicted_price = random_search.predict(sample_input)
print("Predicted Price:", predicted_price[0])
