# **Import Libraries**

In [78]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn import tree
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings("ignore")

# **Scrapping Web page**

In [14]:
def scrape_data(url):
  # Create an empty list to store the extracted data
  results = []
  for i in ['items-i','items-i vipped', 'items-i featured vipped']:
      # Send a request to the URL and get the HTML content
      response = requests.get(url)
      soup = BeautifulSoup(response.content, 'html.parser')

      # Find all the items with class "items-i vipped"
      items = soup.find_all('div', class_= i)

      for item in items:
          # Extract the desired information for each item
          price = item.find('span', class_='price-val').text.strip()
          currency = item.find('span', class_='price-cur').text.strip()

          # Use a conditional check to handle the case when the element is not found
          price_per_element = item.find('span', class_='price-per')
          per = price_per_element.text.strip() if price_per_element else ""

          location = item.find('div', class_='location').text.strip()

          # Extract the data from the list elements with class "name"
          name_elements = item.find('ul', class_='name').find_all('li')
          name = [element.text.strip() for element in name_elements]

          # Append the extracted data as a dictionary to the results list
          results.append({
              'price': price,
              'currency': currency,
              'per': per,
              'location': location,
              'name': name
          })

  return results

In [15]:
import pandas as pd


def scrape_multiple_urls(urls):
    # Create lists to store the extracted data from all URLs
    all_prices = []
    all_currency = []
    all_locations = []
    all_names = []

    # Loop through each URL and scrape the data
    for url in urls:
        data = scrape_data(url)

        # Extract the data from the list of dictionaries
        for item_data in data:
            all_prices.append(item_data['price'])
            all_currency.append(item_data['currency'] + item_data['per'])
            all_locations.append(item_data['location'])
            all_names.append(item_data['name'])

    # Create a Pandas DataFrame
    df = pd.DataFrame({
        'Price': all_prices,
        'Currency': all_currency,
        'Location': all_locations,
        'Description': all_names
    })

    return df

# List of URLs to scrape
urls = [f"https://bina.az/baki/alqi-satqi/menziller?page={i}" for i in range(1, 100)]

# Call the function to scrape multiple URLs and store the data in one DataFrame
df = scrape_multiple_urls(urls)

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4068 entries, 0 to 4067
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Price        4068 non-null   object
 1   Currency     4068 non-null   object
 2   Location     4068 non-null   object
 3   Description  4068 non-null   object
dtypes: object(4)
memory usage: 127.2+ KB


In [17]:
df.head()

Unnamed: 0,Price,Currency,Location,Description
0,750 000,AZN,Nəsimi r.,"[5 otaqlı, 217 m², 11/16 mərtəbə]"
1,269 000,AZN,Yasamal r.,"[3 otaqlı, 116 m², 16/25 mərtəbə]"
2,216 000,AZN,İnşaatçılar m.,"[3 otaqlı, 125 m², 8/18 mərtəbə]"
3,370 000,AZN,Nəsimi r.,"[3 otaqlı, 155.5 m², 12/20 mərtəbə]"
4,367 983,AZN,Sea Breeze,"[3 otaqlı, 127.3 m²]"


# **Data Cleaning**

In [18]:
# Function to extract values from the list in the Description column
def extract_value(lst, pattern):
    for item in lst:
        if pattern in item:
            return item.split()[0]
    return None

# Extract Rooms, Size, and Floor using the extract_value function
df['Rooms'] = df['Description'].apply(extract_value, pattern='otaqlı')
df['Size'] = df['Description'].apply(extract_value, pattern='m²')
df['Floor'] = df['Description'].apply(extract_value, pattern='mərtəbə')

# Drop the original Description column if you don't need it anymore
df.drop(columns=['Description'], inplace=True)

In [32]:
# Split the 'Floor' column into two separate columns
df[['Floor Number', 'Total Floors']] = df['Floor'].str.split('/', expand=True)

# Convert the new columns to numeric values (optional, if you want them as integers)
df['Floor Number'] = pd.to_numeric(df['Floor Number'])
df['Total Floors'] = pd.to_numeric(df['Total Floors'])

# Drop the original 'Floor' column if needed
df.drop(columns='Floor', inplace=True)

In [33]:
df.head()

Unnamed: 0,Price,Currency,Location,Rooms,Size,Floor Number,Total Floors
0,750000,AZN,Nəsimi r.,5,217.0,11.0,16.0
1,269000,AZN,Yasamal r.,3,116.0,16.0,25.0
2,216000,AZN,İnşaatçılar m.,3,125.0,8.0,18.0
3,370000,AZN,Nəsimi r.,3,155.5,12.0,20.0
4,367983,AZN,Sea Breeze,3,127.3,,


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4068 entries, 0 to 4067
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Price         4068 non-null   Int64  
 1   Currency      4068 non-null   object 
 2   Location      4068 non-null   object 
 3   Rooms         4068 non-null   Int64  
 4   Size          4068 non-null   float64
 5   Floor Number  3258 non-null   float64
 6   Total Floors  3258 non-null   float64
dtypes: Int64(2), float64(3), object(2)
memory usage: 230.5+ KB


In [21]:
df["Currency"].value_counts()

AZN    4068
Name: Currency, dtype: int64

In [24]:
df.columns

Index(['Price', 'Currency', 'Location', 'Rooms', 'Size', 'Floor'], dtype='object')

In [25]:
df['Price'] = df['Price'].str.replace(" ", "")

In [26]:
df.head()

Unnamed: 0,Price,Currency,Location,Rooms,Size,Floor
0,750000,AZN,Nəsimi r.,5,217.0,11/16
1,269000,AZN,Yasamal r.,3,116.0,16/25
2,216000,AZN,İnşaatçılar m.,3,125.0,8/18
3,370000,AZN,Nəsimi r.,3,155.5,12/20
4,367983,AZN,Sea Breeze,3,127.3,


In [27]:
df['Price'] = df['Price'].astype('Int64')

In [28]:
df['Rooms'] = df['Rooms'].astype('Int64')

In [30]:
df['Size'] = df['Size'].astype('float64')

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4068 entries, 0 to 4067
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Price     4068 non-null   Int64  
 1   Currency  4068 non-null   object 
 2   Location  4068 non-null   object 
 3   Rooms     4068 non-null   Int64  
 4   Size      4068 non-null   float64
 5   Floor     3258 non-null   object 
dtypes: Int64(2), float64(1), object(3)
memory usage: 198.8+ KB


# **Data imputation**

In [35]:
df_numeric = df.select_dtypes(exclude = "object")

In [36]:
df_numeric.head()

Unnamed: 0,Price,Rooms,Size,Floor Number,Total Floors
0,750000,5,217.0,11.0,16.0
1,269000,3,116.0,16.0,25.0
2,216000,3,125.0,8.0,18.0
3,370000,3,155.5,12.0,20.0
4,367983,3,127.3,,


In [37]:
# create an object for KNNImputer
imputer = KNNImputer(n_neighbors=2)
df_imputer = imputer.fit_transform(df_numeric)
df_imputer.shape

(4068, 5)

In [57]:
df = pd.DataFrame(df_imputer, columns = df_numeric.columns)
df.head()

Unnamed: 0,Price,Rooms,Size,Floor Number,Total Floors
0,750000.0,5.0,217.0,11.0,16.0
1,269000.0,3.0,116.0,16.0,25.0
2,216000.0,3.0,125.0,8.0,18.0
3,370000.0,3.0,155.5,12.0,20.0
4,367983.0,3.0,127.3,4.0,12.0


In [69]:
for column in df.columns:
  df[column] = (df[column] - df[column].min())/ (df[column].max() - df[column].min())

In [70]:
df.head()

Unnamed: 0,Price,Rooms,Size,Floor Number,Total Floors
0,0.089723,0.4,0.23175,0.384615,0.483871
1,0.029949,0.2,0.114716,0.576923,0.774194
2,0.023363,0.2,0.125145,0.269231,0.548387
3,0.0425,0.2,0.160487,0.423077,0.612903
4,0.04225,0.2,0.12781,0.115385,0.354839


# **Creating the model**

In [79]:
X = df[['Rooms', 'Size', 'Floor Number', 'Total Floors']]
y = df['Price']

In [80]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## **Linear regression**

In [73]:
model = LinearRegression()

In [74]:
model.fit(X_train, y_train)

In [75]:
y_pred = model.predict(X_test)

In [76]:
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)

In [77]:
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r_squared}")

Mean Squared Error: 0.0014732494938916835
R-squared: 0.4116767293278977


## **Random forest regression**

In [93]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [94]:
y_pred = model.predict(X_test)

In [95]:
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)

In [96]:
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r_squared}")

Mean Squared Error: 0.001376754348970566
R-squared: 0.45021082657303413
