In [1]:
import re
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import requests
from scipy.stats import norm
import json

import warnings
warnings.filterwarnings('ignore')

In [2]:
DATA_DIR = '/Users/andishetavakkoli/Documents/notebook/github_project/machine-learning-projects-data/'

In [3]:
df = pd.read_csv(DATA_DIR + 'tehranhouseprice.csv')

In [4]:
df.head()

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,63,1,True,True,True,Shahran,1850000000.0,61666.67
1,60,1,True,True,True,Shahran,1850000000.0,61666.67
2,79,2,True,True,True,Pardis,550000000.0,18333.33
3,95,2,True,True,True,Shahrake Qods,902500000.0,30083.33
4,123,2,True,True,True,Shahrake Gharb,7000000000.0,233333.33


In [5]:
df.dropna(subset='Address', inplace=True)

In [6]:
 df['Address'].isna().sum()

0

## Check the data type

In [7]:
[item for item in df['Area'] if not item.isdigit()]

[' 3,310,000,000 ',
 ' 16,160,000,000 ',
 ' 1,000 ',
 ' 8,400,000,000 ',
 ' 3,600 ',
 ' 2,550,000,000 ']

In [8]:
df['Area'] = df['Area'].str.replace(',', '').str.strip()

In [9]:
df['Area'] = df['Area'].astype('float')

## change True | False to 0, 1 for ['Parking', 'Warehouse', 'Elevator']

In [10]:
df[['Parking', 'Warehouse', 'Elevator']] = df[['Parking', 'Warehouse', 'Elevator']].astype('int64')   

In [16]:
df = df.drop(columns='Price(USD)')

In [17]:
df_num = df.select_dtypes(include='number')
px.scatter_matrix(df_num,width=800, height=1000)

## helper function

In [11]:
def write_json(file_path, data):
    with open(file_path, 'w') as f:
        json.dump(data, f, indent=4)
            

In [47]:
write_json('district.json', district)

In [50]:
def read_json(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
        return data
        

In [51]:
district = read_json('district.json')

{'Shahran': [35.77, 51.28],
 'Qalandari': [35.7886, 51.4415],
 'Lavasan': [35.8186, 51.6226],
 'Pardis': [35.75, 51.77],
 'Amir Bahador': [35.6711, 51.4034],
 'Shahrake Qods': [35.76, 51.37],
 'Ekhtiarieh': [35.786, 51.4581],
 'Shahrake Gharb': [35.76, 51.37],
 'Ekbatan': [35.7091, 51.3026],
 'North Program Organization': [35.73, 51.3],
 'Absard': [35.6195, 52.1509],
 'Andisheh': [35.75, 51.27],
 'Haft Tir': [35.7161, 51.4254],
 'West Ferdows Boulevard': [35.73, 51.29],
 'Mahallati': [35.6757, 51.5143],
 'Narmak': [35.73, 51.5],
 'Ozgol': [35.7907, 51.5101],
 'Zafar': [35.76, 51.42],
 'Tajrish': [35.7995, 51.43],
 'Islamshahr': [35.54, 51.22],
 'Dorous': [35.773, 51.455],
 'Pirouzi': [35.69, 51.46],
 'Abazar': [35.7326, 51.3256],
 'Shahrake Shahid Bagheri': [35.76, 51.2],
 'Koohsar': [35.7643, 51.2835],
 'Moniriyeh': [35.68, 51.4],
 'Parastar': [35.6816, 51.4834],
 'Saadat Abad': [35.78, 51.373],
 'Majidieh': [35.735, 51.4626],
 'Amirieh': [35.627, 51.029],
 'Southern Chitgar': [35.714

In [66]:
df['latitude'] = df['Address'].apply(lambda x: district[x][0])
df['longitude'] = df['Address'].apply(lambda x: district[x][1])

In [67]:
df.head()

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD),lat,long,lon,latitude,longitude
0,63,1,True,True,True,Shahran,1850000000.0,61666.67,35.77,51.28,51.28,35.77,51.28
1,60,1,True,True,True,Shahran,1850000000.0,61666.67,35.77,51.28,51.28,35.77,51.28
2,79,2,True,True,True,Pardis,550000000.0,18333.33,35.75,51.77,51.77,35.75,51.77
3,95,2,True,True,True,Shahrake Qods,902500000.0,30083.33,35.76,51.37,51.37,35.76,51.37
4,123,2,True,True,True,Shahrake Gharb,7000000000.0,233333.33,35.76,51.37,51.37,35.76,51.37


## Get the region and give geographical coordination

In [71]:
import folium
from folium.plugins import MarkerCluster
import matplotlib.cm as cm
import matplotlib.colors as mcolors

def create_bubble_map(latitude_list, longitude_list, popup_text_list, price_list, latitude_center, longitude_center, zoom_start=12, marker_size=20, colormap='RdYlBu'):
    # Create map object
    map_fig = folium.Map(location=[latitude_center, longitude_center], zoom_start=zoom_start)
    
    # Create marker cluster object
    marker_cluster = MarkerCluster().add_to(map_fig)
    
    # Determine the maximum price for scaling the bubble size
    max_price = max(price_list)
    
    # Create a colormap for coloring the markers based on price
    colormap = cm.get_cmap(colormap)
    
    # Add markers to the map
    for lat, lon, popup_text, price in zip(latitude_list, longitude_list, popup_text_list, price_list):
        # Calculate the normalized size of the bubble based on price
        size = int(marker_size * (price / max_price))
        
        # Calculate the normalized color based on price
        normalized_price = price / max_price
        color = colormap(normalized_price)
        hex_color = mcolors.rgb2hex(color)
        
        # Create a circle marker with the calculated size and color
        folium.CircleMarker(
            [lat, lon],
            radius=size,
            popup=popup_text,
            fill=True,
            fill_opacity=0.7,
            color=hex_color,
            fill_color=hex_color
        ).add_to(marker_cluster)
    
    # Save the map as HTML
    map_fig.save("bubble_map.html")
    return map_fig

# Assume df is your pandas DataFrame containing latitude, longitude, median_house_value, and population columns
create_bubble_map(
    latitude_list=df['latitude'],
    longitude_list=df['longitude'],
    popup_text_list=df['Price'].astype(str),
    price_list=df['Price'],
    latitude_center=df['latitude'].mean(),
    longitude_center=df['longitude'].mean(),
    zoom_start=12,
    marker_size=20,
    colormap = cm.get_cmap('viridis')
)



# Data Processing

In [12]:
df.describe()

Unnamed: 0,Room,Price,Price(USD)
count,3456.0,3456.0,3456.0
mean,2.081308,5379957000.0,179331.9
std,0.759723,8121729000.0,270724.3
min,0.0,3600000.0,120.0
25%,2.0,1420000000.0,47333.33
50%,2.0,2900000000.0,96666.67
75%,2.0,6000000000.0,200000.0
max,5.0,92400000000.0,3080000.0


In [None]:
# Outlier Handeling

### Zscore

In [37]:
def detect_anomalies_z_score(data, threshold=3):
    '''
    Detects anomalies in a 1D array of data using the Z-score method.
    Returns a boolean array indicating which values are anomalies.
    
    Parameters:
        data (array-like): The input data to be analyzed.
        threshold (float): The number of standard deviations from the mean
            above which a value is considered an anomaly. Default is 3.
    
    Returns:
        A boolean array indicating which values are anomalies.
    '''
    mean = np.mean(data)
    std = np.std(data)
    z_scores = (data - mean) / std
    return np.abs(z_scores) > threshold

### IQR

In [36]:
def detect_anomalies_iqr(data, k=1.5):
    '''
    Detects anomalies in a 1D array of data using the IQR method.
    Returns a boolean array indicating which values are anomalies.
    
    Parameters:
        data (array-like): The input data to be analyzed.
        k (float): The multiplier for the IQR above which a value is
            considered an anomaly. Default is 1.5.
    
    Returns:
        A boolean array indicating which values are anomalies.
    '''
    quartile1, quartile3 = np.percentile(data, [25, 75])
    iqr = quartile3 - quartile1
    lower_bound = quartile1 - k * iqr
    upper_bound = quartile3 + k * iqr
    return (data < lower_bound) | (data > upper_bound)


### MAD

In [21]:
def detect_anomalies_mad(data, threshold=3.5):
    '''
    Detects anomalies in a 1D array of data using the MAD method.
    Returns a boolean array indicating which values are anomalies.
    
    Parameters:
        data (array-like): The input data to be analyzed.
        threshold (float): The number of MADs from the median above which a value is
            considered an anomaly. Default is 3.5.
    
    Returns:
        A boolean array indicating which values are anomalies.
    '''
    median = np.median(data)
    mad = np.median(np.abs(data - median))
    z_scores = norm.ppf(1 - 0.5 / len(data))
    threshold_mad = mad * threshold / z_scores
    return np.abs(data - median) > threshold_mad

In [23]:
detect_anomalies_mad(df['Price'], threshold=3.5).sum()

1785

In [42]:
for col in ['Price', 'Area']:
    print(col)
    print(f' number of outlier for with MAD method is {detect_anomalies_mad(df[col], threshold=3.5).sum()}')
    print(f' number of outlier with IQR method is {detect_anomalies_iqr(df[col]).sum()}')
    print(f' number of outlier with Zscore method is {detect_anomalies_z_score(df[col], threshold=3).sum()}')
    print(50* '-')
    

Price
 number of outlier for with MAD method is 1785
 number of outlier with IQR method is 310
 number of outlier with Zscore method is 79
--------------------------------------------------
Area
 number of outlier for with MAD method is 1758
 number of outlier with IQR method is 237
 number of outlier with Zscore method is 4
--------------------------------------------------


### Remove price outliers

In [48]:
df[detect_anomalies_z_score(df['Price'], threshold=3)].style.background_gradient(subset='Price')

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price
201,270.0,4,1,1,1,Farmanieh,37800000000.0
330,350.0,3,1,1,1,Zaferanieh,52500000000.0
339,335.0,4,1,1,1,Saadat Abad,33500000000.0
340,320.0,4,1,1,1,Saadat Abad,40000000000.0
349,312.0,4,1,1,1,Farmanieh,45000000000.0
413,280.0,4,1,1,1,Saadat Abad,38500000000.0
430,400.0,5,1,1,0,Lavasan,85000000000.0
431,660.0,5,1,1,0,Lavasan,55000000000.0
440,300.0,3,1,1,1,Niavaran,55500000000.0
459,245.0,3,1,1,1,Aqdasieh,36500000000.0


In [49]:
outlier_price_mask = detect_anomalies_z_score(df['Price'], threshold=3)

In [51]:
df = df[~outlier_price_mask]

In [52]:
df.shape

(3377, 7)

In [None]:
### Remove Area outliers

In [56]:
outlier_area_mask = detect_anomalies_z_score(df['Area'], threshold=3)

In [57]:
df = df[~outlier_area_mask]

In [58]:
df.shape

(3373, 7)