# خواندن داده ها

In [1]:
import pandas as pd
df = pd.read_csv('Divar.csv')
cc = pd.read_csv('iran_city_classification.csv')

  df = pd.read_csv('/ds/Divar_real_state.csv')


# بارگذاری ماژول ها 

In [2]:
import numpy as np
from scipy.stats import ttest_ind, mannwhitneyu, levene, shapiro
import independent_groups_continues_dependent as i

In [3]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [4]:
cc.columns = [c.replace('\u200c', ' ') for c in cc.columns]
cc['دسته بندی'] = cc['دسته بندی'].str.replace('\u200c', ' ', regex=False)

## سوال 1: مقایسه میانگین مساحت مساکن در کلان شهر ها و در شهرهای کوچک/روستا ها

In [6]:
df_1 = df.copy()
df_1.rename(columns={'city_slug': 'نام شهر'}, inplace=True)
df_1 = df_1.dropna(subset=['building_size'])
df_1 = df_1.merge(cc, on='نام شهر', how='left')

metro = df_1[df_1['دسته بندی'] == 'کلان شهر']
small = df_1[df_1['دسته بندی'] != 'کلان شهر']

#two groups are independent and have ordinal or continues dependent value
#so we use python file for this situation called: independent_groups_continues_dependent.py

col = 'building_size'

i.compare_two_groups(metro[col], small[col])

=== Hypotheses ===
H0: Mean(Group1) = Mean(Group2)
H1: Mean(Group1) != Mean(Group2) (alternative='two-sided')

Sample sizes: Group1=455121, Group2=525273

Large sample sizes → use t-test via CLT, skip Shapiro-Wilk

=== Welch's t-test Result ===
t-statistic: -14.1441, p-value: 0.0000
Decision: Reject H0




## سوال 2: مقیسه میانگین مساحت خانه های قدیمی و جدید

### آماده سازی داده

In [7]:
df_2 = df[df['cat3_slug'].isin(['villa', 'apartment-sell', 'apartment-rent', 'plot-old', 'house-villa-sell', 'house-villa-rent', 'suite-apartment'])]
df_2 = df_2.dropna(subset=['construction_year'])

df_2['construction_year'] = (
    df_2['construction_year']
    .replace({'قبل از ۱۳۷۰': '1369'})
)


def persian_to_english_number(s):
    if pd.isna(s):  # handle NaNs
        return s
    persian_digits = '۰۱۲۳۴۵۶۷۸۹'
    english_digits = '0123456789'
    for p, e in zip(persian_digits, english_digits):
        s = s.replace(p, e)
    return s
    
    
df_2['construction_year'] = df_2['construction_year'].apply(persian_to_english_number)
df_2['land_size'].fillna(df_2['building_size'], inplace=True)
df_2 = df_2.dropna(subset=['land_size'])
old = df_2[df_2['construction_year'].astype(int) < 1396]
new = df_2[df_2['construction_year'].astype(int) >= 1396]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_2['land_size'].fillna(df_2['building_size'], inplace=True)


In [8]:
col = 'land_size'
i.compare_two_groups(new[col], old[col])

=== Hypotheses ===
H0: Mean(Group1) = Mean(Group2)
H1: Mean(Group1) != Mean(Group2) (alternative='two-sided')

Sample sizes: Group1=349160, Group2=352492

Large sample sizes → use t-test via CLT, skip Shapiro-Wilk

=== Welch's t-test Result ===
t-statistic: 3.2179, p-value: 0.0013
Decision: Reject H0




## سوال 3: مقایسه میانگین قیمت در املاک تجاری دارای سند تجاری و املاک تجاری بدون سند تجاری

### آماده سازی داده

In [None]:
df_3 = df[df['cat2_slug'] == 'commercial-sell']
df_3.dropna(subset=['has_business_deed', 'price_value'], inplace=True)
hbd = df_3[df_3['has_business_deed'] == True] #has business deed
dhbd = df_3[df_3['has_business_deed'] == False] #does'nt have business deed

In [None]:
col = 'price_value'
i.compare_two_groups(hbd[col], dhbd[col])

##  سوال 4: مقایسه میانگین میزان تاثیر وجود امکانات لاکچری و غیر لاکچری

In [9]:
import pandas as pd

# List of features
features = [
    'has_pool','has_jacuzzi','has_sauna','has_barbecue',
    'has_balcony','has_elevator','has_warehouse','has_parking',
    'has_water', 'has_warm_water_provider','has_electricity','has_gas','has_heating_system',
    'has_cooling_system','has_restroom','has_security_guard'
]


# Normalize mixed feature columns to boolean/NA (reuse previous function)
def normalize_feature(col):
    return col.apply(lambda x: True if str(x).strip().lower() == 'true'
                              else False if str(x).strip().lower() == 'false'
                              else pd.NA)

for f in features:
    df[f] = normalize_feature(df[f])

# Create a summary dictionary
feature_presence = {}

for feature in features:
    # Get property types where at least one True exists
    present_in = df[df[feature] == True]['cat3_slug'].unique().tolist()
    feature_presence[feature] = present_in

# Convert to DataFrame for nicer display
feature_presence_df = pd.DataFrame(
    list(feature_presence.items()), 
    columns=['feature', 'property_types_with_True']
)

# Display all rows
pd.set_option('display.max_rows', None)
print(feature_presence_df)

# Define luxury features
luxury_features = ['has_pool','has_barbecue','has_sauna','has_jacuzzi']
non_luxury_features = [f for f in feature_presence_df['feature'] if f not in luxury_features]

                    feature                                                                             property_types_with_True
0                  has_pool                                                          [house-villa-sell, villa, house-villa-rent]
1               has_jacuzzi                                                          [house-villa-sell, villa, house-villa-rent]
2                 has_sauna                                                          [house-villa-sell, villa, house-villa-rent]
3              has_barbecue                                                          [villa, house-villa-sell, house-villa-rent]
4               has_balcony                       [apartment-sell, apartment-rent, house-villa-sell, house-villa-rent, plot-old]
5              has_elevator                                           [apartment-sell, apartment-rent, office-rent, office-sell]
6             has_warehouse  [apartment-sell, apartment-rent, house-villa-sell, house-villa-rent,

In [12]:
feature_presence_df.to_csv('feature_presence.csv', index=False)

### house-villa-sell بررسی 

In [10]:
# بررسی تاثیر بودن یا نبودن ویژگی های لاکچری 
df_have_lux = df[df[luxury_features].any(axis=1) == True]
df_have_lux.dropna(subset='price_value', inplace=True)
df_doesnt_have_lux = df[~df[luxury_features].any(axis=1)]
df_doesnt_have_lux.dropna(subset='price_value', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_have_lux.dropna(subset='price_value', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_doesnt_have_lux.dropna(subset='price_value', inplace=True)


In [11]:
col = 'price_value'
i.compare_two_groups(df_have_lux[col], df_doesnt_have_lux[col])

=== Hypotheses ===
H0: Mean(Group1) = Mean(Group2)
H1: Mean(Group1) != Mean(Group2) (alternative='two-sided')

Sample sizes: Group1=6291, Group2=562055

Large sample sizes → use t-test via CLT, skip Shapiro-Wilk

=== Welch's t-test Result ===
t-statistic: -1.5539, p-value: 0.1203
Decision: Do not reject H0




In [None]:
# بررسی تاثیر بودن یا نبودن ویژگی های غیر لاکچری
df_have_non_lux = df[df[non_luxury_features].any(axis=1) == True]
df_have_non_lux.dropna(subset='price_value', inplace=True)
df_doesnt_have_non_lux = df[~df[non_luxury_features].any(axis=1)]
df_doesnt_have_non_lux.dropna(subset='price_value', inplace=True)

In [None]:
col = 'price_value'
i.compare_two_groups(df_have_non_lux[col], df_doesnt_have_non_lux[col])

In [5]:
df[df['cat3_slug'] == 'villa']['price_value'].info()

<class 'pandas.core.series.Series'>
Index: 12899 entries, 0 to 999860
Series name: price_value
Non-Null Count  Dtype  
--------------  -----  
1 non-null      float64
dtypes: float64(1)
memory usage: 201.5 KB


In [None]:
df_rent = df[df['cat3_slug'].isin(['villa', 'house-villa-rent'])]
df_rent['rent_value'] = df_rent['credit_value'] + df_rent['rent_value'] * (100/3)