In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind

In [3]:
# Read the raw CSV into `df`; the cleaning cell builds `df_cleaned` from this frame.
df = pd.read_csv(filepath_or_buffer='Kangaroo.csv')

In [None]:
# Start from the raw frame minus the two columns that are not used in the analysis.
df_cleaned = df.drop(columns=["Unnamed: 0", "url"])

# Object-dtype columns can mix strings with other values (e.g. NaN), so only
# genuine `str` entries are trimmed; everything else passes through unchanged.
str_cols = df_cleaned.select_dtypes(include='object').columns
for col in str_cols:
    df_cleaned[col] = df_cleaned[col].map(
        lambda value: value.strip() if isinstance(value, str) else value
    )

# Summary statistics for every column, numeric and non-numeric alike.
df_cleaned.describe(include='all')

In [None]:
# Remove every row that has no price.
df_cleaned = df_cleaned.dropna(axis='index', how='any', subset=['price'])

### How to decide: keep or drop the `hasSwimmingPool` column?
1. Missing values: about 97.74% of the rows have no value for `hasSwimmingPool`.
2. Filter the top 10% most expensive houses and check how many of them report a pool.
3. Compute the Spearman correlation between price and the pool indicators.
4. Compare mean price: compare the average price of properties with and without a swimming pool.
5. Test statistical significance: if p_value < 0.05, the price difference is statistically significant → keep the feature.

**Conclusion:** the feature should be kept.


### Filter the top 10% most expensive houses:

Select the top 10% most expensive properties (price at or above the 90th percentile).

Counts how many of those have hasSwimmingPool = True, and how many are missing (NaN).

In [None]:
# Keep the listings priced at or above the 90th percentile, then tally the pool
# flag inside that slice (missing values are counted as their own category).
top_10 = df_cleaned.loc[
    df_cleaned['price'].ge(df_cleaned['price'].quantile(q=0.90))
]
top_10['hasSwimmingPool'].value_counts(dropna=False)

In [None]:
# Spearman correlation of price against the swimming-pool indicators (see the next cell).

In [None]:
# Treat a missing pool value as "no pool" and store the flag as a 0/1 integer.
# Without the cast the column can stay object-dtype after fillna (booleans mixed
# with 0), and older pandas versions silently leave non-numeric columns out of
# DataFrame.corr.
df_cleaned['hasSwimmingPool'] = df_cleaned['hasSwimmingPool'].fillna(0).astype(int)

# Explicit indicator of which values were originally missing (1 = missing).
# It is read from the raw `df`, not from the column filled above, and is aligned
# to `df_cleaned` by index label on assignment, so re-running the cell gives the
# same flags.
df_cleaned['swimmingPool_missing'] = df['hasSwimmingPool'].isna().astype(int)

# Last expression of the cell: rendered as a rich table instead of printed text.
df_cleaned[['price', 'hasSwimmingPool', 'swimmingPool_missing']].corr(method='spearman')

In [None]:
# Compare mean price.
# Build the comparison on a separate frame so the helper column is not added to
# df_cleaned; any remaining missing pool value is counted as 0.
df_swim = df_cleaned.assign(
    swimmingPool_filled=df_cleaned['hasSwimmingPool'].fillna(0)
)

# Average price within each swimmingPool_filled group (0 and 1).
df_swim.groupby(by='swimmingPool_filled')['price'].mean()

In [None]:
# Box plot of price for each swimmingPool_filled group; seaborn returns the Axes
# it drew on, which is then labelled directly.
ax = sns.boxplot(data=df_swim, x='swimmingPool_filled', y='price')
ax.set_title("Price Distribution: With vs. Without Swimming Pool")
ax.set_xlabel("Has Swimming Pool (0=No, 1=Yes)")
ax.set_ylabel("Price")
plt.show()

In [None]:
# Statistical test: two-sample t-test on price, listings with a pool vs. without.
# `equal_var=False` selects Welch's variant (no equal-variance assumption) and
# `nan_policy='omit'` ignores any NaN prices. `ttest_ind` comes from the
# notebook's import cell rather than being imported mid-notebook.
has_pool = df_swim.loc[df_swim['swimmingPool_filled'] == 1, 'price']
no_pool = df_swim.loc[df_swim['swimmingPool_filled'] == 0, 'price']

t_stat, p_value = ttest_ind(has_pool, no_pool, equal_var=False, nan_policy='omit')
print(f"T-test p-value: {p_value}")