In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the raw data set. The path is relative, so Kangaroo.csv must be in the
# notebook's working directory.
df = pd.read_csv('Kangaroo.csv') # Load a CSV

In [None]:
# Preview the first rows of the raw frame as a quick sanity check of the load.
df.head()

In [None]:
# (number of rows, number of columns) of the raw frame.
df.shape

In [None]:
# Column labels of the raw frame.
df.columns


In [None]:
# Keep the raw column labels as a plain Python list and echo it.
df_columns = list(df.columns)
df_columns

In [None]:
# Per-column dtype and non-null count, plus the frame's memory footprint.
df.info() # Summary info of the data

In [None]:
# Count the missing entries in each column of the raw frame.
df.isna().sum()

### Summary
- Shape: 80,368 x 53
- Some columns are likely unnecessary: `url`, `id` (?), `Unnamed: 0`
- Many columns are missing a lot of information:
  - 100% missing: `monthlyCost`, `hasBalcony`, `accessibleDisabledPeople`

## Data cleaning
1. Remove duplicates: no duplicate rows were found in the data set
2. Remove irrelevant columns: `url`, `Unnamed: 0`
3. Remove columns that are missing most of their values (more than 70%)


In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum() # Check for duplicate rows


In [None]:
# 1. Drop duplicates
# Intentionally disabled: the data-cleaning notes record that the data set has
# no duplicate rows, so this step would be a no-op. Kept for reference only.
#df_cleaned = df.drop_duplicates()
#df_cleaned.shape

In [None]:
# 2. Drop irrelevant columns
# "Unnamed: 0" and "url" are listed as irrelevant in the data-cleaning notes.
# `df.drop` returns a new frame, so the raw `df` stays untouched.
df_cleaned = df.drop(columns=["Unnamed: 0", "url"])

# Only the last expression of a cell is echoed, so the shape has to be printed
# explicitly; a bare `df_cleaned.shape` here would be silently discarded.
print(df_cleaned.shape)
df_cleaned.head()


In [None]:
# 3. Trim whitespace in strings

# Object-dtype columns hold strings or mixed Python objects.
str_cols = df_cleaned.select_dtypes(include='object').columns


def _strip_if_text(cell):
    """Return `cell` without surrounding whitespace when it is a str; otherwise unchanged."""
    if isinstance(cell, str):
        return cell.strip()
    return cell


for col in str_cols:
    # Element-wise so that non-string entries (e.g. NaN) pass through untouched.
    df_cleaned[col] = df_cleaned[col].apply(_strip_if_text)

# Summary statistics over every column, numeric and non-numeric alike.
df_cleaned.describe(include='all')



In [None]:
# Percentage of missing values per column, most incomplete columns first.
null_fraction = df_cleaned.isna().mean()
missing_percent = null_fraction.sort_values(ascending=False).mul(100)
missing_percent

In [None]:
# 4. Handle missing values
# Drop columns with over 70% missing.
# The mask is recomputed from the current frame instead of being read from the
# `missing_percent` Series built in an earlier cell. That keeps this cell
# idempotent: re-running it drops nothing, rather than raising a KeyError for
# columns that were already removed.
sparse_mask = df_cleaned.isnull().mean() * 100 > 70
df_cleaned = df_cleaned.drop(columns=sparse_mask[sparse_mask].index)



In [None]:
# Preview the frame after the sparse columns were dropped.
df_cleaned.head()

In [None]:
# Re-check the per-column missing percentage on the current cleaned frame,
# most incomplete columns first.
df_cleaned.isna().mean().sort_values(ascending=False).mul(100)

In [None]:
# Column labels that remain in the cleaned frame, as a plain list.
df_cleaned.columns.tolist()

### 2. Data Analysis
- Linear (Pearson) correlation of each numeric column with `price`

In [None]:
# (rows, columns) of the cleaned frame that the analysis below works on.
df_cleaned.shape

In [None]:
# Pearson correlation of every numeric column with price, strongest positive
# first. The 'price' row itself is part of the result (self-correlation).
pearson_matrix = df_cleaned.corr(numeric_only=True)
corr_with_price = pearson_matrix['price'].sort_values(ascending=False)

# Show both ends of the ranking.
strongest_positive = corr_with_price.head(10)
strongest_negative = corr_with_price.tail(10)
print("Top positive correlations:\n", strongest_positive)
print("\nTop negative correlations:\n", strongest_negative)

In [None]:
# 1) Keep only the numeric columns of the cleaned frame
numeric_df = df_cleaned.select_dtypes(include='number')

# 2) Pairwise Pearson correlation between all numeric columns
corr_matrix = numeric_df.corr(method='pearson')

In [None]:
# Heatmap of the full correlation matrix, with every coefficient written into its cell.
fig, ax = plt.subplots(figsize=(20, 16))
coefficients = corr_matrix.values
heat_image = ax.matshow(coefficients, cmap='coolwarm', vmin=-1, vmax=1)
fig.colorbar(heat_image, shrink=0.8)

# Label both axes with the column names, one tick per column.
column_labels = corr_matrix.columns
tick_positions = np.arange(len(column_labels))
ax.set_xticks(tick_positions)
ax.set_xticklabels(column_labels, rotation=90)
ax.set_yticks(tick_positions)
ax.set_yticklabels(column_labels)

# Write each coefficient at its (column, row) position; matshow puts the
# row index on the y axis and the column index on the x axis.
for (row, col), coefficient in np.ndenumerate(coefficients):
    ax.text(col, row, f"{coefficient:.2f}", ha='center', va='center', fontsize=7)

ax.set_title("Fully Annotated Correlation Matrix", pad=20)
fig.tight_layout()
plt.show()

In [None]:
# Rank features by the absolute strength of their Pearson correlation with price.
# 'price' is removed by label rather than by skipping position 0, so the result
# does not depend on the self-correlation sorting first (ties at |r| == 1 could
# reorder it). Undefined correlations (NaN, e.g. constant columns) are dropped
# so they can never appear in either list.
abs_corr = corr_with_price.drop('price').abs().dropna()

most_influential = abs_corr.sort_values(ascending=False).index[:5]
least_influential = abs_corr.sort_values(ascending=True).index[:5]

print("Top 5 most influential:", most_influential.tolist())
print("Top 5 least influential:", least_influential.tolist())

In [None]:
# Split the columns by dtype family:
#   quantitative -> numeric dtypes
#   qualitative  -> object / category dtypes
quant_cols = list(df_cleaned.select_dtypes(include='number').columns)
qual_cols = list(df_cleaned.select_dtypes(include=['object', 'category']).columns)

print(f"Quantitative ({len(quant_cols)}): {quant_cols}")
print(f"Qualitative ({len(qual_cols)}): {qual_cols}")

In [None]:
# Absolute and relative (percent, 2 decimals) number of missing values per column.
missing_count = df_cleaned.isna().sum()
missing_pct = missing_count.div(len(df_cleaned)).mul(100).round(2)

# One row per column, most incomplete first.
missing_summary = pd.DataFrame(
    {'missing_count': missing_count, 'missing_pct': missing_pct}
)
missing_summary = missing_summary.sort_values(by='missing_pct', ascending=False)

# Ten columns with the highest share of missing values.
print(missing_summary.head(10))

In [None]:
# Distribution of price, split into 50 bins.
fig, ax = plt.subplots()
ax.hist(df_cleaned['price'], bins=50)
ax.set_title("Price Distribution")
ax.set_xlabel("Price")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Price against habitable surface; the low alpha keeps dense regions readable.
fig, ax = plt.subplots()
ax.scatter(df_cleaned['habitableSurface'], df_cleaned['price'], alpha=0.3)
ax.set_title("Price vs. Habitable Surface")
ax.set_xlabel("Surface (m²)")
ax.set_ylabel("Price")
plt.show()

In [None]:
# One price box per province, to spot outliers within each province.
fig, ax = plt.subplots()
sns.boxplot(data=df_cleaned, x='province', y='price', ax=ax)
plt.xticks(rotation=45)  # tilt the province labels so they do not overlap
ax.set_title("Price by Province")
plt.show()

### Non-linear (monotonic) correlation
- Spearman rank correlation of each numeric column with `price`

In [None]:
# Spearman rank correlation of every numeric column with price, strongest
# positive first. `numeric_only=True` is required because df_cleaned still
# contains object columns: since pandas 2.0, `DataFrame.corr` no longer skips
# them silently and raises instead. This also matches the Pearson cell.
df_cleaned.corr(method='spearman', numeric_only=True)['price'].sort_values(ascending=False)

In [None]:
# Spearman rank correlation between all numeric columns, then the 'price'
# column of that matrix without its self-correlation, strongest positive first.
spearman_corr = numeric_df.corr(method='spearman')
price_column = spearman_corr['price']
spearman_price_corr = price_column.drop('price').sort_values(ascending=False)

# Visualize as a horizontal bar chart, one bar per feature.
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue`; the call is left unchanged here so the figure stays identical.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=spearman_price_corr.values,
    y=spearman_price_corr.index,
    palette='viridis',
    ax=ax,
)
ax.set_title("Spearman Correlation with Price")
ax.set_xlabel("Spearman correlation coefficient")
fig.tight_layout()
plt.show()

In [None]:
# Scatterplots for visual inspection.
# Features chosen because their Pearson correlation is low but they may still matter.
features_to_check = ['buildingConstructionYear', 'terraceSurface', 'landSurface']

# One small figure per feature, each plotted against price.
for feature in features_to_check:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.scatterplot(x=numeric_df[feature], y=numeric_df['price'], ax=ax)
    ax.set_title(f"Scatterplot: {feature} vs Price")
    ax.set_xlabel(feature)
    ax.set_ylabel('Price')
    fig.tight_layout()
    plt.show()

In [None]:
# How many rows have each distinct bedroomCount value, most frequent first.
df_cleaned['bedroomCount'].value_counts()
# Disabled helper for inspecting the row with the largest bedroomCount:
#df.iloc[[df['bedroomCount'].idxmax()]]

In [None]:
# How many rows have each distinct bathroomCount value, most frequent first.
df_cleaned['bathroomCount'].value_counts()



In [None]:
# Show the row with the largest bathroomCount (as a one-row frame).
# `idxmax()` returns an index *label*, so it must be looked up with the
# label-based `.loc`. The positional `.iloc` only gives the same row while the
# index is an untouched 0..n-1 range, and picks the wrong row (or raises) once
# rows have been filtered or reordered.
df_cleaned.loc[[df_cleaned['bathroomCount'].idxmax()]]