In [2]:
import ast
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from datasets import load_dataset




In [3]:
# Load the raw house-purchase dataset.
# The location can be overridden with the HOUSE_DATA_CSV environment variable so the
# notebook also runs on other machines; when the variable is unset, the original
# local path is used unchanged.
DATA_CSV = os.environ.get(
    "HOUSE_DATA_CSV",
    r"C:\Users\Administrator\Documents\CSV Files\global_house_purchase_dataset.csv",
)
df = pd.read_csv(DATA_CSV)

In [4]:
# Inspect column names, dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 25 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   property_id              200000 non-null  int64  
 1   country                  200000 non-null  object 
 2   city                     200000 non-null  object 
 3   property_type            200000 non-null  object 
 4   furnishing_status        200000 non-null  object 
 5   property_size_sqft       200000 non-null  int64  
 6   price                    200000 non-null  int64  
 7   constructed_year         200000 non-null  int64  
 8   previous_owners          200000 non-null  int64  
 9   rooms                    200000 non-null  int64  
 10  bathrooms                200000 non-null  int64  
 11  garage                   200000 non-null  int64  
 12  garden                   200000 non-null  int64  
 13  crime_cases_reported     200000 non-null  int64  
 14  lega

In [5]:
# Work on a copy so the derived columns added below never touch the raw frame `df`.
df_copy = df.copy()

In [6]:
# Preview the working frame (pandas truncates the display to the first/last rows).
df_copy

Unnamed: 0,property_id,country,city,property_type,furnishing_status,property_size_sqft,price,constructed_year,previous_owners,rooms,...,customer_salary,loan_amount,loan_tenure_years,monthly_expenses,down_payment,emi_to_income_ratio,satisfaction_score,neighbourhood_rating,connectivity_score,decision
0,1,France,Marseille,Farmhouse,Semi-Furnished,991,412935,1989,6,6,...,10745,193949,15,6545,218986,0.16,1,5,6,0
1,2,South Africa,Cape Town,Apartment,Semi-Furnished,1244,224538,1990,4,8,...,16970,181465,20,8605,43073,0.08,9,1,2,0
2,3,South Africa,Johannesburg,Farmhouse,Semi-Furnished,4152,745104,2019,5,2,...,21914,307953,30,2510,437151,0.09,6,8,1,0
3,4,Germany,Frankfurt,Farmhouse,Semi-Furnished,3714,1110959,2008,1,3,...,17980,674720,15,8805,436239,0.33,2,6,6,0
4,5,South Africa,Johannesburg,Townhouse,Fully-Furnished,531,99041,2007,6,3,...,17676,65833,25,8965,33208,0.03,3,3,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,199996,Germany,Berlin,Villa,Fully-Furnished,685,203328,1968,1,3,...,78330,104050,15,17670,99278,0.01,8,4,5,1
199996,199997,China,Shenzhen,Townhouse,Unfurnished,3818,1454627,1977,5,7,...,25400,1175297,20,2865,279330,0.34,7,10,9,1
199997,199998,Japan,Kyoto,Villa,Semi-Furnished,3603,1619147,1990,2,4,...,28220,743049,30,5595,876098,0.17,5,3,9,0
199998,199999,South Africa,Johannesburg,Apartment,Unfurnished,1706,306165,2010,0,4,...,12240,150774,15,16300,155391,0.11,6,10,6,0


In [7]:
# Problem 1: Market Affordability by Country
# Question: which countries offer the most affordable properties once price is
# normalized by property size and by customer income?
#
# Step 3: Create derived metrics
# 3.1 Price per square foot — an element-wise (vectorized) division of two columns;
# no Python loop is involved.
df_copy["price_per_sqft"] = df_copy["price"].div(df_copy["property_size_sqft"])

In [8]:
# Spot-check the new derived column.
df_copy['price_per_sqft']

0         416.685166
1         180.496785
2         179.456647
3         299.127356
4         186.517891
             ...    
199995    296.829197
199996    380.991881
199997    449.388565
199998    179.463658
199999    200.629244
Name: price_per_sqft, Length: 200000, dtype: float64

In [9]:
# 3.2 Price-to-Income Ratio
# Row-level arithmetic is applied by pandas across the whole column at once:
# each property's price is divided by that row's customer salary.
df_copy["price_to_income_ratio"] = df_copy["price"].div(df_copy["customer_salary"])

In [10]:
# Problem 1: Market Affordability by Country
# Question: which countries offer the most affordable properties once price is
# normalized by property size and by customer income?
#
# Step 4: Aggregate by country
# Reporting is built on grouped summaries, so collapse the property-level rows
# into one row per country. Named aggregations give the output columns
# readable names directly; as_index=False keeps `country` as a regular column.
country_affordability = df_copy.groupby("country", as_index=False).agg(
    avg_price_per_sqft=pd.NamedAgg(column="price_per_sqft", aggfunc="mean"),
    avg_price_to_income=pd.NamedAgg(column="price_to_income_ratio", aggfunc="mean"),
    avg_salary=pd.NamedAgg(column="customer_salary", aggfunc="mean"),
)

In [11]:
# Output: one row per country with the three affordability averages.
country_affordability

Unnamed: 0,country,avg_price_per_sqft,avg_price_to_income,avg_salary
0,Australia,320.005783,26.029335,55246.344062
1,Brazil,200.006833,41.384339,21937.720205
2,Canada,350.010676,28.216718,55182.932926
3,China,379.992591,31.185831,54905.907891
4,France,420.019822,34.351656,54988.787113
5,Germany,299.984573,24.608471,55039.591121
6,India,150.009025,60.488816,11016.191639
7,Japan,449.996153,37.056781,54613.85095
8,Singapore,699.961154,57.719955,54877.393638
9,South Africa,180.010247,36.731516,21968.630479


In [12]:
# Step 5: Sort for Insights
# Ascending order on the price-to-income ratio yields the affordability ranking
# (lowest ratio first). The sorted frame is displayed only, not stored.
country_affordability.sort_values("avg_price_to_income", ascending=True)

Unnamed: 0,country,avg_price_per_sqft,avg_price_to_income,avg_salary
5,Germany,299.984573,24.608471,55039.591121
0,Australia,320.005783,26.029335,55246.344062
2,Canada,350.010676,28.216718,55182.932926
3,China,379.992591,31.185831,54905.907891
11,UK,399.983922,33.044174,55104.828716
4,France,420.019822,34.351656,54988.787113
9,South Africa,180.010247,36.731516,21968.630479
7,Japan,449.996153,37.056781,54613.85095
12,USA,499.985016,41.384286,54950.747333
1,Brazil,200.006833,41.384339,21937.720205


In [13]:
# Step 6: Sanity Check (Professional Habit)
# Summary statistics over the country-level rows; used to confirm that no
# aggregated value is extreme or illogical.

country_affordability.describe()

Unnamed: 0,avg_price_per_sqft,avg_price_to_income,avg_salary
count,13.0,13.0,13.0
mean,380.765788,38.535093,46527.087702
std,160.273232,11.308583,16293.584072
min,150.009025,24.608471,11016.191639
25%,299.984573,31.185831,54613.85095
50%,379.992591,36.731516,54950.747333
75%,449.996153,41.384339,55039.591121
max,699.961154,60.488816,55246.344062


In [None]:
# Step 7: Interpretation (Mandatory)
# “Countries with lower average price-to-income ratios indicate stronger affordability, even when absolute prices vary.”

In [None]:
# Problem 1 (Extended): Affordability by Property Size Category
# Refined analytical question: Within each country, how does affordability differ across property size categories?
# This answers: Are smaller properties truly more affordable? Do some countries price larger properties more efficiently?

In [14]:
# Create property_size_category with pd.cut(), which turns a continuous numeric
# column into labelled categorical segments in a single vectorized call.
#
# Segments (right-closed; include_lowest=True also admits the lower edge 0):
#   Small  [0, 1000]    Medium  (1000, 2500]    Large  (2500, 6000]
# Sizes falling outside the outermost edges are assigned NaN by pd.cut.
bins = [0, 1000, 2500, 6000]
labels = ["Small", "Medium", "Large"]

df_copy["property_size_category"] = pd.cut(
    df_copy["property_size_sqft"],
    bins,
    include_lowest=True,
    labels=labels,
)

In [15]:
# Validate the new column (mandatory): row count per size category.
df_copy["property_size_category"].value_counts()

property_size_category
Large     124770
Medium     53887
Small      21343
Name: count, dtype: int64

In [16]:
# Problem 1 (Extended): Affordability by Property Size Category
# Step 1: Group by country and size category.
# observed=True restricts the result to category combinations that actually
# occur in the data (property_size_category is a categorical column).
size_affordability = (
    df_copy.groupby(["country", "property_size_category"], observed=True)
    .agg(
        **{
            "avg_price_per_sqft": ("price_per_sqft", "mean"),
            "avg_price_to_income": ("price_to_income_ratio", "mean"),
        }
    )
    .reset_index()
)

In [17]:
# Step 2: Compare Within Countries
# Ordering by country and then by ascending price-to-income ratio puts the most
# affordable size category first inside each country. Displayed only, not stored.
size_affordability.sort_values(["country", "avg_price_to_income"])

Unnamed: 0,country,property_size_category,avg_price_per_sqft,avg_price_to_income
0,Australia,Small,319.934945,5.799779
1,Australia,Medium,320.017569,14.281337
2,Australia,Large,320.012457,34.363731
3,Brazil,Small,200.037519,9.225889
4,Brazil,Medium,200.016798,22.723482
5,Brazil,Large,199.997353,54.856122
6,Canada,Small,350.155071,6.197419
7,Canada,Medium,349.99536,15.522699
8,Canada,Large,349.992,37.724496
9,China,Small,379.875934,6.790699


In [18]:
# Step 3: Sanity Check
# Look at the first ten rows and confirm that the values make sense and that
# there are no unexpected spikes.
size_affordability.head(10)

Unnamed: 0,country,property_size_category,avg_price_per_sqft,avg_price_to_income
0,Australia,Small,319.934945,5.799779
1,Australia,Medium,320.017569,14.281337
2,Australia,Large,320.012457,34.363731
3,Brazil,Small,200.037519,9.225889
4,Brazil,Medium,200.016798,22.723482
5,Brazil,Large,199.997353,54.856122
6,Canada,Small,350.155071,6.197419
7,Canada,Medium,349.99536,15.522699
8,Canada,Large,349.992,37.724496
9,China,Small,379.875934,6.790699


In [19]:
# Problem 2: Purchase Decision Drivers
# What financial and affordability factors are associated with a customer’s decision to purchase a property?

# Step 1: Understand the Decision Variable
# Why: always validate the target variable first.
# normalize=True returns shares instead of counts, i.e. the overall purchase rate.

df_copy["decision"].value_counts(normalize=True)

decision
0    0.76966
1    0.23034
Name: proportion, dtype: float64

In [20]:
# Step 2: Buyers vs Non-Buyers Comparison
# One row per value of `decision`, holding the mean of each financial indicator.
# `decision` is kept as the index of the summary.
decision_summary = df_copy.groupby("decision").agg(
    avg_salary=pd.NamedAgg(column="customer_salary", aggfunc="mean"),
    avg_price_to_income=pd.NamedAgg(column="price_to_income_ratio", aggfunc="mean"),
    avg_emi_ratio=pd.NamedAgg(column="emi_to_income_ratio", aggfunc="mean"),
    avg_down_payment=pd.NamedAgg(column="down_payment", aggfunc="mean"),
)

decision_summary

Unnamed: 0_level_0,avg_salary,avg_price_to_income,avg_emi_ratio,avg_down_payment
decision,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,45126.489736,41.734389,0.21414,459271.235052
1,51213.735087,27.653271,0.132727,443362.640879


In [None]:
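# Step 3: Interpret the Differences (Critical Thinking)
# You are looking for:
# Lower ratios for buyers
# Higher salaries for buyers
# Higher down payments for buyers

# If you see these patterns, your analysis is behaving logically.
# In the summary above, the first two hold (buyers have lower price-to-income and
# EMI ratios and a higher average salary), but the third does not: the buyers'
# average down payment (~443k) is slightly lower than the non-buyers' (~459k).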

In [21]:
# Step 4: EMI-to-Income Segmentation (Important Learning Step)
# Why: raw ratios are hard to interpret operationally, so bucket them into bands
# with pd.cut().
#
# Bands (right-closed; include_lowest=True also admits a ratio of exactly 0):
#   Low      [0, 0.30]
#   Medium   (0.30, 0.45]
#   High     (0.45, 1.0]
#   Extreme  (1.0, inf)
# The top edge is open-ended (np.inf) instead of the observed maximum: with the
# maximum as the edge, pd.cut raises a ValueError whenever the largest ratio in
# the data is <= 1.0, because the edges are then no longer strictly increasing.
bins = [0, 0.30, 0.45, 1.0, np.inf]
labels = ["Low", "Medium", "High", "Extreme"]

# Read the ratio from df_copy, the working frame every other cell uses, rather
# than from the raw frame df.
df_copy["emi_band"] = pd.cut(
    df_copy["emi_to_income_ratio"], bins=bins, labels=labels, include_lowest=True
)

In [22]:
# Step 5: Purchase Rate by EMI Band
# Key analytics trick: `decision` is coded 0/1, so its mean within a group is
# that group's purchase rate.
emi_conversion = (
    df_copy.groupby(["country", "emi_band"], observed=True)["decision"]
    .agg("mean")
    .rename("purchase_rate")
    .reset_index()
)

In [23]:
# Output: purchase rate for each (country, EMI band) pair present in the data.
emi_conversion

Unnamed: 0,country,emi_band,purchase_rate
0,Australia,Low,0.274791
1,Australia,Medium,0.213282
2,Australia,High,0.0
3,Australia,Extreme,0.0
4,Brazil,Low,0.205444
5,Brazil,Medium,0.153535
6,Brazil,High,0.0
7,Brazil,Extreme,0.0
8,Canada,Low,0.275009
9,Canada,Medium,0.183406


In [24]:
# Size Category & Decision
# Mean of the 0/1 `decision` column per size category, i.e. the purchase rate;
# shows whether certain property sizes convert better.
size_conversion = (
    df_copy.groupby("property_size_category", observed=True)["decision"]
    .mean()
    .to_frame("purchase_rate")
    .reset_index()
)

# Output
size_conversion

Unnamed: 0,property_size_category,purchase_rate
0,Small,0.258164
1,Medium,0.256797
2,Large,0.214154


In [25]:
# Problem 3: Location Quality & Customer Satisfaction
# Step 1: Validate Relevant Columns
# Step 2: Aggregate City-level Metrics
# Why: operations reporting compares entities (cities), not individual records,
# so each city is summarized by its averages plus the number of listings behind
# them. Grouping on (country, city) keeps the country alongside each city.
city_satisfaction = df_copy.groupby(["country", "city"], as_index=False).agg(
    avg_satisfaction=("satisfaction_score", "mean"),
    avg_neighbourhood=("neighbourhood_rating", "mean"),
    avg_connectivity=("connectivity_score", "mean"),
    property_count=("property_id", "count"),
)

In [26]:
# Output: one row per (country, city) with average scores and listing count.
city_satisfaction

Unnamed: 0,country,city,avg_satisfaction,avg_neighbourhood,avg_connectivity,property_count
0,Australia,Brisbane,5.512473,5.557174,5.476352,5011
1,Australia,Melbourne,5.47041,5.493855,5.485914,5289
2,Australia,Sydney,5.449825,5.445158,5.384481,5142
3,Brazil,Rio de Janeiro,5.521984,5.519105,5.498953,7642
4,Brazil,São Paulo,5.488975,5.458543,5.503933,7755
5,Canada,Montreal,5.518584,5.56162,5.487285,5112
6,Canada,Toronto,5.411545,5.453507,5.496651,5076
7,Canada,Vancouver,5.530788,5.491272,5.477652,5213
8,China,Beijing,5.544294,5.524587,5.586149,5328
9,China,Shanghai,5.515646,5.463237,5.492417,5209


In [27]:
# Step 3: Filter for Reliable Cities (Optional filtering)
# Cities backed by very few listings can distort insights, so keep only those
# with at least 1000 properties. The filtered view is displayed, not stored.
city_satisfaction.loc[city_satisfaction["property_count"].ge(1000)]

Unnamed: 0,country,city,avg_satisfaction,avg_neighbourhood,avg_connectivity,property_count
0,Australia,Brisbane,5.512473,5.557174,5.476352,5011
1,Australia,Melbourne,5.47041,5.493855,5.485914,5289
2,Australia,Sydney,5.449825,5.445158,5.384481,5142
3,Brazil,Rio de Janeiro,5.521984,5.519105,5.498953,7642
4,Brazil,São Paulo,5.488975,5.458543,5.503933,7755
5,Canada,Montreal,5.518584,5.56162,5.487285,5112
6,Canada,Toronto,5.411545,5.453507,5.496651,5076
7,Canada,Vancouver,5.530788,5.491272,5.477652,5213
8,China,Beijing,5.544294,5.524587,5.586149,5328
9,China,Shanghai,5.515646,5.463237,5.492417,5209


In [28]:
# Use a Data-Driven Threshold (If You Must)
# If a stakeholder insists on a rule, derive it from the data, not arbitrarily:
# inspect the distribution of listings per city first.


city_satisfaction["property_count"].describe()

# Then choose something like:
#   - the bottom 5th percentile, or
#   - a clearly justified operational threshold.
# But again: this dataset does not require such a filter.

count       40.000000
mean      5000.000000
std       2312.347359
min       2510.000000
25%       3103.250000
50%       5075.500000
75%       5253.000000
max      15278.000000
Name: property_count, dtype: float64

In [29]:
# Step 4: Identify Top-Performing Cities (Required)
# Purpose: operations and reporting analysts are expected to rank entities,
# surface top and bottom performers, and support prioritization.
# Highest average satisfaction comes first.
top_cities = city_satisfaction.sort_values("avg_satisfaction", ascending=False)

# Output
top_cities

Unnamed: 0,country,city,avg_satisfaction,avg_neighbourhood,avg_connectivity,property_count
20,India,Hyderabad,5.619603,5.39401,5.502139,2571
19,India,Delhi,5.604336,5.478126,5.535811,2583
29,UAE,Abu Dhabi,5.580757,5.492671,5.519456,7504
36,USA,Houston,5.565188,5.440814,5.570524,2999
28,South Africa,Johannesburg,5.549922,5.589212,5.478345,7712
31,UK,Birmingham,5.548412,5.388818,5.497586,3935
8,China,Beijing,5.544294,5.524587,5.586149,5328
12,France,Marseille,5.543168,5.505255,5.456644,5328
7,Canada,Vancouver,5.530788,5.491272,5.477652,5213
17,India,Bangalore,5.52501,5.502908,5.570764,2579


In [30]:
# Step 5: Analyze Contribution Relationships
# This is where the analysis moves from descriptive to diagnostic. We ask:
# when satisfaction is high, are neighbourhood and connectivity also high?
# The pairwise correlation matrix is computed on the city-level averages.
city_satisfaction.loc[
    :, ["avg_satisfaction", "avg_neighbourhood", "avg_connectivity"]
].corr()

# How an analyst interprets this
# You are not predicting, you are diagnosing:
#   Moderate correlation → contributing factor
#   Weak correlation → low influence
# This is diagnostic analytics, which is expected even in reporting roles.

Unnamed: 0,avg_satisfaction,avg_neighbourhood,avg_connectivity
avg_satisfaction,1.0,-0.127385,0.26994
avg_neighbourhood,-0.127385,1.0,-0.027737
avg_connectivity,0.26994,-0.027737,1.0


In [None]:
# Problem 4: Market Risk & Price Stability
# Business framing
# Operational question: which countries show higher price volatility,
# indicating higher market risk?
# This supports risk monitoring, market entry decisions, and reporting of
# stability metrics.

# Step 1: Aggregate Country-Level Price Metrics
#   price_std              -> volatility
#   min_price / max_price  -> spread context
#   property_count         -> reliability of the estimate
country_price_stats = df_copy.groupby("country", as_index=False).agg(
    avg_price=pd.NamedAgg(column="price", aggfunc="mean"),
    price_std=pd.NamedAgg(column="price", aggfunc="std"),
    min_price=pd.NamedAgg(column="price", aggfunc="min"),
    max_price=pd.NamedAgg(column="price", aggfunc="max"),
    property_count=pd.NamedAgg(column="property_id", aggfunc="count"),
)

# Output
country_price_stats

Unnamed: 0,country,avg_price,price_std,min_price,max_price,property_count
0,Australia,1028264.0,515328.3,123623,1924825,15442
1,Brazil,640083.3,322486.5,75838,1203161,15397
2,Canada,1112176.0,565000.0,137664,2102863,15401
3,China,1219038.0,612256.1,149336,2282320,15536
4,France,1343234.0,678644.9,163842,2523371,15628
5,Germany,958093.3,483117.4,116321,1803663,15408
6,India,477798.0,241828.2,56288,904447,15357
7,Japan,1437475.0,730950.2,175735,2703971,15317
8,Singapore,2239027.0,1126847.0,275233,4202732,15278
9,South Africa,573335.4,291943.0,68096,1083082,15401


In [32]:
# Step 2: Normalize Volatility (Critical Step)
# A raw standard deviation is misleading across different price levels, so
# express it relative to the mean price (std / mean). The resulting ratio is
# comparable across countries.
country_price_stats["price_volatility_ratio"] = country_price_stats["price_std"].div(
    country_price_stats["avg_price"]
)

In [33]:
# Step 3: Filter for Data Reliability (Optional)
# Keep only countries backed by at least 50 listings.
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so adding or overwriting columns on it later (for example when the volatility
# cell is re-run) does not trigger pandas' SettingWithCopyWarning.
country_price_stats = country_price_stats.loc[
    country_price_stats["property_count"] >= 50
].copy()

In [None]:
# Output: country price statistics including the normalized volatility ratio.
country_price_stats

Unnamed: 0,country,avg_price,price_std,min_price,max_price,property_count,price_volatility_ratio
0,Australia,1028264.0,515328.3,123623,1924825,15442,0.501163
1,Brazil,640083.3,322486.5,75838,1203161,15397,0.50382
2,Canada,1112176.0,565000.0,137664,2102863,15401,0.508013
3,China,1219038.0,612256.1,149336,2282320,15536,0.502245
4,France,1343234.0,678644.9,163842,2523371,15628,0.505232
5,Germany,958093.3,483117.4,116321,1803663,15408,0.504249
6,India,477798.0,241828.2,56288,904447,15357,0.506131
7,Japan,1437475.0,730950.2,175735,2703971,15317,0.508496
8,Singapore,2239027.0,1126847.0,275233,4202732,15278,0.503275
9,South Africa,573335.4,291943.0,68096,1083082,15401,0.509201


In [None]:
# Step 4: Rank Countries by Market Risk
# Highest relative volatility (std / mean) first.
price_risk_markets = country_price_stats.sort_values(
    "price_volatility_ratio",
    ascending=False,
)

# Output
price_risk_markets

Unnamed: 0,country,avg_price,price_std,min_price,max_price,property_count,price_volatility_ratio
10,UAE,1907340.0,971331.1,237071,3603554,15141,0.50926
9,South Africa,573335.4,291943.0,68096,1083082,15401,0.509201
7,Japan,1437475.0,730950.2,175735,2703971,15317,0.508496
2,Canada,1112176.0,565000.0,137664,2102863,15401,0.508013
6,India,477798.0,241828.2,56288,904447,15357,0.506131
4,France,1343234.0,678644.9,163842,2523371,15628,0.505232
5,Germany,958093.3,483117.4,116321,1803663,15408,0.504249
1,Brazil,640083.3,322486.5,75838,1203161,15397,0.50382
8,Singapore,2239027.0,1126847.0,275233,4202732,15278,0.503275
3,China,1219038.0,612256.1,149336,2282320,15536,0.502245


In [None]:
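# Step 5: Interpret Market Risk
# How an analyst explains this:
# High volatility ratio → unstable pricing → higher risk
# Low volatility ratio → predictable market → operational stability
# Note: in the table above the ratios span only about 0.50–0.51, so the ranking
# separates countries by a very small margin.

# This is not prediction; it is risk profiling, which is appropriate for a reporting role.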

In [None]:
# Problem 4: Risk & Market Stability Analysis
# Step 1: Aggregate country-level risk metrics — average reported crime cases,
# average legal cases per property, and the purchase rate (mean of the 0/1
# `decision` column).
country_risk = (
    df_copy.groupby("country")
    .agg(
        **{
            "avg_crime": ("crime_cases_reported", "mean"),
            "avg_legal_cases": ("legal_cases_on_property", "mean"),
            "purchase_rate": ("decision", "mean"),
        }
    )
    .reset_index()
)

# Display sorted by purchase rate, lowest first (not stored).
country_risk.sort_values("purchase_rate")

Unnamed: 0,country,avg_crime,avg_legal_cases,purchase_rate
12,USA,1.984294,0.249656,0.175774
1,Brazil,1.99948,0.244203,0.179515
9,South Africa,2.006168,0.249529,0.182196
8,Singapore,1.009818,0.25756,0.214033
6,India,0.991274,0.248746,0.217686
10,UAE,1.004623,0.25005,0.233406
7,Japan,0.997389,0.251094,0.245544
4,France,0.99744,0.247057,0.248848
11,UK,0.992993,0.246221,0.25472
2,Canada,1.001818,0.246672,0.257061


In [38]:
# Step 2: Normalize Risk Interpretation (High vs Low Risk)
# Raw crime and legal counts are hard to interpret operationally; what matters
# is how a country compares with the others. Create risk bands (optional but
# strong) with pd.qcut(), which:
#   - splits the countries into quantile-based groups of (roughly) equal size,
#   - is common in reporting and policy analysis,
#   - avoids arbitrary thresholds.
country_risk["crime_risk_level"] = pd.qcut(
    country_risk["avg_crime"],
    3,
    labels=["Low", "Medium", "High"],
)
country_risk["legal_risk_level"] = pd.qcut(
    country_risk["avg_legal_cases"],
    3,
    labels=["Low", "Medium", "High"],
)

In [39]:
# Step 3: Analyze Risk Impact on Purchase Decisions
# Key question: do higher crime or legal risks correspond to lower purchase rates?
# The pairwise correlation is computed on the property-level rows of df_copy.
risk_correlation = df_copy.loc[
    :, ["crime_cases_reported", "legal_cases_on_property", "decision"]
].corr()

risk_correlation

# How to interpret this:
#   Negative correlation with decision → risk discourages purchases
#   Weak or no correlation → buyers may tolerate or ignore certain risks
#   Stronger legal correlation than crime → legal clarity matters more than
#   safety perception
# This is diagnostic analytics, not prediction.

Unnamed: 0,crime_cases_reported,legal_cases_on_property,decision
crime_cases_reported,1.0,-0.001188,-0.16608
legal_cases_on_property,-0.001188,1.0,-0.314936
decision,-0.16608,-0.314936,1.0


In [None]:
# Step 4: Identify High-Risk but High-Conversion Markets
# Why this is critical — these markets:
#   - look attractive on the surface,
#   - carry hidden operational or legal exposure,
#   - require policy, compliance, or mitigation strategies.
#
# A country qualifies when EITHER its crime band OR its legal band is "High";
# the result is ordered from highest to lowest purchase rate.
# NOTE(review): the filter does not restrict on purchase_rate, so low-conversion
# high-risk markets are included too — "high conversion" is expressed only
# through the sort order.
operational_risk_markets = country_risk.loc[
    country_risk["crime_risk_level"].eq("High")
    | country_risk["legal_risk_level"].eq("High")
].sort_values("purchase_rate", ascending=False)

operational_risk_markets

# Naming note:
# price_risk_markets and operational_risk_markets now coexist, and each has a
# clear analytical meaning. The notebook reads like a report, not a script —
# this is how analysts should name variables.

Unnamed: 0,country,avg_crime,avg_legal_cases,purchase_rate,crime_risk_level,legal_risk_level
7,Japan,0.997389,0.251094,0.245544,Low,High
10,UAE,1.004623,0.25005,0.233406,Medium,High
8,Singapore,1.009818,0.25756,0.214033,High,High
9,South Africa,2.006168,0.249529,0.182196,High,Medium
1,Brazil,1.99948,0.244203,0.179515,High,Low
12,USA,1.984294,0.249656,0.175774,High,High


In [43]:
# Step 5: (Optional Drill-Down) City-Level Risk Analysis
# The same risk metrics as the country view, one level deeper, plus the number
# of listings behind each city. Useful for stronger portfolio depth.
city_risk = (
    df_copy.groupby(["country", "city"])
    .agg(
        avg_crime=pd.NamedAgg(column="crime_cases_reported", aggfunc="mean"),
        avg_legal=pd.NamedAgg(column="legal_cases_on_property", aggfunc="mean"),
        purchase_rate=pd.NamedAgg(column="decision", aggfunc="mean"),
        property_count=pd.NamedAgg(column="property_id", aggfunc="count"),
    )
    .reset_index()
)

# Output: lowest purchase rate first; ties broken by listing count.
city_risk.sort_values(["purchase_rate", "property_count"])

Unnamed: 0,country,city,avg_crime,avg_legal,purchase_rate,property_count
39,USA,San Francisco,1.970893,0.267464,0.172057,3092
35,USA,Chicago,2.017058,0.247827,0.173157,3107
37,USA,Los Angeles,1.92,0.248667,0.176,3000
36,USA,Houston,2.025675,0.252084,0.176059,2999
3,Brazil,Rio de Janeiro,1.990055,0.242345,0.178749,7642
4,Brazil,São Paulo,2.008769,0.246035,0.180271,7755
28,South Africa,Johannesburg,2.012967,0.250908,0.180887,7712
38,USA,New York,1.987026,0.232241,0.181641,3083
27,South Africa,Cape Town,1.99935,0.248147,0.183509,7689
18,India,Chennai,0.989243,0.245817,0.205578,2510


In [None]:
# (Advanced but Valuable) Combined Risk View
# With price risk and operational risk held in separate, clearly named frames,
# they can be joined to view markets across multiple risk dimensions.
# The inner join on `country` keeps only countries present in BOTH frames.
# NOTE(review): no threshold is applied to price_volatility_ratio here, so the
# join attaches the volatility figure to the operational-risk markets rather
# than selecting markets that are also high on price risk.
combined_risk_markets = pd.merge(
    price_risk_markets[["country", "price_volatility_ratio"]],
    operational_risk_markets[
        ["country", "avg_crime", "avg_legal_cases", "purchase_rate"]
    ],
    how="inner",
    on="country",
)

combined_risk_markets.sort_values("purchase_rate")

Unnamed: 0,country,price_volatility_ratio,avg_crime,avg_legal_cases,purchase_rate
5,USA,0.500867,1.984294,0.249656,0.175774
3,Brazil,0.50382,1.99948,0.244203,0.179515
1,South Africa,0.509201,2.006168,0.249529,0.182196
4,Singapore,0.503275,1.009818,0.25756,0.214033
0,UAE,0.50926,1.004623,0.25005,0.233406
2,Japan,0.508496,0.997389,0.251094,0.245544
