In [1]:
import pandas as pd

df_loc = pd.read_csv("../data/uk_rent_clean_rooms_price_location.csv")

df_loc.head()


Unnamed: 0,Location,Rooms_numeric,Price_numeric
0,"Birmingham, West Midla...",1.0,625
1,"Birmingham, West Midla...",1.0,625
2,"London, Greater London...",3.0,6000
3,"London, Greater London...",2.0,1400
4,"London, Greater London...",2.0,1400


In [2]:
print("df_loc info:")
df_loc.info()

print("\n The number of the unique locations:", df_loc["Location"].nunique())

print("\nTop 20 most frequent locations:")
print(df_loc["Location"].value_counts().head(20))


df_loc info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38952 entries, 0 to 38951
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Location       38952 non-null  object 
 1   Rooms_numeric  38952 non-null  float64
 2   Price_numeric  38952 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 913.1+ KB

 The number of the unique locations: 6

Top 20 most frequent locations:
Location
London, Greater London                           34321
Birmingham, West Midlands                         3314
Liverpool, Merseyside                             1104
Harrow, Harrow                                     142
Leeds, Leeds                                        70
Bromley, Bromley                                     1
Name: count, dtype: int64


In [3]:
#features before encoding
X_raw = df_loc[["Location", "Rooms_numeric"]].copy()

y = df_loc["Price_numeric"]

print("X_raw shape: ", X_raw.shape)
print("y shape: ", y.shape)

X_raw.head()

X_raw shape:  (38952, 2)
y shape:  (38952,)


Unnamed: 0,Location,Rooms_numeric
0,"Birmingham, West Midla...",1.0
1,"Birmingham, West Midla...",1.0
2,"London, Greater London...",3.0
3,"London, Greater London...",2.0
4,"London, Greater London...",2.0


In [6]:
#To remove and clean the whitespaces
df_loc["Location"] = df_loc["Location"].str.strip()


In [7]:
X = pd.get_dummies(
    X_raw,
    columns=["Location"],
    drop_first = True
)

print("X shape after encoding:", X.shape)
X.head()

X shape after encoding: (38952, 6)


Unnamed: 0,Rooms_numeric,"Location_ Bromley, Bromley","Location_ Harrow, Harrow","Location_ Leeds, Leeds","Location_ Liverpool, Merseyside","Location_ London, Greater London"
0,1.0,False,False,False,False,False
1,1.0,False,False,False,False,False
2,3.0,False,False,False,False,True
3,2.0,False,False,False,False,True
4,2.0,False,False,False,False,True


In [9]:
print("Columns in X:")
print(X.columns.tolist())


Columns in X:
['Rooms_numeric', 'Location_                        Bromley, Bromley                    ', 'Location_                        Harrow, Harrow                    ', 'Location_                        Leeds, Leeds                    ', 'Location_                        Liverpool, Merseyside                    ', 'Location_                        London, Greater London                    ']


In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size= 0.2,
    random_state= 42
)

print("Train size:", X_train.shape[0])
print("Test size:", X_test.shape[0])
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train size: 31161
Test size: 7791
Train shape: (31161, 6)
Test shape: (7791, 6)


In [11]:
from sklearn.linear_model import LinearRegression

lin_reg_loc = LinearRegression()
lin_reg_loc.fit(X_train, y_train)


print("Intercept (bias):", lin_reg_loc.intercept_)
print("\nCoefficients for each feature:")

for name, coef in zip(X.columns, lin_reg_loc.coef_):
    print(f"{name}: {coef}")



Intercept (bias): 410.61931082233104

Coefficients for each feature:
Rooms_numeric: 383.9542158472629
Location_                        Bromley, Bromley                    : 0.0
Location_                        Harrow, Harrow                    : -120.00512483902222
Location_                        Leeds, Leeds                    : 260.5638257886113
Location_                        Liverpool, Merseyside                    : -794.5359023523466
Location_                        London, Greater London                    : 1583.683695707935


In [12]:
from sklearn.metrics import mean_squared_error, r2_score

#Predicting on the testing set
y_pred = lin_reg_loc.predict(X_test)

# Metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f} £")
print(f"R^2:  {r2:.3f}")

RMSE: 2121.31 £
R^2:  0.081


## Model with Rooms + Location

### Data used
- Dataset: `uk_rent_clean_rooms_price_location.csv`
- Number of rows: ~38,952
- Features:
  - `Rooms_numeric`
  - One-hot encoded `Location` (5 dummy columns, Birmingham as base)
- Target:
  - `Price_numeric` – monthly rent in £

### Model
- Model type: **Linear Regression**
- Train/test split: 80% train, 20% test (`random_state=42`)
- Learned coefficient for rooms:
  - `Rooms_numeric` ≈ 383.95 £ per additional room (keeping location fixed)
- Location effects (relative to **Birmingham**):
  - London ≈ +1583.68 £
  - Liverpool ≈ –794.54 £
  - Leeds ≈ +260.56 £
  - Harrow ≈ –120.01 £
  - Bromley ≈ 0 £ (only 1 sample)

### Performance (on test set)
- **RMSE** ≈ 2121.31 £  
- **R²** ≈ 0.081

### Interpretation
- Adding **Location** improved the model:
  - RMSE dropped from ~2199 £ => ~2121 £
  - R² increased from ~0.013 => ~0.081
- The model has learned sensible patterns:
  - London is more expensive than Birmingham,
  - Liverpool tends to be cheaper, etc.
- However, the overall R² is still low, suggesting that:
  - Number of rooms + coarse location alone are **not enough** to accurately predict rent.
  - Other factors (more detailed location, property type, description, etc.) likely play a large role.
