In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the raw listings table from the CSV export.
df = pd.read_csv('data/listings.csv')

# Data cleaning: strip the '$' currency symbol and the ',' thousands
# separator from 'price', then convert the column to float.
# `regex=True` is passed explicitly because the default value of `regex`
# differs between pandas 1.x and pandas 2.x, and a bare '$' is the
# end-of-string anchor when interpreted as a regular expression. Inside the
# character class [$,] both characters are literal, so one pass removes both
# regardless of the installed pandas version.
df['price'] = df['price'].str.replace(r'[$,]', '', regex=True).astype(float)

# Feature selection (simple first model): keep only these columns.
#   'accommodates' - number of people the place accommodates
#   'bedrooms'     - number of bedrooms
#   'price'        - the target variable to predict
# Rows with a missing value in any of the three columns are dropped.
ml_data = df[['accommodates', 'bedrooms', 'price']].dropna()

# Show the first 5 rows as a sanity check.
ml_data.head()

Unnamed: 0,accommodates,bedrooms,price
0,4,1.0,105.0
1,7,4.0,180.0
2,1,0.0,70.0
3,4,1.0,90.0
4,2,0.0,25.0


In [5]:
# Columns fed to the model as inputs, and the column it has to predict.
feature_columns = ['accommodates', 'bedrooms']
target_column = 'price'

X = ml_data[feature_columns]  # inputs ("questions")
y = ml_data[target_column]    # output ("answers")

# Hold out 20% of the rows as the test set (the "exam"). Fixing random_state
# makes the split identical on every run of this cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each partition to confirm the split.
print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

Training set size: (7052, 2)
Test set size: (1764, 2)


In [6]:
# Model choice: linear regression, because the target (price) is a
# continuous value.
model = LinearRegression()

# Fitting: estimate the intercept and one coefficient per feature from the
# training inputs (X_train) and training targets (y_train).
model.fit(X_train, y_train)

print("Training completed successfully!")

# Inspect the learned parameters. The coefficients follow the column order
# of X_train, so index 0 belongs to 'accommodates' and index 1 to 'bedrooms'
# (assuming X_train was built with the columns in that order).
intercept = model.intercept_
weights = model.coef_

print(f"Base Price (Intercept): {intercept:.2f} €")  # predicted price when both inputs are 0
print(f"Price per Accommodate: {weights[0]:.2f} €")     # change per +1 person of capacity
print(f"Price per Bedroom: {weights[1]:.2f} €")         # change per +1 bedroom

Training completed successfully!
Base Price (Intercept): 23.37 €
Price per Accommodate: 33.16 €
Price per Bedroom: 18.62 €


In [7]:
# Predict: run the fitted model on the held-out inputs (X_test) to get one
# estimated price per test row.
y_pred = model.predict(X_test)

# Evaluate: mean absolute error between the actual prices (y_test) and the
# predicted prices, i.e. the average size of the miss in price units.
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae:.2f} €")

# Put actual and predicted prices side by side; the frame keeps y_test's
# row index, with the predictions matched by position.
comparison = {
    'Actual Price': y_test,
    'Predicted Price': y_pred,
}
results = pd.DataFrame(comparison)

print("\n--- First 5 Comparisons ---")
print(results.head(5))

Mean Absolute Error (MAE): 64.95 €

--- First 5 Comparisons ---
       Actual Price  Predicted Price
6611           70.0       108.320116
10860         380.0       444.041000
7667           55.0       108.320116
12628         205.0       174.647850
13277          55.0       174.647850
