In [None]:
# Airbnb Price Prediction Project

In [None]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
# Step 2: Load the dataset
# Read the listings CSV; the path is relative, so it is resolved against the
# kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='Airbnb_data - airbnb_data.csv')
# Report the (rows, columns) tuple as a quick sanity check on the load.
print("Initial shape:", df.shape)
# Leave the first five rows as the cell's displayed output.
df.head(n=5)

In [None]:
# Step 3: Data Exploration
# Each label begins and ends with an explicit "\n" escape so the heading is
# preceded by a blank line and the pandas output starts on the following line.
# (The line breaks must be escapes: a raw newline inside a "..." literal is a
# syntax error.)

# Number of null entries in each column.
print("\nMissing Values:\n", df.isnull().sum())
# dtype of each column as loaded.
print("\nData Types:\n", df.dtypes)
# Summary statistics as reported by DataFrame.describe().
print("\nDescriptive Stats:\n", df.describe())

In [None]:
# Step 4: Handle Missing Values
# Rows without a target value cannot be used for training or evaluation.
df = df.dropna(subset=['price'])  # Drop rows where target is missing
# Forward fill the remaining gaps. DataFrame.ffill() replaces the deprecated
# fillna(method='ffill') spelling, and rebinding `df` (instead of inplace=True)
# keeps the cell free of in-place mutation.
# NOTE(review): forward fill cannot fill a gap in the very first row(s) of a
# column, so NaNs may remain there — confirm with df.isnull().sum() if needed.
df = df.ffill()  # Forward fill others

In [None]:
# Step 5: Feature Selection
# Remove identifier-like / free-text columns. The column names are assumed
# rather than checked: errors='ignore' makes any that are absent a no-op.
df = df.drop(
    ['id', 'name', 'host_name', 'last_review'],
    axis=1,
    errors='ignore',
)

In [None]:
# Step 6: Convert price to numeric (if needed)
# Strip every '$' and ',' character from the price values, then cast to float.
# The pattern is a raw string so that the backslash in `\$` reaches the regex
# engine unchanged; in a normal string literal `\$` is an invalid escape
# sequence, which Python warns about. The regex text itself is unchanged.
df['price'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float)

In [None]:
# Step 7: One-hot Encoding for categorical columns
# Treat every object-dtype column as categorical.
categorical_cols = df.select_dtypes(include=['object']).columns
# Expand those columns into indicator columns; drop_first=True omits the first
# level of each so the indicators are not perfectly collinear.
encoded_df = pd.get_dummies(
    data=df,
    columns=categorical_cols,
    drop_first=True,
)

In [None]:
# Step 8: Train-test Split
# Feature matrix: every column of the encoded frame except 'price'.
X = encoded_df.drop(columns=['price'])
# Target vector: the 'price' column.
y = encoded_df['price']
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Step 9: Model Training - Linear Regression
# fit() returns the estimator itself, so construction and fitting are chained.
lr = LinearRegression().fit(X_train, y_train)
# Predictions on the held-out rows, used by the evaluation step.
y_pred_lr = lr.predict(X_test)

In [None]:
# Step 10: Model Evaluation - Linear Regression
# Heading, preceded by a blank line. The line break is written as the "\n"
# escape; a raw newline inside the string literal is a syntax error.
print("\nLinear Regression:")
# Mean absolute error between the true and predicted test-set prices.
print("MAE:", mean_absolute_error(y_test, y_pred_lr))
# Root of the mean squared error, i.e. error in the same units as the target.
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_lr)))
# Coefficient of determination on the test set.
print("R2 Score:", r2_score(y_test, y_pred_lr))

In [None]:
# Step 11: Model Training - Random Forest
# Seeded forest so repeated runs build the same trees; fit() returns the
# estimator itself, so construction and fitting are chained.
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)
# Predictions on the held-out rows, used by the evaluation step.
y_pred_rf = rf.predict(X_test)

In [None]:
# Step 12: Model Evaluation - Random Forest
# Heading, preceded by a blank line. The line break is written as the "\n"
# escape; a raw newline inside the string literal is a syntax error.
print("\nRandom Forest:")
# Mean absolute error between the true and predicted test-set prices.
print("MAE:", mean_absolute_error(y_test, y_pred_rf))
# Root of the mean squared error, i.e. error in the same units as the target.
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_rf)))
# Coefficient of determination on the test set.
print("R2 Score:", r2_score(y_test, y_pred_rf))

In [None]:
# Step 13: Feature Importance
# Label each of the fitted forest's importance scores with its feature name.
importances = pd.Series(data=rf.feature_importances_, index=X.columns)
# Horizontal bar chart of the ten highest-scoring features.
importances.nlargest(n=10).plot.barh()
plt.title('Top 10 Important Features')
plt.show()
