# In [None]:
# Step 1: Importing Required Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import Lasso, LinearRegression, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Step 2: Load the Dataset
# NOTE: https://www.kaggle.com/anthonypino/melbourne-housing-market is the
# dataset's web page, not a CSV file, so pd.read_csv cannot parse it directly.
# Download the dataset from that page and point DATA_PATH at the extracted CSV.
# NOTE(review): the file name below is assumed from the columns used later in
# this script (Bedroom2, BuildingArea, CouncilArea, ...) - confirm it matches
# the file in your download.
DATA_PATH = 'Melbourne_housing_FULL.csv'
data = pd.read_csv(DATA_PATH)

# Step 3: Exploratory Data Analysis
# Preview the first rows to check that the file was parsed as expected
print(data.head())

# Report the dimensions: (rows, columns), the column count and the cell count
print("Shape:", data.shape)
print("Number of Columns:", data.shape[1])
print("Size:", data.size)

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data.info()

# Step 4: Data Preprocessing
# Drop exact duplicate rows
data = data.drop_duplicates()

# Remove outliers in the area columns with the z-score method: keep only rows
# whose Landsize and BuildingArea both lie within 3 standard deviations of the
# column mean. Taking the absolute value makes the test two-sided, so values
# far below the mean are treated as outliers as well as values far above it.
# A missing measurement yields a NaN z-score, which compares False, so rows
# lacking either measurement are removed by this filter too.
area_columns = ['Landsize', 'BuildingArea']
areas = data[area_columns]
z_scores = (areas - areas.mean()) / areas.std()
# .copy() makes the filtered frame independent of its parent so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
data = data[(z_scores.abs() < 3).all(axis=1)].copy()

# Fill missing values in the listed columns with each column's most frequent value
categorical_columns = ['Bathroom', 'Car']
for column in categorical_columns:
    modes = data[column].mode()
    # mode() is empty when the column has no non-missing values; in that case
    # there is nothing to fill with, so the column is left untouched.
    if not modes.empty:
        data[column] = data[column].fillna(modes.iloc[0])

# Step 5: Data Visualization
# Numeric features that are plotted against the sale price below
numerical_columns = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'YearBuilt']
plot_frame = data[numerical_columns + ['Price']]

# Price distribution: histogram with a kernel density estimate overlaid
plt.figure(figsize=(10, 6))
price_ax = sns.histplot(data['Price'], kde=True)
price_ax.set(xlabel='Price', ylabel='Count', title='Price Distribution')
plt.show()

# Pairwise plots of every numeric feature and the price against each other
sns.pairplot(plot_frame)
plt.show()

# Annotated heatmap of the pairwise correlation coefficients
plt.figure(figsize=(12, 8))
corr_ax = sns.heatmap(plot_frame.corr(), annot=True, cmap='coolwarm')
corr_ax.set_title('Correlation Matrix')
plt.show()

# Encoding categorical variables
# Each column gets its own LabelEncoder, stored in `label_encoders`, so the
# integer codes can later be mapped back to the original labels with
# label_encoders[column].inverse_transform(...). Refitting one shared encoder
# would overwrite the learned classes of every column except the last one.
label_encoders = {}
for label_column in ['Method', 'Type', 'Regionname', 'CouncilArea']:
    encoder = LabelEncoder()
    data[label_column] = encoder.fit_transform(data[label_column])
    label_encoders[label_column] = encoder

# Step 6: Model Selection
# Most of the regressors compared below raise on text columns or missing
# values. Restrict the modelling frame to numeric columns (this drops any text
# column that was not label-encoded) and discard rows that still contain a
# missing value in any remaining column, including a missing target Price.
model_data = data.select_dtypes(include='number').dropna()

# Splitting the data into train (80%) and test (20%) sets with a fixed seed
X = model_data.drop('Price', axis=1)
y = model_data['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initializing models
# Candidate regressors as (label, estimator) pairs, in the order they are
# trained and reported.
named_estimators = [
    ('Lasso', Lasso()),
    ('LinearRegression', LinearRegression()),
    ('Ridge', Ridge()),
    ('ElasticNet', ElasticNet()),
    ('KNeighborsRegressor', KNeighborsRegressor()),
    ('RandomForestRegressor', RandomForestRegressor()),
    ('GradientBoostingRegressor', GradientBoostingRegressor()),
    ('AdaBoostRegressor', AdaBoostRegressor(n_estimators=5, learning_rate=1.2, loss='exponential', random_state=2)),
]

# Every estimator is wrapped in a one-key dict, {'model': estimator}, keyed by
# its label.
models = {label: {'model': estimator} for label, estimator in named_estimators}

# Training and evaluating models
# Fit each candidate on the training split, predict the test split and record
# the error metrics under the candidate's label.
results = {}
for name, spec in models.items():
    estimator = spec['model']
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)

    # RMSE is the square root of the MSE, so the MSE is computed once and reused
    mse = mean_squared_error(y_test, y_pred)
    results[name] = {
        'MAE': mean_absolute_error(y_test, y_pred),
        'MSE': mse,
        'RMSE': np.sqrt(mse),
        'R2': r2_score(y_test, y_pred)
    }

# Step 7: Model Evaluation
# Print one section per model: its label, then each metric on its own line,
# followed by a blank line separating it from the next model.
for name, metrics in results.items():
    print(f"{name}:")
    for metric_name in ('MAE', 'MSE', 'RMSE', 'R2'):
        print(f"{metric_name}:", metrics[metric_name])
    print()