In [227]:
import pandas as pd
import numpy as np
from scipy import stats
from ydata_profiling import ProfileReport
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
import plotly.graph_objects as go
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder


DATASET: House Sales in King County, USA
The goal is to find the best regression model for this dataset.
I started with linear regression, then tried decision tree and random forest regression, and finished with k-Nearest Neighbors (k-NN).


In [228]:

# Load the house-sales CSV into the raw DataFrame `df`.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook only runs where this exact file location exists.
df = pd.read_csv(r'C:\Users\Hp\Desktop\dataset\Machine learning\streamlit machine learning checkpoint\kc_house_data.csv')

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Column dtypes and non-null counts of the raw data.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Number of fully duplicated rows.
df.duplicated().sum()

In [None]:
# Profiling report (ydata-profiling) of the raw data; the bare name on the
# last line renders it as the cell output.
report_1 = ProfileReport(df, title = 'KC House')
report_1

In [235]:
# Work on a copy so the raw frame `df` is not modified by the column drops
# applied to `df_encoded` in later cells.
df_encoded = df.copy()

In [236]:
# Columns removed from the working frame before modelling.
columns_to_drop = [
    'sqft_lot',
    'waterfront',
    'view',
    'sqft_basement',
    'yr_renovated',
    'sqft_lot15',
]
df_encoded = df_encoded.drop(columns=columns_to_drop)

In [None]:
# Profiling report of the reduced frame (after the column drop above).
report_2 = ProfileReport(df_encoded, title = 'KC House')
report_2

In [None]:
def Plots(df, n_cols=2):
    """Draw one box plot per column of ``df`` on a grid of subplots.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted, one box trace per column, filled
        row by row.
    n_cols : int, default 2
        Number of subplot columns in the grid.

    Notes
    -----
    The number of grid rows is derived from the number of columns in ``df``.
    A fixed row count (previously 11) makes ``add_trace`` fail as soon as the
    frame has more columns than the grid has cells.
    """
    columns = list(df.columns)
    # Ceiling division; make_subplots needs at least one row even for an
    # empty frame.
    n_rows = max(1, -(-len(columns) // n_cols))
    figure = make_subplots(rows=n_rows, cols=n_cols)
    for i, column in enumerate(columns):
        # Zero-based grid position -> plotly's one-based (row, col).
        row, col = divmod(i, n_cols)
        figure.add_trace(go.Box(y=df[column], name=column), row=row + 1, col=col + 1)
    # Adjust height to fit all plots
    figure.update_layout(width=1000, height=1500, showlegend=True)
    figure.show()

# Box plot for every column currently in df_encoded
# (presumably to eyeball outliers — confirm).
Plots(df_encoded)

In [None]:
# Display the current state of the working frame.
df_encoded


In [240]:
# Remove the record identifier and the bedroom count from the working frame.
dropped_columns = ['id', 'bedrooms']
df_encoded = df_encoded.drop(columns=dropped_columns)

In [241]:
# Keep only the first four characters of the raw date string
# (presumably the year — confirm against the data) as `date_only`,
# and discard the original `date` column.
date_prefix = df_encoded['date'].str[:4]
df_encoded = df_encoded.drop(columns=['date']).assign(date_only=date_prefix)

In [None]:
# Convert the extracted date prefix from string to a numeric dtype.
# errors='coerce' turns unparsable values into NaN instead of raising.
df_encoded['date_only'] = pd.to_numeric(df_encoded['date_only'], errors='coerce')
# Check the resulting dtypes and non-null counts.
df_encoded.info()

In [None]:
# Standardise every column: z-score per feature.
df_z_scores = df_encoded.apply(stats.zscore)
# A cell is an outlier when it lies more than 3 standard deviations from
# its column mean (|z| > 3).
outliers = df_z_scores.abs() > 3
# Rows that contain at least one outlier cell.
rows_with_outlier = outliers.any(axis=1)
# Outlier rows are kept aside; the remaining rows form the cleaned frame.
df_outliers = df_z_scores[rows_with_outlier]
df_cleaned = df_z_scores[~rows_with_outlier]
# NOTE(review): both frames are sliced from df_z_scores, so df_cleaned holds
# standardized values (including `price`), not the original units — confirm
# this is intended.

df_cleaned.info()

In [None]:
# Annotated heatmap of the pairwise correlations in df_encoded
# (the frame before outlier rows are removed).
correlation_stars = sns.heatmap(df_encoded.corr(), annot=True, cmap='coolwarm')
correlation_stars

In [None]:
# Profiling report of the outlier-filtered frame.
report_stars_3 = ProfileReport(df_cleaned, title = 'KC House')
report_stars_3

In [None]:
# Pearson correlation between every pair of columns in the cleaned frame.
correlation_matrix = df_cleaned.corr(method='pearson') 
correlation_matrix

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Annotated heatmap of correlation_matrix; center=0 places the colormap
# midpoint at zero correlation.
plt.figure(figsize=(10, 8)) 
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0) 
plt.title('Correlation Matrix Heatmap') 

In [None]:
# Remove the two living-area columns from the cleaned frame
# (presumably because of the correlations shown in the heatmap — confirm).
# df_cleaned is a boolean-mask slice of df_z_scores, so dropping with
# inplace=True triggers pandas' SettingWithCopyWarning; rebinding the result
# of a non-inplace drop avoids mutating the slice.
df_cleaned = df_cleaned.drop(columns=['sqft_living', 'sqft_living15'])

In [None]:
# Recompute the Pearson correlations on the current df_cleaned and redraw
# the annotated heatmap (center=0 places the colormap midpoint at zero).
correlation_matrix = df_cleaned.corr(method='pearson') 
%matplotlib inline
plt.figure(figsize=(10, 8)) 
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0) 
plt.title('Correlation Matrix Heatmap') 

**Model selection**

In [250]:
# Separate the target column from the features, then hold out 30% of the
# rows as a test set (seeded so the split is reproducible).
target_column = 'price'
y = df_cleaned[target_column]
X = df_cleaned.drop(columns=[target_column])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Linear regression: the first of the four models compared.
# Fit on the training split only; the test split is reserved for scoring.
model = LinearRegression()
model.fit(X_train, y_train)

In [252]:
# Linear-regression predictions for the held-out test features.
y_pred = model.predict(X_test)

In [None]:
# Test-set MSE and R² for the linear-regression predictions (y_pred).
print("Mean Squared Error:", mean_squared_error(y_test, y_pred)) 
print("R-squared Score:", r2_score(y_test, y_pred))

**Decision Tree Regression**

In [None]:
# Decision tree regressor using the Friedman MSE split criterion;
# max_depth=100 caps the tree depth and random_state=42 fixes the seed.
tree = DecisionTreeRegressor(criterion='friedman_mse', max_depth=100, random_state=42) 
tree.fit(X_train, y_train)

In [255]:
# Decision-tree predictions for the held-out test features.
y_pred_dtr = tree.predict(X_test)

In [None]:
# Test-set MSE and R² for the decision-tree predictions (y_pred_dtr).
print("Mean Squared Error:", mean_squared_error(y_test, y_pred_dtr)) 
print("R-squared Score:", r2_score(y_test, y_pred_dtr))

In [None]:
# Random forest with 1000 trees (n_estimators), squared-error split
# criterion, and the same depth cap and seed as the decision tree.
rf = RandomForestRegressor(n_estimators=1000, criterion='squared_error', max_depth=100, random_state=42) 
rf.fit(X_train, y_train)

In [258]:
# Random-forest predictions for the held-out test features.
y_pred_rf = rf.predict(X_test) 

In [None]:
# Test-set MSE and R² for the random-forest predictions (y_pred_rf).
print("Random Forest Mean Squared Error:", mean_squared_error(y_test, y_pred_rf)) 
print("Random Forest R-squared Score:", r2_score(y_test, y_pred_rf))

**k-Nearest Neighbors (k-NN)**

In [None]:

# k-NN regressor that predicts from the 5 nearest training points
# (n_neighbors=5), fitted on the training split.
knn = KNeighborsRegressor(n_neighbors=5) 
knn.fit(X_train, y_train)

In [261]:
# k-NN predictions for the held-out test features.
y_pred_knn = knn.predict(X_test)

In [None]:
# Test-set MSE and R² for the k-NN model.
# These must be computed from y_pred_knn: y_pred holds the linear-regression
# predictions, so scoring it here would only repeat the linear model's
# metrics under the k-NN heading.
mse_knn = mean_squared_error(y_test, y_pred_knn)
r2_knn = r2_score(y_test, y_pred_knn)

print("Mean Squared Error:", mse_knn)
print("R-squared Score:", r2_knn)


**Scatter plot Actual and Predicted Values**

In [None]:
# Random-forest predictions against the true test targets. Points on the
# dashed red diagonal are perfect predictions.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_pred_rf, alpha=0.5, ax=ax)
target_range = [y_test.min(), y_test.max()]
ax.plot(target_range, target_range, 'r--', lw=2)
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.set_title('Actual vs. Predicted Values')
plt.show()

In [None]:
# Histogram (with a KDE overlay) of the random-forest residuals,
# i.e. true test target minus prediction.
residuals = y_test - y_pred_rf
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(residuals, kde=True, ax=ax)
ax.set_xlabel('Residuals')
ax.set_title('Distribution of Residuals')
plt.show()

**The residuals are approximately normally distributed**

**The model fits the data well**