In [135]:
import pandas as pd
import numpy as np

In [136]:
# Absolute location of the housing workbook on Google Drive as seen from Colab.
# NOTE(review): assumes Drive is already mounted at /content/drive — no mount
# call is visible in this notebook; confirm before a fresh "Run All".
path = "/content/drive/MyDrive/Data Science/Machine Learning A-Z™ AI, Python & R + ChatGPT Bonus [2023]/2. Machine Learning A-Z (Model Selection)/Regression/1. Boston House Price Prediction/1553768847_housing.xlsx"

In [137]:
# Read the workbook at `path` into the working DataFrame.
dataset = pd.read_excel(io=path)

In [138]:
# Preview the first five rows of the raw data.
dataset.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,NEAR BAY,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,NEAR BAY,342200


In [139]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
dataset.shape

(20640, 10)

In [140]:
# Data type of each column; 'ocean_proximity' is the only non-numeric one.
dataset.dtypes

longitude             float64
latitude              float64
housing_median_age      int64
total_rooms             int64
total_bedrooms        float64
population              int64
households              int64
median_income         float64
ocean_proximity        object
median_house_value      int64
dtype: object

In [141]:
# Count missing values per column.
dataset.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
ocean_proximity         0
median_house_value      0
dtype: int64

In [142]:
# Print a summary of the frame: index range, per-column non-null counts and
# dtypes, and memory usage.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  int64  
 3   total_rooms         20640 non-null  int64  
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  int64  
 6   households          20640 non-null  int64  
 7   median_income       20640 non-null  float64
 8   ocean_proximity     20640 non-null  object 
 9   median_house_value  20640 non-null  int64  
dtypes: float64(4), int64(5), object(1)
memory usage: 1.6+ MB


In [143]:
# Labels of the object-dtype (text) columns; these are the categorical
# features that need encoding before modelling.
Column_Object = dataset.select_dtypes(include=['object']).columns
Column_Object

Index(['ocean_proximity'], dtype='object')

In [144]:
# Number of distinct values in each column.
dataset.nunique()

longitude               844
latitude                862
housing_median_age       52
total_rooms            5926
total_bedrooms         1923
population             3888
households             1815
median_income         12928
ocean_proximity           5
median_house_value     3842
dtype: int64

In [145]:
# Show how often each category occurs, one categorical column at a time.
for column_name in Column_Object:
    category_counts = dataset[column_name].value_counts()
    print(category_counts)

ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: count, dtype: int64


In [146]:
# Column labels of the frame.
dataset.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity', 'median_house_value'],
      dtype='object')

In [147]:
#



## **Handle Missing Values**

In [None]:
# Missing values per column before imputation.
dataset.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
ocean_proximity         0
median_house_value      0
dtype: int64

In [None]:
# Impute missing 'total_bedrooms' values with the column median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `dataset['total_bedrooms']`: the in-place form
# mutates an intermediate Series (chained assignment), which pandas warns
# about and which no longer updates `dataset` under Copy-on-Write.
bedrooms_median = dataset['total_bedrooms'].median()
dataset['total_bedrooms'] = dataset['total_bedrooms'].fillna(bedrooms_median)

In [None]:
# Missing values per column after imputation; every count should now be 0.
dataset.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
ocean_proximity       0
median_house_value    0
dtype: int64

In [None]:
# Impute missing 'ocean_proximity' values with the most frequent category.
# The null counts shown earlier report 0 missing values for this column, so
# on this data the step changes nothing; it is kept as a safeguard.
# The result is assigned back rather than using fillna(inplace=True) on the
# column selection, which is chained assignment and does not reliably update
# `dataset` (pandas warns about it; it is a no-op under Copy-on-Write).
most_frequent_proximity = dataset['ocean_proximity'].mode()[0]
dataset['ocean_proximity'] = dataset['ocean_proximity'].fillna(most_frequent_proximity)

## **Encode Categorical Features**

**1. Label Encoding followed by One-Hot Encoding:**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Step 1: map each 'ocean_proximity' category to an integer code.
le = LabelEncoder()
proximity_codes = le.fit_transform(dataset["ocean_proximity"])

# Step 2: one-hot encode those integer codes. Because the codes (not the
# original labels) are expanded, the resulting columns are named
# 'ocean_proximity_0' ... 'ocean_proximity_4'; use `le.classes_` to map a
# code back to its category label.
# NOTE(review): this cell is not re-runnable on its own — after it has run,
# 'ocean_proximity' no longer exists in `dataset`.
dataset = pd.get_dummies(
    dataset.assign(ocean_proximity=proximity_codes),
    columns=['ocean_proximity'],
)

## **Feature Scaling**

**1. Standardization:** Scales features to have a mean of 0 and a standard deviation of 1.



In [None]:
from sklearn.preprocessing import StandardScaler
# Standardizing scaler with default settings; it is fit in a later cell.
Scaler = StandardScaler()

In [None]:
# Labels of every numeric column. The one-hot 'ocean_proximity_*' columns are
# boolean and therefore not selected.
# NOTE(review): the selection includes the target 'median_house_value', so the
# scaling cells below rescale the target as well as the predictors — confirm
# that this is intended.
Numeric_Features = dataset.select_dtypes(include="number").columns
Numeric_Features

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value'],
      dtype='object')

In [None]:
# Fit the scaler on the numeric columns and overwrite them with the result.
# NOTE(review): the scaler is fit on the full dataset, before train_test_split
# is called further down, so statistics of the future test rows influence the
# transformation.
standardized_values = Scaler.fit_transform(dataset[Numeric_Features])
dataset[Numeric_Features] = standardized_values

**2. Normalization (Min-Max Scaling):**

Scales features to a range between 0 and 1.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rebind `Scaler` to a min-max scaler and rescale the numeric columns to [0, 1].
# NOTE(review): this cell runs after the standardization cell, so when both are
# executed the min-max scaler is fit on already standardized values; it is also
# fit on the full dataset, before the train/test split.
Scaler = MinMaxScaler()
minmax_values = Scaler.fit_transform(dataset[Numeric_Features])
dataset[Numeric_Features] = minmax_values


In [None]:
# Preview the first five rows after encoding and scaling.
dataset.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_0,ocean_proximity_1,ocean_proximity_2,ocean_proximity_3,ocean_proximity_4
0,0.211155,0.567481,0.784314,0.022331,0.019863,0.008941,0.020556,0.539668,0.902266,False,False,False,True,False
1,0.212151,0.565356,0.392157,0.180503,0.171477,0.06721,0.186976,0.538027,0.708247,False,False,False,True,False
2,0.210159,0.564293,1.0,0.03726,0.02933,0.013818,0.028943,0.466028,0.695051,False,False,False,True,False
3,0.209163,0.564293,1.0,0.032352,0.036313,0.015555,0.035849,0.354699,0.672783,False,False,False,True,False
4,0.209163,0.564293,1.0,0.04133,0.043296,0.015752,0.042427,0.230776,0.674638,False,False,False,True,False


## **Splitting the Dataset**

In [None]:
from sklearn.model_selection import train_test_split

# Target is the (already scaled) median house value; every other column is a
# predictor.
y = dataset['median_house_value']
X = dataset.drop(columns='median_house_value')

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## **Model Selection**

**1. Linear Regression:** A simple and widely used linear model.

In [180]:
from sklearn.linear_model import LinearRegression
# Candidate model: ordinary least-squares regression with default settings.
# `model` is rebound by each of the following model cells.
model = LinearRegression()


**2. Decision Tree Regressor:**

In [167]:
from sklearn.tree import DecisionTreeRegressor

# Candidate model: a single regression tree.
# The seed is fixed so repeated runs build the same tree; 42 matches the
# random_state already used for train_test_split in this notebook.
model = DecisionTreeRegressor(random_state=42)

**3. Random Forest Regressor:**

In [186]:
from sklearn.ensemble import RandomForestRegressor

# Candidate model: random forest of regression trees.
# The seed is fixed so the bootstrap samples (and therefore the reported
# metrics) are reproducible; 42 matches the random_state already used for
# train_test_split in this notebook.
model = RandomForestRegressor(random_state=42)

**4. Support Vector Regressor (SVR):**

In [159]:
from sklearn.svm import SVR
# Candidate model: support vector regressor with default settings.
model = SVR()

**5. Gradient Boosting Regressor:**

In [174]:
from sklearn.ensemble import GradientBoostingRegressor

# Candidate model: gradient-boosted regression trees.
# The seed is fixed so repeated runs give the same ensemble; 42 matches the
# random_state already used for train_test_split in this notebook.
model = GradientBoostingRegressor(random_state=42)

**6. XGBoost Regressor:**

In [192]:
# Import the XGBoost regressor (scikit-learn style estimator)
from xgboost import XGBRegressor

# Create an XGBoost regressor with default hyperparameters.
# This is the last cell that rebinds `model`, so on a top-to-bottom run it is
# the estimator that the training cell below fits.
model = XGBRegressor()


## **Model Training:**

In [193]:
# Fit whichever estimator `model` is currently bound to. Every cell in the
# Model Selection section rebinds `model`, so only the most recently executed
# one is trained here.
model.fit(X_train, y_train)

## **Model Evaluation:**

In [194]:
from sklearn.metrics import mean_squared_error

# Predict on the held-out rows and score with mean squared error.
# The target was rescaled together with the other numeric columns, so this
# value is in scaled units rather than in the original house-value units.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.009637417969697096


In [195]:
# Root mean squared error: the square root of the MSE computed in the previous
# cell, so it is expressed in the same (scaled) units as the target.
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 0.09817035178554213


In [196]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions on the held-out rows.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R-squared:", r2)

R-squared: 0.8270020752151273


In [199]:
# Quick look at the first five predictions next to the true values.
print("y_pred (first 5):", y_pred[:5])
print("y_test (first 5):", y_test.to_numpy()[:5])

# Side-by-side table of actual vs. predicted values, indexed by the original
# row labels carried by y_test.
comparison_df = y_test.to_frame(name='Actual').assign(Predicted=y_pred)
display(comparison_df.head(10))

y_pred (first 5): [0.09042429 0.13354003 0.97080535 0.43221605 0.55613065]
y_test (first 5): [0.06742446 0.06350695 1.         0.41979415 0.54226787]


Unnamed: 0,Actual,Predicted
20046,0.067424,0.090424
3024,0.063507,0.13354
15663,1.0,0.970805
20484,0.419794,0.432216
9814,0.542268,0.556131
13311,0.296289,0.377745
7113,0.377732,0.576788
7668,0.293815,0.3753
18246,0.670102,0.584153
5723,0.889895,0.990579
