In [2]:
import sklearn 
from sklearn.linear_model import LinearRegression 
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline 
from sklearn.metrics import mean_squared_error
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
from datetime import datetime

 
SEED = 42

%matplotlib inline 

In [3]:
df = pd.read_csv("/kaggle/input/us-house-price-dataset/cleaned_dataset.csv")
df.head()

Unnamed: 0,Building Size,County,Land Size,Price,Year Built,bathroom(s),bedroom(s),livingroom(s)
0,149.85,Chatham County,1173.59,279900,1997,2,3,0
1,284.56,Stanislaus,2221.5,745000,2025,5,4,1
2,115.94,Monroe,2387.65,225000,1964,2,3,0
3,309.74,Wake,1214.06,724900,2014,4,5,0
4,165.0,Grant,809.37,247900,1970,3,3,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1582 entries, 0 to 1581
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Building Size  1582 non-null   float64
 1   County         1582 non-null   object 
 2   Land Size      1582 non-null   float64
 3   Price          1582 non-null   int64  
 4   Year Built     1582 non-null   int64  
 5   bathroom(s)    1582 non-null   int64  
 6   bedroom(s)     1582 non-null   int64  
 7   livingroom(s)  1582 non-null   int64  
dtypes: float64(2), int64(5), object(1)
memory usage: 99.0+ KB


# Feature engineering : 

In [5]:
df["Rooms"] = df["bathroom(s)"] + df["bedroom(s)"] + df["livingroom(s)"]
df.head(2)

Unnamed: 0,Building Size,County,Land Size,Price,Year Built,bathroom(s),bedroom(s),livingroom(s),Rooms
0,149.85,Chatham County,1173.59,279900,1997,2,3,0,5
1,284.56,Stanislaus,2221.5,745000,2025,5,4,1,10


In [6]:
current_year = int(datetime.now().strftime('%Y'))

df["House Age"] = np.where(
    current_year - df["Year Built"] > 0,
    current_year - df["Year Built"], 
    0
)

df.head(2)

Unnamed: 0,Building Size,County,Land Size,Price,Year Built,bathroom(s),bedroom(s),livingroom(s),Rooms,House Age
0,149.85,Chatham County,1173.59,279900,1997,2,3,0,5,28
1,284.56,Stanislaus,2221.5,745000,2025,5,4,1,10,0


In [7]:
ohe = OneHotEncoder()
county_col = df["County"].values
county_col = county_col.reshape((-1,1))
county_col_encoded = ohe.fit_transform(county_col).toarray()
county_encoded_df = pd.DataFrame(county_col_encoded)
county_encoded_df.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,553,554,555,556,557,558,559,560,561,562
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
df = pd.concat([df, county_encoded_df], axis=1)
df.drop("County", axis=1, inplace=True)
df.head(2)

Unnamed: 0,Building Size,Land Size,Price,Year Built,bathroom(s),bedroom(s),livingroom(s),Rooms,House Age,0,...,553,554,555,556,557,558,559,560,561,562
0,149.85,1173.59,279900,1997,2,3,0,5,28,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,284.56,2221.5,745000,2025,5,4,1,10,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
df.to_csv("new_features_added.csv")

# Base Model : 
- I'm gonna use Linear Regression as the base model 

In [10]:
lreg = LinearRegression()

In [11]:
df.columns = df.columns.astype("str")

In [12]:
X, y = df.drop("Price", axis=1), df["Price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)
print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)
print("y_train shape: ", y_train.shape)
print("y_test shape: ", y_test.shape)

X_train shape:  (1265, 571)
X_test shape:  (317, 571)
y_train shape:  (1265,)
y_test shape:  (317,)


In [13]:
lreg.fit(X_train, y_train)

train_score = lreg.score(X_train, y_train)
test_score = lreg.score(X_test, y_test)

print("Train Score: ", train_score)
print("Test Score: ", test_score)

Train Score:  0.5304578620083378
Test Score:  -54120415531.42872


In [14]:
preds = lreg.predict(X_test.iloc[:2, ])
real_vals = y_test.iloc[:2]

print("Predicted Values: ", preds)
print("-----------------")
print("Real Values: ", real_vals.values)

Predicted Values:  [-360197.04692602  519720.58037567]
-----------------
Real Values:  [299900 550000]


- It's obvious that the model is Underfitting : Data is too complexe to be captured by a simple linear regression model.

- Next I'm gonna try a polynomial regression model 

In [15]:
degree = 3

numeric_features = ["Building Size", "Land Size", "Year Built", "bathroom(s)", 
                    "bedroom(s)", "livingroom(s)", "Rooms", "House Age"]
categorical_features = [col for col in df.columns if not(col in numeric_features) and col != "Price"]



preprocessor = ColumnTransformer([
    ("poly", Pipeline([
     ("scaler", StandardScaler()),
     ("poly_features", PolynomialFeatures(degree=degree, include_bias=False))
    ]), numeric_features),
    ("cat", "passthrough", categorical_features)
])


pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression())
])

pipeline.fit(X_train, y_train)

train_score = pipeline.score(X_train, y_train)
test_score = pipeline.score(X_test, y_test)


print("Train R2: ", train_score)
print("Test R2: ", test_score)


Train R2:  0.8872602134428724
Test R2:  -451953.7172040948


- So we are dealing with an extreme overfitting case

- Let's how a tree base model with perform 

## Experimenting with tree based models : 

In [16]:
def rmse(y_true, y_pred): 
    assert len(y_true) == len(y_pred)
    n = float(len(y_true))
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse / n)
    return rmse

In [25]:
def grid_search(estimator, params):
    
    scaler = ColumnTransformer(
        transformers=[("numeric", StandardScaler(), numeric_features)],
        remainder = "passthrough"
    )
    
    search = GridSearchCV(
        estimator,
        param_grid=params,
        cv=5,
        scoring="r2",
        n_jobs=-1
    )

    pipeline = Pipeline([
        ("scaler", scaler),
        ("regressor", search)
    ])
    
    pipeline.fit(X_train, y_train)
    
    train_r2 = pipeline.score(X_train, y_train)
    test_r2 = pipeline.score(X_test, y_test)
    
    print("Train r2: ", train_r2)
    print("Test r2: ", test_r2)
    return pipeline
    


In [26]:
params = {"n_estimators": [100],
          "max_depth": range(5, 25),
          "min_samples_split": [2, 5, 10, 15],
          "min_samples_leaf": [1, 2, 5, 10]}

pipeline = grid_search(RandomForestRegressor(), params)

KeyboardInterrupt: 

In [22]:
params = {"max_depth": range(2, 40),
          "min_samples_split": [2, 5, 10, 15],
          "min_samples_leaf": [1, 2, 5, 10]}
estimator = DecisionTreeRegressor()


scaer = ColumnTransformer([
    ("numeric", StandardScaler(), numeric_features),
    ("cat", "passthrough")
])

search = GridSearchCV(
    estimator,
    param_grid=params,
    cv=5,
    scoring="r2",
    n_jobs=-1
)

search.fit(X_train, y_train)

train_r2 = search.score(X_train, y_train)
test_r2 = search.score(X_test, y_test)

print("Train r2: ", train_r2)
print("Test r2: ", test_r2)

print("Best parameters: ", search.best_params_)

Train r2:  0.6911414847904478
Test r2:  -0.25361983681989564
Best parameters:  {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2}


In [27]:
# Save the base model :
import pickle

filename = "model.pkl"

with open(filename, 'wb') as f: 
    pickle.dump(pipeline, f)

print(f"Model saved to {filename}")

with open(filename, 'rb') as f: 
    model = pickle.load(f)

print(f"Model saved succefully {model}")

Model saved to model.pkl
Model saved succefully Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('poly',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler()),
                                                                  ('poly_features',
                                                                   PolynomialFeatures(degree=3,
                                                                                      include_bias=False))]),
                                                  ['Building Size', 'Land Size',
                                                   'Year Built', 'bathroom(s)',
                                                   'bedroom(s)',
                                                   'livingroom(s)', 'Rooms',
                                                   'House Age']),
                                       