In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training set, the test set and the sample submission file from ../data.
train_data = pd.read_csv('../data/train.csv')
test_data=pd.read_csv('../data/test.csv')
sample_submission = pd.read_csv('../data/sample_submission.csv')


In [3]:
# Work on a copy so the frame read from disk (`train_data`) is left untouched.
df = train_data.copy()
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Keep only the hand-picked predictors plus the regression target.
useful_features = ['Foundation', 'KitchenQual', 'TotRmsAbvGrd', 'WoodDeckSF', 'YrSold', '1stFlrSF']
label_col = 'SalePrice'
df = df[useful_features + [label_col]]
df.head()

Unnamed: 0,Foundation,KitchenQual,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,SalePrice
0,PConc,Gd,8,0,2008,856,208500
1,CBlock,TA,6,298,2007,1262,181500
2,PConc,Gd,6,0,2008,920,223500
3,BrkTil,Gd,7,0,2006,961,140000
4,PConc,Gd,9,192,2008,1145,250000


In [5]:
# `drop_duplicates` returns a new frame rather than modifying `df`;
# the result has to be assigned back, otherwise the call has no effect.
df = df.drop_duplicates()
df.tail()

Unnamed: 0,Foundation,KitchenQual,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,SalePrice
1455,PConc,TA,7,0,2007,953,175000
1456,CBlock,TA,7,349,2010,2073,210000
1457,Stone,Gd,9,0,2010,1188,266500
1458,CBlock,Gd,5,366,2010,1078,142125
1459,CBlock,TA,6,736,2008,1256,147500


In [6]:
# Renumber the rows 0..n-1 and discard the old labels, so that the frames
# rebuilt from NumPy arrays later on can be joined back on the index.
df = df.reset_index(drop=True)
df.tail()

Unnamed: 0,Foundation,KitchenQual,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,SalePrice
1455,PConc,TA,7,0,2007,953,175000
1456,CBlock,TA,7,349,2010,2073,210000
1457,Stone,Gd,9,0,2010,1188,266500
1458,CBlock,Gd,5,366,2010,1078,142125
1459,CBlock,TA,6,736,2008,1256,147500


## Scale continuous features

In [7]:
# Among the selected predictors, the numeric-dtype columns are the ones to scale.
continuous_columns = df[useful_features].select_dtypes(include='number').columns
continuous_columns

Index(['TotRmsAbvGrd', 'WoodDeckSF', 'YrSold', '1stFlrSF'], dtype='object')

In [8]:
from sklearn.preprocessing import StandardScaler

# NOTE(review): the scaler is fitted on the whole frame, before the train/test
# split performed further down, so held-out rows contribute to the scaling statistics.
scaler = StandardScaler()
scaler.fit(df[continuous_columns])

In [9]:
# Apply the fitted scaler; the result is a plain NumPy array, so column labels are lost.
scaled_columns = scaler.transform(df[continuous_columns])
scaled_columns

array([[ 0.91220977, -0.75217584,  0.13877749, -0.79343379],
       [-0.31868327,  1.62619479, -0.61443862,  0.25714043],
       [-0.31868327, -0.75217584,  0.13877749, -0.62782603],
       ...,
       [ 1.52765629, -0.75217584,  1.64520971,  0.06565646],
       [-0.93412978,  2.16891024,  1.64520971, -0.21898188],
       [-0.31868327,  5.12192075,  0.13877749,  0.2416147 ]])

In [10]:
# Wrap the scaled array back into a DataFrame with the original column names.
# No index is passed, so the frame gets a default 0..n-1 index.
continuous_features_df = pd.DataFrame(data=scaled_columns, columns=continuous_columns)
continuous_features_df.head()

Unnamed: 0,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF
0,0.91221,-0.752176,0.138777,-0.793434
1,-0.318683,1.626195,-0.614439,0.25714
2,-0.318683,-0.752176,0.138777,-0.627826
3,0.296763,-0.752176,-1.367655,-0.521734
4,1.527656,0.780197,0.138777,-0.045611


## Categorical features

In [11]:
# The object-dtype predictors are treated as categorical.
categorical_columns = df[useful_features].select_dtypes(include='object').columns
categorical_columns

Index(['Foundation', 'KitchenQual'], dtype='object')

In [12]:
# One-hot encode the categorical predictors: one indicator column per observed category value.
categorical_features_df = pd.get_dummies(df[categorical_columns])
categorical_features_df.head()

Unnamed: 0,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA
0,0,0,1,0,0,0,0,0,1,0
1,0,1,0,0,0,0,0,0,0,1
2,0,0,1,0,0,0,0,0,1,0
3,1,0,0,0,0,0,0,0,1,0
4,0,0,1,0,0,0,0,0,1,0


In [13]:
# Assemble the model-ready frame: scaled numerics + one-hot indicators + target,
# joined on the row index.
final_df = continuous_features_df.join(categorical_features_df).join(df[label_col])
final_df.head()

Unnamed: 0,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,SalePrice
0,0.91221,-0.752176,0.138777,-0.793434,0,0,1,0,0,0,0,0,1,0,208500
1,-0.318683,1.626195,-0.614439,0.25714,0,1,0,0,0,0,0,0,0,1,181500
2,-0.318683,-0.752176,0.138777,-0.627826,0,0,1,0,0,0,0,0,1,0,223500
3,0.296763,-0.752176,-1.367655,-0.521734,1,0,0,0,0,0,0,0,1,0,140000
4,1.527656,0.780197,0.138777,-0.045611,0,0,1,0,0,0,0,0,1,0,250000


# Model training

In [14]:
# Separate the feature matrix from the target vector.
# (The original cell ran this same statement twice; once is enough.)
X = final_df.drop(columns=[label_col])
y = final_df[label_col]

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [16]:
from sklearn.linear_model import LinearRegression

# Linear regression with default settings.
model = LinearRegression()


In [17]:
# Fit the regressor on the training part of the split.
model.fit(X_train, y_train)

# Model evaluation

In [18]:
y_pred = model.predict(X_test)
# Replace negative predictions by 0 before scoring with the log-based metric below.
y_pred[y_pred < 0] = 0

In [19]:
import numpy as np
from sklearn.metrics import mean_squared_log_error

def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    """Return the root mean squared logarithmic error, rounded.

    Args:
        y_test: True target values.
        y_pred: Predicted target values.
        precision: Number of decimal places kept in the result.

    Returns:
        The square root of ``mean_squared_log_error(y_test, y_pred)``,
        rounded to ``precision`` decimal places.
    """
    msle = mean_squared_log_error(y_test, y_pred)
    return round(np.sqrt(msle), precision)
compute_rmsle(y_test, y_pred)

0.22

## Testing dataframe equality

In [20]:
final_df.to_parquet('../data' + 'processed_df.parquet', index=False)

In [21]:
processed_df = pd.read_parquet('../data' + 'processed_df.parquet')
processed_df.head()

Unnamed: 0,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,SalePrice
0,0.91221,-0.752176,0.138777,-0.793434,0,0,1,0,0,0,0,0,1,0,208500
1,-0.318683,1.626195,-0.614439,0.25714,0,1,0,0,0,0,0,0,0,1,181500
2,-0.318683,-0.752176,0.138777,-0.627826,0,0,1,0,0,0,0,0,1,0,223500
3,0.296763,-0.752176,-1.367655,-0.521734,1,0,0,0,0,0,0,0,1,0,140000
4,1.527656,0.780197,0.138777,-0.045611,0,0,1,0,0,0,0,0,1,0,250000


In [22]:
# Positive check: the parquet round trip must give back exactly the frame that was written.
pd.testing.assert_frame_equal(processed_df, final_df)

# Negative check: a frame without the label column has different columns, so the
# comparison always raises. NOTE(review): presumably meant as a demonstration of a
# failing comparison; the expected error is caught so that "Run All" can continue.
try:
    pd.testing.assert_frame_equal(processed_df, final_df.drop(columns=[label_col]))
except AssertionError as err:
    print(f"Frames differ, as expected: {err}")
else:
    raise AssertionError("processed_df unexpectedly equals final_df without the label column")

# Step 1️⃣ : notebook reorganization

In [23]:
from sklearn.preprocessing import OneHotEncoder

In [24]:
# One-hot encode the two categorical predictors with scikit-learn's OneHotEncoder.
cat_cols = ['Foundation', 'KitchenQual']
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_cols = encoder.fit_transform(df[cat_cols])
encoded_df = pd.DataFrame(encoded_cols, columns=encoder.get_feature_names_out(cat_cols))
# Swap the raw categorical columns for their indicator columns.
df_encoded = pd.concat([df.drop(columns=cat_cols), encoded_df], axis=1)
print(df_encoded.head())

   TotRmsAbvGrd  WoodDeckSF  YrSold  1stFlrSF  SalePrice  Foundation_BrkTil  \
0             8           0    2008       856     208500                0.0   
1             6         298    2007      1262     181500                0.0   
2             6           0    2008       920     223500                0.0   
3             7           0    2006       961     140000                1.0   
4             9         192    2008      1145     250000                0.0   

   Foundation_CBlock  Foundation_PConc  Foundation_Slab  Foundation_Stone  \
0                0.0               1.0              0.0               0.0   
1                1.0               0.0              0.0               0.0   
2                0.0               1.0              0.0               0.0   
3                0.0               0.0              0.0               0.0   
4                0.0               1.0              0.0               0.0   

   Foundation_Wood  KitchenQual_Ex  KitchenQual_Fa  KitchenQua

In [25]:
# Re-read the test set from disk.
test_data = pd.read_csv('../data/test.csv')

In [26]:
# Work on a copy so `test_data` itself is left untouched.
df_test=test_data.copy()

In [27]:
cat_cols = ['Foundation', 'KitchenQual']
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Learn the categories from the training frame and encode it.
encoded_cols = encoder.fit_transform(df[cat_cols])
encoded_df = pd.DataFrame(encoded_cols, columns=encoder.get_feature_names_out(cat_cols))
df_encoded = pd.concat([df.drop(columns=cat_cols), encoded_df], axis=1)

# Reuse the already-fitted encoder on the test frame; fitting it again on the
# same training data would give the same encoder.
encoded_cols_test = encoder.transform(df_test[cat_cols])
encoded_df_test = pd.DataFrame(encoded_cols_test, columns=encoder.get_feature_names_out(cat_cols))
df_test_encoded = pd.concat([df_test.drop(columns=cat_cols), encoded_df_test], axis=1)


In [28]:
# Start again from the raw files and select predictors / target by name.
# No y_test is built in this cell.
train_data = pd.read_csv('../data/train.csv')
test_data = pd.read_csv('../data/test.csv')
useful_features = ['Foundation', 'KitchenQual', 'TotRmsAbvGrd', 'WoodDeckSF', 'YrSold', '1stFlrSF']
label_col = 'SalePrice'
X_train = train_data[useful_features]
y_train = train_data[label_col]
X_test = test_data[useful_features]



In [29]:
# Reload the raw training file into `df`, replacing the processed frame built earlier.
df = pd.read_csv('../data/train.csv')

useful_features = ['Foundation', 'KitchenQual', 'TotRmsAbvGrd', 'WoodDeckSF', 'YrSold', '1stFlrSF']
label_col = 'SalePrice'

In [30]:
# Split the rows first, then separate predictors from the target within each part.
# Note that `train_data` / `test_data` are rebound here to the two parts of train.csv.
train_data, test_data = train_test_split(df, test_size=0.33, random_state=42)

X_train, y_train = train_data[useful_features], train_data[label_col]
X_test, y_test = test_data[useful_features], test_data[label_col]


In [31]:
# Fit the scaler on the training part only, then apply it to both parts.
continuous_columns = X_train.select_dtypes(include='number').columns
scaler = StandardScaler()
scaler.fit(X_train[continuous_columns])

# The rebuilt frames get a default 0..n-1 index: no index is passed to the
# DataFrame constructor, so the row labels from the split are not carried over.
X_train_continuous = pd.DataFrame(scaler.transform(X_train[continuous_columns]), columns=continuous_columns)
X_test_continuous = pd.DataFrame(scaler.transform(X_test[continuous_columns]), columns=continuous_columns)

In [32]:

# Fit the one-hot encoder on the training part only, then apply it to both parts.
categorical_columns = X_train.select_dtypes(include='object').columns
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoder.fit(X_train[categorical_columns])

X_train_encoded = pd.DataFrame(encoder.transform(X_train[categorical_columns]), columns=encoder.get_feature_names_out(categorical_columns))
X_test_encoded = pd.DataFrame(encoder.transform(X_test[categorical_columns]), columns=encoder.get_feature_names_out(categorical_columns))

# Column-wise concatenation of the scaled numerics and the indicator columns.
X_train_final = pd.concat([X_train_continuous, X_train_encoded], axis=1)
X_test_final = pd.concat([X_test_continuous, X_test_encoded], axis=1)


In [33]:

# Train a fresh linear regression on the processed training features.
model = LinearRegression()
model.fit(X_train_final, y_train)

In [34]:
y_pred = model.predict(X_test_final)
# A linear model can output negative prices, which mean_squared_log_error rejects;
# clip at 0, as the earlier evaluation in this notebook does.
y_pred = np.clip(y_pred, 0, None)
rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
print(f"RMSLE: {rmsle}")


RMSLE: 0.21719582445130048


# Model Building


## Model training 


In [35]:
# Reload the training file and split it: all columns except 'SalePrice' are predictors,
# 33% of the rows are held out, and the seed is fixed for reproducibility.
train_data = pd.read_csv('../data/train.csv')
X_train, X_test, y_train, y_test = train_test_split(train_data.drop(columns=['SalePrice']), train_data['SalePrice'], test_size=0.33, random_state=42)



In [36]:
# Columns fed to the model, listed explicitly instead of being inferred from dtypes.
# NOTE(review): 'YrSold' was among the scaled columns earlier in the notebook
# but is not listed here; confirm the omission is intended.
continuous_columns = ['TotRmsAbvGrd', 'WoodDeckSF', '1stFlrSF']
categorical_columns = ['Foundation', 'KitchenQual']

In [37]:
# Fit the scaler on the training rows and scale them.
scaler = StandardScaler()
scaler.fit(X_train[continuous_columns])
X_train_scaled = scaler.transform(X_train[continuous_columns])

In [38]:
# Fit the one-hot encoder on the training rows and encode them.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoder.fit(X_train[categorical_columns])
encoded_train = encoder.transform(X_train[categorical_columns])
encoded_train_df = pd.DataFrame(encoded_train, columns=encoder.get_feature_names_out(categorical_columns))
# Final training matrix: scaled numeric columns first, then the indicator columns.
X_train_processed = np.concatenate([X_train_scaled, encoded_train_df], axis=1)

In [39]:
from sklearn.linear_model import LinearRegression

# Train the linear regression on the processed training matrix.
model = LinearRegression()
model.fit(X_train_processed, y_train)

In [40]:
# Model inference


In [42]:
# Apply the scaler and encoder fitted on the training rows to the test file
# (transform only, no refit), using the same column order as for training:
# scaled numeric columns first, then the indicator columns.
test_data = pd.read_csv('../data/test.csv')
X_test_continuous = test_data[continuous_columns]
X_test_continuous_scaled = scaler.transform(X_test_continuous)
X_test_categorical = test_data[categorical_columns]
encoded_test = encoder.transform(X_test_categorical)
encoded_test_df = pd.DataFrame(encoded_test, columns=encoder.get_feature_names_out(categorical_columns))
X_test_processed = np.concatenate([X_test_continuous_scaled, encoded_test_df], axis=1)

In [43]:
# NOTE(review): unlike the first evaluation cell, negative predictions are not clipped here.
y_pred = model.predict(X_test_processed)

In [None]:
# Pair each test 'Id' with its predicted price and write the file without the index column.
submission = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': y_pred})
submission.to_csv('../data/submission.csv', index=False)

# Step 2️⃣ : object persistence

In [51]:
import joblib
from joblib import dump

['../models/model.joblib']

In [None]:
# Model training
# Placeholder cell: the preprocessing steps are elided (`...`); only the final fit is shown.
...
model.fit(X_train_processed, y_train)

In [None]:
# Save the trained model
model_filepath = '../models/model.joblib'
dump(model, model_filepath)

In [57]:
# Reload the persisted model, replacing the in-memory `model`.
model = joblib.load("../models/model.joblib")


In [58]:

# ... code for creating the encoder and scaler ...

# Save the encoder and scaler objects to models
# Persist the encoder and scaler next to the model so inference can reuse them.
joblib.dump(encoder, "../models/encoder.joblib")
joblib.dump(scaler, "../models/scaler.joblib")


['../models/scaler.joblib']

In [60]:
# Reload the persisted encoder and scaler, replacing the in-memory objects.
encoder = joblib.load("../models/encoder.joblib")
scaler = joblib.load("../models/scaler.joblib")

# Step 3️⃣ : code refactoring

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [62]:
def build_model(data: pd.DataFrame, model_dir: str = 'models') -> dict[str, float]:
    """Train a preprocessing + linear-regression pipeline and persist it.

    Object-dtype columns are one-hot encoded and numeric columns are
    standardised inside a single scikit-learn ``Pipeline``, so the saved model
    can later be applied directly to raw data.

    Args:
        data: Training frame. It must contain the ``'SalePrice'`` target
            column; every other object or numeric column is used as a
            predictor. No imputation is performed here, so the selected
            columns are expected to be free of missing values.
        model_dir: Directory receiving ``model.joblib`` (the full pipeline),
            ``onehot.joblib`` and ``scaler.joblib``. It is created if missing.

    Returns:
        ``{'rmse': ..., 'r_squared': ...}`` as plain floats. Both metrics are
        computed on the training data itself, not on a held-out set.
    """
    import os  # local import: only needed to prepare the output directory

    # Separate target variable and predictors
    target_col = 'SalePrice'
    y = data[target_col]
    X = data.drop(columns=[target_col])

    # Categorical = object dtype; continuous = any numeric dtype
    # (same dtype-based selection as the exploratory cells of this notebook).
    categorical_features = X.select_dtypes(include=['object']).columns
    continuous_features = X.select_dtypes(include='number').columns

    # Preprocessing pipeline for categorical features
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Preprocessing pipeline for numerical features
    continuous_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    # Route each group of columns to its transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, categorical_features),
            ('num', continuous_transformer, continuous_features)
        ])

    # Preprocessing and regression chained in one estimator
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ])
    model.fit(X, y)

    # Training-set metrics, converted to built-in floats so the dict holds plain values
    predictions = model.predict(X)
    rmse = float(np.sqrt(((predictions - y) ** 2).mean()))
    r_squared = float(model.score(X, y))

    # Persist the pipeline and, separately, its fitted encoder and scaler
    os.makedirs(model_dir, exist_ok=True)
    fitted = preprocessor.named_transformers_
    joblib.dump(model, os.path.join(model_dir, 'model.joblib'))
    joblib.dump(fitted['cat'].named_steps['onehot'], os.path.join(model_dir, 'onehot.joblib'))
    joblib.dump(fitted['num'].named_steps['scaler'], os.path.join(model_dir, 'scaler.joblib'))

    # Return the model performance
    return {'rmse': rmse, 'r_squared': r_squared}

In [63]:
def make_predictions(input_data: pd.DataFrame, model_dir: str = 'models') -> np.ndarray:
    """Predict with the pipeline persisted by ``build_model``.

    ``model.joblib`` holds the whole pipeline (preprocessing + regressor), so
    the raw frame is passed to it directly. Encoding and scaling the data by
    hand first, as this function previously did, would preprocess twice and
    hand the pipeline an unlabeled array it cannot select columns from.

    Args:
        input_data: Raw, unprocessed frame. Presumably it must contain the
            predictor columns the pipeline was fitted on; verify against the
            training data.
        model_dir: Directory containing ``model.joblib``.

    Returns:
        The array of predictions returned by the pipeline's ``predict``.
    """
    import os  # local import: only needed to build the artefact path

    # joblib files are pickles: only load artefacts produced by a trusted source.
    model = joblib.load(os.path.join(model_dir, 'model.joblib'))

    # The pipeline applies its own fitted encoder and scaler before the regressor.
    predictions = model.predict(input_data)

    return predictions

# Step 4️⃣ : code extraction in python modules

# Step 5️⃣ : type hinting and code linting

# Step 6️⃣ (Bonus): create a python package for your model 💪


# Step 7️⃣ : merge the pw2 to main and make submission