In [22]:
import pandas as pd
import numpy as np

In [23]:
train_data = pd.read_csv('../data/train.csv')
test_data=pd.read_csv('../data/test.csv')
sample_submission = pd.read_csv('../data/sample_submission.csv')


In [24]:
df = train_data.copy()
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [25]:
useful_features = ['Foundation', 'KitchenQual', 'TotRmsAbvGrd', 'WoodDeckSF', 'YrSold', '1stFlrSF']
label_col = 'SalePrice'
df = df[useful_features + [label_col]]
df.head()

Unnamed: 0,Foundation,KitchenQual,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,SalePrice
0,PConc,Gd,8,0,2008,856,208500
1,CBlock,TA,6,298,2007,1262,181500
2,PConc,Gd,6,0,2008,920,223500
3,BrkTil,Gd,7,0,2006,961,140000
4,PConc,Gd,9,192,2008,1145,250000


In [26]:
df.drop_duplicates()
df.tail()

Unnamed: 0,Foundation,KitchenQual,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,SalePrice
1455,PConc,TA,7,0,2007,953,175000
1456,CBlock,TA,7,349,2010,2073,210000
1457,Stone,Gd,9,0,2010,1188,266500
1458,CBlock,Gd,5,366,2010,1078,142125
1459,CBlock,TA,6,736,2008,1256,147500


In [27]:
df = df.reset_index(drop=True)
df.tail()

Unnamed: 0,Foundation,KitchenQual,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,SalePrice
1455,PConc,TA,7,0,2007,953,175000
1456,CBlock,TA,7,349,2010,2073,210000
1457,Stone,Gd,9,0,2010,1188,266500
1458,CBlock,Gd,5,366,2010,1078,142125
1459,CBlock,TA,6,736,2008,1256,147500


# Scale continuous features

In [28]:
continuous_columns = df[useful_features].select_dtypes(include='number').columns
continuous_columns

Index(['TotRmsAbvGrd', 'WoodDeckSF', 'YrSold', '1stFlrSF'], dtype='object')

In [29]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(df[continuous_columns])

In [30]:
scaled_columns = scaler.transform(df[continuous_columns])
scaled_columns

array([[ 0.91220977, -0.75217584,  0.13877749, -0.79343379],
       [-0.31868327,  1.62619479, -0.61443862,  0.25714043],
       [-0.31868327, -0.75217584,  0.13877749, -0.62782603],
       ...,
       [ 1.52765629, -0.75217584,  1.64520971,  0.06565646],
       [-0.93412978,  2.16891024,  1.64520971, -0.21898188],
       [-0.31868327,  5.12192075,  0.13877749,  0.2416147 ]])

In [31]:
continuous_features_df = pd.DataFrame(data=scaled_columns, columns=continuous_columns)
continuous_features_df.head()

Unnamed: 0,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF
0,0.91221,-0.752176,0.138777,-0.793434
1,-0.318683,1.626195,-0.614439,0.25714
2,-0.318683,-0.752176,0.138777,-0.627826
3,0.296763,-0.752176,-1.367655,-0.521734
4,1.527656,0.780197,0.138777,-0.045611


## Categorical features

In [32]:
categorical_columns = df[useful_features].select_dtypes(include='object').columns
categorical_columns

Index(['Foundation', 'KitchenQual'], dtype='object')

In [33]:
categorical_features_df = pd.get_dummies(df[categorical_columns])
categorical_features_df.head()

Unnamed: 0,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA
0,0,0,1,0,0,0,0,0,1,0
1,0,1,0,0,0,0,0,0,0,1
2,0,0,1,0,0,0,0,0,1,0
3,1,0,0,0,0,0,0,0,1,0
4,0,0,1,0,0,0,0,0,1,0


In [34]:
final_df = continuous_features_df.join(categorical_features_df).join(df[label_col])
final_df.head()

Unnamed: 0,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,SalePrice
0,0.91221,-0.752176,0.138777,-0.793434,0,0,1,0,0,0,0,0,1,0,208500
1,-0.318683,1.626195,-0.614439,0.25714,0,1,0,0,0,0,0,0,0,1,181500
2,-0.318683,-0.752176,0.138777,-0.627826,0,0,1,0,0,0,0,0,1,0,223500
3,0.296763,-0.752176,-1.367655,-0.521734,1,0,0,0,0,0,0,0,1,0,140000
4,1.527656,0.780197,0.138777,-0.045611,0,0,1,0,0,0,0,0,1,0,250000


# Model training

In [35]:
X, y = final_df.drop(columns=[label_col]), final_df[label_col]
X, y = final_df.drop(columns=[label_col]), final_df[label_col]

In [36]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [37]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()


In [38]:
model.fit(X_train, y_train)

# Model evaluation

In [39]:
y_pred = model.predict(X_test)
y_pred[y_pred < 0] = 0

In [40]:
import numpy as np
from sklearn.metrics import mean_squared_log_error

def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)
compute_rmsle(y_test, y_pred)

0.22

## Testing dataframe equality

In [41]:
final_df.to_parquet('../data' + 'processed_df.parquet', index=False)

In [42]:
processed_df = pd.read_parquet('../data' + 'processed_df.parquet')
processed_df.head()

Unnamed: 0,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,SalePrice
0,0.91221,-0.752176,0.138777,-0.793434,0,0,1,0,0,0,0,0,1,0,208500
1,-0.318683,1.626195,-0.614439,0.25714,0,1,0,0,0,0,0,0,0,1,181500
2,-0.318683,-0.752176,0.138777,-0.627826,0,0,1,0,0,0,0,0,1,0,223500
3,0.296763,-0.752176,-1.367655,-0.521734,1,0,0,0,0,0,0,0,1,0,140000
4,1.527656,0.780197,0.138777,-0.045611,0,0,1,0,0,0,0,0,1,0,250000


In [43]:
pd.testing.assert_frame_equal(processed_df, final_df)

pd.testing.assert_frame_equal(processed_df, final_df.drop(columns=[label_col]))

# PW2

In [12]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor


In [13]:
data = pd.read_csv('../data/train.csv')
X_train, X_test, y_train, y_test = train_test_split(data.drop('SalePrice', axis=1), data['SalePrice'], test_size=0.2, random_state=42)

categorical_columns = ['Foundation', 'KitchenQual']
continuous_columns = ['TotRmsAbvGrd', 'WoodDeckSF', 'YrSold', '1stFlrSF']

cat_encoder = OneHotEncoder()
X_train_cat = cat_encoder.fit_transform(X_train[categorical_columns])
X_test_cat = cat_encoder.transform(X_test[categorical_columns])
num_scaler = StandardScaler()
X_train_num = num_scaler.fit_transform(X_train[continuous_columns])
X_test_num = num_scaler.transform(X_test[continuous_columns])


X_train = pd.concat([pd.DataFrame(X_train_cat.toarray(), columns=cat_encoder.get_feature_names_out(categorical_columns)), pd.DataFrame(X_train_num, columns=continuous_columns)], axis=1)
X_test = pd.concat([pd.DataFrame(X_test_cat.toarray(), columns=cat_encoder.get_feature_names_out(categorical_columns)), pd.DataFrame(X_test_num, columns=continuous_columns)], axis=1)


In [14]:
model = LinearRegression()
model.fit(X_train, y_train)

In [15]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R2 Score: {r2:.2f}")


Mean Squared Error: 2190916075.66
R2 Score: 0.71


## Model Building


### Model training 


In [17]:
#Dataset loading and splitting into train and test
data = pd.read_csv('../data/train.csv')
X_train, X_test, y_train, y_test = train_test_split(data.drop('SalePrice', axis=1), data['SalePrice'], test_size=0.2, random_state=42)


In [18]:
#Preprocessing and feature engineering of the train set

cat_cols = ['Foundation', 'KitchenQual']
num_cols = ['TotRmsAbvGrd', 'WoodDeckSF', 'YrSold', '1stFlrSF']

encoder = OneHotEncoder()
encoder.fit(X_train[cat_cols])

scaler = StandardScaler()
scaler.fit(X_train[num_cols])

X_train_cat = encoder.transform(X_train[cat_cols])
X_train_num = scaler.transform(X_train[num_cols])
X_train_processed = np.concatenate([X_train_cat.toarray(), X_train_num], axis=1)

In [19]:
#Model training
model = LinearRegression().fit(X_train_processed, y_train)

### Model evaluation

In [20]:
#Preprocessing and feature engineering of the test set

X_test_cat = encoder.transform(X_test[cat_cols])
X_test_num = scaler.transform(X_test[num_cols])
X_test_processed = np.concatenate([X_test_cat.toarray(), X_test_num], axis=1)


In [21]:
#Model predictions on the test set

y_pred = model.predict(X_test_processed)


In [22]:
#Model evaluation

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Error:", mae)
print("R2 Score:", r2)

Mean Squared Error: 2192849019.441781
Root Mean Squared Error: 46827.86584333928
Mean Absolute Error: 30706.510273972603
R2 Score: 0.7141126778432092


## Model inference


In [136]:
#Prepare Submission File
my_submission = pd.DataFrame({'Id': test.Id, 'SalePrice': predicted_prices})
my_submission.to_csv('price_pred.csv', index=False)

# Step 2️⃣ : object persistance

In [48]:
import joblib
from joblib import dump

In [49]:
import joblib

# save the trained model
joblib.dump(model, '../models/model.joblib')

# save the encoders and scalers
joblib.dump(encoder, '../models/encoder.joblib')
joblib.dump(scaler, '../models/scaler.joblib')

['../models/scaler.joblib']

In [50]:
# load the trained model
model = joblib.load('../models/model.joblib')

# load the encoders and scalers
encoder = joblib.load('../models/encoder.joblib')
scaler = joblib.load('../models/scaler.joblib')


## inference 

In [57]:
###model inference
test_data = pd.read_csv('../data/test.csv')
cat_cols = ['Foundation', 'KitchenQual']
num_cols = ['TotRmsAbvGrd', 'WoodDeckSF', 'YrSold', '1stFlrSF']
test_data[cat_cols] = test_data[cat_cols].fillna(test_data[cat_cols].mode().iloc[0])

X_test = test_data.drop("Id", axis=1)

X_test[num_cols] = scaler.transform(X_test[num_cols])
X_test_processed = encoder.transform(X_test[cat_cols])

# Model predictions on the test set
predictions = model.predict(np.hstack((X_test[num_cols], X_test_processed.toarray())))
#y_pred = model.predict(np.hstack((X_test[continuous_features], X_test_processed.toarray())))

# Model evaluation
output = pd.DataFrame({"Id": test_data["Id"], "SalePrice": predictions})
print(output)
#joblib.dump(output)
#joblib.dump(output, '../models/model.joblib')
output.to_csv('../data/pred.csv', index=False)

        Id     SalePrice
0     1461  1.074233e+18
1     1462  2.162314e+18
2     1463  1.692660e+17
3     1464  6.865798e+17
4     1465 -2.265990e+17
...    ...           ...
1454  2915 -4.035423e+17
1455  2916 -2.213934e+17
1456  2917  1.556380e+18
1457  2918 -9.912193e+17
1458  2919 -1.746918e+17

[1459 rows x 2 columns]


# Step 3️⃣ : code refactoring

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import mean_squared_error, mean_squared_log_error
import joblib


In [2]:
def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)

In [32]:
def build_model(data: pd.DataFrame) -> dict[str, float]:
    """Orchestrates the different steps for the model building phase and returns a dictionary with the model performances."""
    
    # Splitting data into features and target variable
    X_train = data.drop(["Id", "SalePrice"], axis=1)
    y_train = data["SalePrice"]
    
    # Splitting data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    
    # Defining continuous and categorical features
    continuous_features = ['TotRmsAbvGrd', 'WoodDeckSF', 'YrSold', '1stFlrSF']
    categorical_features = ['Foundation', 'KitchenQual']  

    
    # Scaling continuous features and encoding categorical features
    scaler = StandardScaler()
    encoder = OneHotEncoder(handle_unknown="ignore")
    scaler.fit(X_train[continuous_features])
    X_train[continuous_features] = scaler.transform(X_train[continuous_features])
    encoder.fit(X_train[categorical_features])
    X_train_processed = encoder.transform(X_train[categorical_features])
    
    # Fitting the model on the preprocessed training data
    model = LinearRegression()
    model.fit(np.hstack((X_train[continuous_features], X_train_processed.toarray())), y_train)
    
    # Preprocessing the test data and generating predictions
    X_test[continuous_features] = scaler.transform(X_test[continuous_features])
    X_test_processed = encoder.transform(X_test[categorical_features])
    y_pred = model.predict(np.hstack((X_test[continuous_features], X_test_processed.toarray())))
    
    # Computing model performances and returning them in a dictionary
    rmse = sqrt(mean_squared_error(y_test, y_pred))
    rmsle = compute_rmsle(np.log(y_test), np.log(y_pred))
    return {"rmse": rmse, "rmsle": rmsle}




In [61]:
def make_predictions(input_data):
    # Load the trained model and preprocessing objects
    model = joblib.load('../models/model.joblib')
    scaler = joblib.load('../models/scaler.joblib')
    encoder = joblib.load('../models/encoder.joblib')

    # Identify categorical and continuous features
    categorical_features = ['Foundation', 'KitchenQual']
    continuous_features = ['TotRmsAbvGrd', 'WoodDeckSF', 'YrSold', '1stFlrSF']

    # Replace missing values in categorical features with the most frequent category
    imputer = SimpleImputer(strategy='most_frequent')
    input_data[categorical_features] = imputer.fit_transform(input_data[categorical_features])

    # Apply the same preprocessing steps used during training
    input_data[continuous_features] = scaler.transform(input_data[continuous_features])
    input_data_processed = encoder.transform(input_data[categorical_features])

    # Make predictions
    predictions = model.predict(np.hstack((input_data[continuous_features], input_data_processed.toarray())))

    return predictions



In [59]:
build_model(pd.read_csv('../data/train.csv'))

{'rmse': 46762.55031512341, 'rmsle': 0.02}

In [60]:
make_predictions(pd.read_csv('../data/test.csv'))

array([ 1.07423269e+18,  2.16231358e+18,  1.69265975e+17, ...,
        1.55638026e+18, -9.91219259e+17, -1.74691834e+17])