IMPORTING LIBRAIRIES

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

IMPORTING DATA

In [2]:
train_data=pd.read_csv('./train.csv')
test_data=pd.read_csv('./test.csv')

HANDLING MISSING VALUES

In [3]:
def columns_with_many_nulls(dataframe):
    null_counts = dataframe.isnull().sum()
    return null_counts

# Call the method passing your DataFrame as argument
columns_with_many_nulls = columns_with_many_nulls(train_data)

# Display columns with more than a certain number of null values
print(columns_with_many_nulls)


Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64


In [4]:
def handle_missing_values(df):
    # Fill missing values for 'LotFrontage' column
    df['LotFrontage'].fillna(df['LotFrontage'].mean().round(2), inplace=True)
    df['MasVnrArea'].fillna(df['MasVnrArea'].mean().round(2), inplace=True)
    df['GarageYrBlt'].fillna(0, inplace=True)
    df['GarageArea'].fillna(0, inplace=True)
    df['BsmtFinSF1'].fillna(0, inplace=True)
    df['BsmtFinSF2'].fillna(0, inplace=True)
    df['BsmtUnfSF'].fillna(0, inplace=True)
    df['TotalBsmtSF'].fillna(0, inplace=True)
    df['BsmtFullBath'].fillna(0, inplace=True)
    df['BsmtHalfBath'].fillna(0, inplace=True)
    df['GarageCars'].fillna(0, inplace=True)

    # List of columns to fill missing values
    columns_to_fill = [  'Alley', 'MasVnrType', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
    
    # Iterate over columns and fill missing values if the column exists
    for column in columns_to_fill:
        if column in df.columns:
            df[column].fillna(f'No{column}', inplace=True)
    
    return df


FEATURE CLASSIFICATION

In [5]:
def convert_categorical_columns(dataframe):
    for column in dataframe.columns:
        if dataframe[column].dtype == 'object':
            dataframe = dataframe.join(pd.get_dummies(dataframe[column], prefix=column)).drop([column], axis=1)
    return dataframe

In [6]:
def label_encode_boolean_variables(dataframe):
    # Initialize LabelEncoder
    label_encoder = LabelEncoder()

    # Iterate over each column in the DataFrame
    for column in dataframe.columns:
        # Check if the column contains boolean values
        if dataframe[column].dtype == bool:
            # Apply label encoding to boolean variables
            dataframe[column] = label_encoder.fit_transform(dataframe[column])

    return dataframe

FEATURE ENGINEERING

In [7]:
def add_additional_features(data):
    # Calculate the age of the property
    data['YearOld'] = 2024-data['YearBuilt'] 
    data['YearRemodOld'] = 2024 - data['YearRemodAdd'] 
    
    # Calculate total square footage
    data['TotalSquareFeet'] = data['1stFlrSF'] + data['2ndFlrSF'] + data['LowQualFinSF'] + data['GrLivArea']
    
    # Calculate total number of bathrooms
    data['TotalBathrooms'] = data['BsmtFullBath'] + data['BsmtHalfBath'] + data['FullBath'] + data['HalfBath']
    
    # Assign total number of bedrooms
    data['TotalBedrooms'] = data['BedroomAbvGr']
    
    # Calculate total number of rooms
    data['TotalRooms'] = data['TotalBedrooms'] + data['TotalBathrooms'] + data['KitchenAbvGr']
    
    return data


Linear Regression

In [8]:
# Concatenate training and test datasets
combined_data = pd.concat([train_data, test_data], ignore_index=True)

In [9]:
# Handle missing values and encode categorical columns
combined_data = handle_missing_values(combined_data)

In [10]:
combined_data = convert_categorical_columns(combined_data)

In [11]:
combined_data = label_encode_boolean_variables(combined_data)

In [13]:
combined_data = add_additional_features(combined_data)

In [14]:
combined_data = combined_data.join(pd.get_dummies(combined_data['MSSubClass'], prefix='MSSubClass')).drop(['MSSubClass'], axis=1)

In [15]:
# Drop the target variable from the test set*
columns_to_fill = [  'TotalBathrooms','TotalRooms']
    # Iterate over columns and fill missing values if the column exists
for column in columns_to_fill:
    if column in combined_data.columns:
        combined_data[column].fillna(combined_data[column].mean().round(2), inplace=True)

In [16]:
# Split the combined dataset back into training and test sets
train_data = combined_data[:len(train_data)]
test_data = combined_data[len(train_data):]

In [17]:

X_test = test_data.drop(['SalePrice'], axis=1)

#X_test.dropna(inplace=True)
X_train = train_data.drop(['SalePrice'], axis=1)
y_train = train_data['SalePrice']
# Print columns with NaN values
nan_columns = X_test.columns[X_test.isnull().sum() > 0]
print(nan_columns)
print(X_train.info())
print(X_test.info())

Index([], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Columns: 315 entries, Id to MSSubClass_190
dtypes: bool(16), float64(13), int64(286)
memory usage: 3.4 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 1460 to 2918
Columns: 315 entries, Id to MSSubClass_190
dtypes: bool(16), float64(13), int64(286)
memory usage: 3.4 MB
None


In [21]:
%env LOKY_MAX_CPU_COUNT=<number_of_cores>


env: LOKY_MAX_CPU_COUNT=<number_of_cores>


In [22]:
# Fit the model on the training data
#model = LinearRegression()
#model = RandomForestRegressor(n_estimators=200, random_state=123)
model = KNeighborsRegressor(n_neighbors=5)

model.fit(X_train, y_train)
# Make predictions on the test data
predictions = model.predict(X_test)
predictions_df = pd.DataFrame({'Id': X_test['Id'], 'SalePrice': predictions})
# Write the DataFrame to a CSV file
predictions_df.to_csv('./predictions.csv', index=False)