In [301]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder , StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression

In [302]:
# Read the two CSV inputs from the local datasets/ directory.
dfs = pd.read_csv('datasets/train.csv')
tdf = pd.read_csv('datasets/test.csv')

# Independent copy of the training frame; edits to `df` will not touch `dfs`.
df = dfs.copy(deep=True)

In [303]:
# Regression target, taken from the training frame only.
df_price = dfs['SalePrice']

# Stack training rows (without the target) on top of the test rows so that
# imputation and encoding produce the same columns for both sets.
# 'Id' is only a row identifier (it is used as the submission key later), so
# it is removed here to keep it from being fed to the model as a feature.
# errors='ignore' keeps this a no-op if the column is absent.
df_all = pd.concat(
    [dfs.drop(columns='SalePrice'), tdf],
    ignore_index=True,
).drop(columns='Id', errors='ignore')

In [304]:
def fill_none_categories(df):
    """Return a copy of ``df`` with missing values imputed for known columns.

    Every step is skipped when its column is not present, so the function
    can be applied to frames that contain only a subset of the columns.

    Imputation rules:
      * the categorical columns listed below: missing -> the string 'None'
      * 'MasVnrArea': missing -> 0
      * 'GarageYrBlt': missing -> the row's 'YearBuilt' value
        (left untouched when 'YearBuilt' is not available)
      * 'LotFrontage': missing -> median of the column in ``df``
      * 'Electrical': missing -> 'SBrkr'

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.
    """
    # Work on a copy so the caller's frame is never mutated as a side effect.
    filled = df.copy()

    categorical_none_features = [
        'PoolQC',
        'MiscFeature',
        'Alley',
        'Fence',
        'FireplaceQu',
        'GarageType',
        'GarageFinish',
        'GarageQual',
        'GarageCond',
        'BsmtQual',
        'BsmtCond',
        'BsmtExposure',
        'BsmtFinType1',
        'BsmtFinType2',
        'MasVnrType',
    ]

    # Categorical columns: a missing entry becomes the explicit level 'None'.
    for feature in categorical_none_features:
        if feature in filled.columns:
            filled[feature] = filled[feature].fillna('None')

    # Numeric columns.
    if 'MasVnrArea' in filled.columns:
        filled['MasVnrArea'] = filled['MasVnrArea'].fillna(0)

    # Assuming the garage was built in the same year as the house.
    # Both columns must exist; otherwise there is nothing to fill from.
    if 'GarageYrBlt' in filled.columns and 'YearBuilt' in filled.columns:
        filled['GarageYrBlt'] = filled['GarageYrBlt'].fillna(filled['YearBuilt'])

    if 'LotFrontage' in filled.columns:
        # Median of the non-missing values of this same frame.
        lot_frontage_median = filled['LotFrontage'].median()
        filled['LotFrontage'] = filled['LotFrontage'].fillna(lot_frontage_median)

    if 'Electrical' in filled.columns:
        # Very few values are expected to be missing; use the assumed most
        # common level.
        filled['Electrical'] = filled['Electrical'].fillna('SBrkr')

    return filled


In [305]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

def correct_encoding(df):
    """Encode categorical columns of ``df`` for a numeric model.

    Two encodings are applied:

    1. Quality columns ('ExterQual', 'KitchenQual', 'BsmtQual', 'HeatingQC',
       'GarageQual') are mapped to integers via
       None=0 < Po=1 < Fa=2 < TA=3 < Gd=4 < Ex=5 and stored in a new
       '<name>_Encoded' column. Values outside the map (including missing
       values) become 0. The raw quality column is then removed.
    2. Every remaining object-dtype column is one-hot encoded with
       ``pd.get_dummies(..., drop_first=True)``.

    The raw quality columns are dropped *before* the one-hot step. They are
    still object-dtype at that point, so leaving them in would make
    ``get_dummies`` expand them as well and the result would contain both the
    ordinal column and a redundant set of dummies for the same feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, encoded frame.
    """
    quality_features = ['ExterQual', 'KitchenQual', 'BsmtQual', 'HeatingQC', 'GarageQual']
    quality_map = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}

    df_encoded = df.copy()

    # Ordinal encoding for the quality columns that are actually present.
    present_quality = [feature for feature in quality_features if feature in df_encoded.columns]
    for feature in present_quality:
        df_encoded[f'{feature}_Encoded'] = df_encoded[feature].map(quality_map).fillna(0)

    # Remove the raw quality columns now, so the one-hot step below cannot
    # pick them up a second time.
    df_encoded = df_encoded.drop(columns=present_quality)

    # One-hot encode whatever object-dtype columns are left.
    remaining_categorical = df_encoded.select_dtypes(include=['object']).columns.tolist()
    if not remaining_categorical:
        # Nothing to expand; skip get_dummies entirely.
        return df_encoded

    return pd.get_dummies(df_encoded, columns=remaining_categorical, drop_first=True)


In [306]:
# Preprocessing pipeline on the stacked frame:
# impute known columns -> encode categoricals -> zero-fill anything still missing.
df_all = (
    df_all
    .pipe(fill_none_categories)
    .pipe(correct_encoding)
    .fillna(0)
)

In [307]:
# `df_all` was built by stacking the training rows first and the test rows
# after them, so a positional cut at the training row count separates the two.
n_train = len(df)
df_train, df_test = df_all.iloc[:n_train], df_all.iloc[n_train:]

In [308]:
# Fit ordinary least squares on the training rows, then predict the test rows.
# `fit` returns the estimator itself, so construction and fitting can be chained.
lr_model = LinearRegression().fit(df_train, df_price)
Y_pred = lr_model.predict(df_test)

In [309]:
# Pair each test-set Id with its predicted price and write the result to disk
# without the DataFrame index column.
submission = pd.DataFrame(dict(Id=tdf['Id'], SalePrice=Y_pred))
submission.to_csv('submission.csv', index=False)