In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression

In [3]:
train_data=pd.read_csv('house-prices-advanced-regression-techniques/train.csv')
test_data=pd.read_csv('house-prices-advanced-regression-techniques/test.csv')

In [4]:
def columns_with_many_nulls(dataframe):
    null_counts = dataframe.isnull().sum()
    return null_counts

# Call the method passing your DataFrame as argument
columns_with_many_nulls = columns_with_many_nulls(train_data)

# Display columns with more than a certain number of null values
print(columns_with_many_nulls)


Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64


In [19]:
def handle_missing_values(df):
    # Fill missing values for 'LotFrontage' column
    df['LotFrontage'].fillna(df['LotFrontage'].mean(), inplace=True)
    df['MasVnrArea'].fillna(df['MasVnrArea'].mean(), inplace=True)
    df['GarageYrBlt'].fillna(0, inplace=True)

    
    # List of columns to fill missing values
    columns_to_fill = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath','GarageCars', 'GarageArea', 'TotalBathrooms','TotalRooms','Alley', 'MasVnrType', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
    
    # Iterate over columns and fill missing values if the column exists
    for column in columns_to_fill:
        if column in df.columns:
            df[column].fillna(f'No{column}', inplace=True)
    
    return df


In [6]:
def convert_categorical_columns(dataframe):
    for column in dataframe.columns:
        if dataframe[column].dtype == 'object':
            dataframe = dataframe.join(pd.get_dummies(dataframe[column], prefix=column)).drop([column], axis=1)
    return dataframe

In [7]:
def label_encode_boolean_variables(dataframe):
    # Initialize LabelEncoder
    label_encoder = LabelEncoder()

    # Iterate over each column in the DataFrame
    for column in dataframe.columns:
        # Check if the column contains boolean values
        if dataframe[column].dtype == bool:
            # Apply label encoding to boolean variables
            dataframe[column] = label_encoder.fit_transform(dataframe[column])

    return dataframe

In [8]:
def add_additional_features(data):
    # Calculate the age of the property
    data['YearOld'] = 2024-data['YearBuilt'] 
    data['YearRemodOld'] = 2024 - data['YearRemodAdd'] 
    
    # Calculate total square footage
    data['TotalSquareFeet'] = data['1stFlrSF'] + data['2ndFlrSF'] + data['LowQualFinSF'] + data['GrLivArea']
    
    # Calculate total number of bathrooms
    data['TotalBathrooms'] = data['BsmtFullBath'] + data['BsmtHalfBath'] + data['FullBath'] + data['HalfBath']
    
    # Assign total number of bedrooms
    data['TotalBedrooms'] = data['BedroomAbvGr']
    
    # Calculate total number of rooms
    data['TotalRooms'] = data['TotalBedrooms'] + data['TotalBathrooms'] + data['KitchenAbvGr']
    
    return data


Linear Regression

In [9]:
# Concatenate training and test datasets
combined_data = pd.concat([train_data, test_data], ignore_index=True)

In [10]:
# Handle missing values and encode categorical columns
combined_data = handle_missing_values(combined_data)

In [11]:
combined_data = convert_categorical_columns(combined_data)

In [12]:
combined_data = label_encode_boolean_variables(combined_data)

In [14]:
combined_data = add_additional_features(combined_data)

In [15]:
combined_data = combined_data.join(pd.get_dummies(combined_data['MSSubClass'], prefix='MSSubClass')).drop(['MSSubClass'], axis=1)

In [16]:
# Split the combined dataset back into training and test sets
train_data = combined_data[:len(train_data)]
test_data = combined_data[len(train_data):]

In [17]:

X_test = test_data.drop(['SalePrice'], axis=1)
# Drop the target variable from the test set*
columns_to_fill = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath','GarageCars', 'GarageArea', 'TotalBathrooms','TotalRooms']
    # Iterate over columns and fill missing values if the column exists
for column in columns_to_fill:
    if column in X_test.columns:
        X_test[column].fillna(X_test[column].mean(), inplace=True)
#X_test.dropna(inplace=True)
X_train = train_data.drop(['SalePrice'], axis=1)
y_train = train_data['SalePrice']
# Print columns with NaN values
nan_columns = X_test.columns[X_test.isnull().sum() > 0]
print(nan_columns)
print(X_train.info())
print(X_test.info())

Index([], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Columns: 315 entries, Id to MSSubClass_190
dtypes: bool(16), float64(13), int64(286)
memory usage: 3.4 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 1460 to 2918
Columns: 315 entries, Id to MSSubClass_190
dtypes: bool(16), float64(13), int64(286)
memory usage: 3.4 MB
None


In [18]:
# Fit the model on the training data
model = LinearRegression()
model.fit(X_train, y_train)
# Make predictions on the test data
predictions = model.predict(X_test)
predictions_df = pd.DataFrame({'Id': X_test['Id'], 'SalePrice': predictions})
# Write the DataFrame to a CSV file
predictions_df.to_csv('house-prices-advanced-regression-techniques/predictions.csv', index=False)
for prediction in predictions:
    print(prediction)

112118.62774086246
162266.02283806042
188151.50642243985
198117.07839512898
206810.57935308447
171353.07564507905
175450.3099764946
157406.46526940973
212895.34877888145
115683.34014132954
154805.79403526362
98540.08495136918
94455.410189327
147482.10640758593
107729.4277387544
377015.5489714814
263343.2043862627
311767.85569162655
304175.0743091943
487082.1211139423
312562.03196992155
204830.69894612997
177884.93653847274
162518.72428432215
176215.6685509802
188795.71606550328
335377.4336344014
219267.77920281296
203602.02197213648
256774.4906655311
202673.07619107337
106047.22272533757
185674.41240088895
303957.908268112
289644.01243767457
244424.22718845445
175094.08312251806
175126.7773795248
162224.61510539762
148885.87070956585
197039.6488601657
132901.9472784011
356013.09563612286
234092.81798949497
225172.64955847556
192893.34090430845
259261.63365840577
198009.80154743927
155083.87944965184
143844.59014699436
160999.24082748548
177013.41734803314
150894.39908423895
171440.7688