In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Loading Data

In [2]:
# Both CSVs are read relative to the notebook's current working directory.
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

# Analyze Data

In [3]:
# Preview the first rows of the training frame.
train.head()

In [4]:
# Summary statistics of the training frame.
train.describe()

In [5]:
# Separate target from features.
# NOTE: this only selects the column; 'SalePrice' is still present in `train`.
y_train = train['SalePrice']

In [6]:
# Shape of the target distribution: skewness and (excess) kurtosis
skewness, kurtosis = y_train.skew(), y_train.kurt()

print('Data Distribution of SalePrice(Target)')
print(f'skewness: {skewness}')
print(f'kurtosis: {kurtosis}')

First let's see how we measure the skewness and kurtosis:

<h3>Skewness</h3>
<ul>
    <li><b>Positive: </b>right skewed</li>
    <li><b>Negative: </b>left skewed</li>
    <li><b>Near Zero: </b>approximately symmetrical</li>
</ul>

<h3>Kurtosis</h3>
<ul>
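    <li><b>Positive (Leptokurtic): </b>positive excess kurtosis: a sharper peak at the center and heavier tails than a normal distribution (outliers are more likely)</li>
    <li><b>Negative (Platykurtic): </b>negative excess kurtosis: a flatter peak and lighter tails than a normal distribution (outliers are less likely)</li>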
    <li><b>Near Zero (Mesokurtic): </b>normal distribution</li>
</ul>

    

We can see that our target has `skewness: 1.88 -> right-skewed` and `kurtosis: 6.53 -> high kurtosis (leptokurtic)`: most sale prices are concentrated at the lower end with a long tail stretching to the right, and the distribution has a sharp peak with heavy tails, which suggests outliers. To confirm this, let's look at a histogram of the target:

In [7]:
# Histogram of our target [SalePrice], using 60 bins
y_train.hist(bins=60)

From the histogram above we can see that the distribution of the target is not symmetric (it is clearly right-skewed) and that it contains outliers.

In [8]:
# Correlation matrix
# Restrict to numeric columns first: at this point `train` still contains
# string-valued categorical columns, and pandas >= 2.0 raises on them instead
# of silently dropping them as older versions did.
corr_matrix = train.select_dtypes(include='number').corr()

plt.figure(figsize=(33, 19))
sns.set(font_scale=1.45)
sns.heatmap(corr_matrix, square=True, cmap='coolwarm')

In [9]:
# Rank the columns of the correlation matrix by their correlation with
# SalePrice (strongest positive first) and keep the first ten labels.
correlations = corr_matrix["SalePrice"].sort_values(ascending=False)
features = correlations.head(10).index
features

In [10]:
# Pairwise scatter plots / marginal histograms for the selected columns
sns.pairplot(data=train[features], height=2.5)
plt.show()

# Data Preprocessing

In [11]:
# Null count per column for each split, shown side by side
training_null = train.isnull().sum()
testing_null = test.isnull().sum()

null = pd.concat(
    [training_null, testing_null],
    axis=1,
    keys=["Training", "Testing"],
)
null

In [12]:
def miss_values_info(df):
    """Summarise the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one null, indexed by column
        name, with the columns ``'Column'``, ``'Missing Values'`` (null
        count) and ``'Percentage'`` (null count as a percentage of
        ``len(df)``), sorted by ``'Percentage'`` in descending order.
    """
    # Count the nulls once and derive the percentage from the same counts.
    null_counts = df.isnull().sum()
    null_share = (null_counts / len(df)) * 100

    summary = pd.DataFrame({
        'Column': df.columns,
        'Missing Values': null_counts,
        'Percentage': null_share
    })

    # Keep only the columns that actually have nulls, worst offenders first.
    has_nulls = summary['Missing Values'] > 0
    return summary[has_nulls].sort_values(by='Percentage', ascending=False)

In [13]:
# Missing-value summary for the training set, before any cleaning.
train_missing = miss_values_info(train)
print('train missing info: \n', train_missing)

In [14]:
# Missing-value summary for the test set, before any cleaning.
test_missing = miss_values_info(test)
print('test missing info: \n', test_missing)

In [15]:
# Based on the data description file provided with the dataset, NaN in these
# columns is a meaningful value rather than missing data.
null_with_meaning = ["Alley", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "FireplaceQu", "GarageType", "GarageFinish", "GarageQual", "GarageCond", "PoolQC", "Fence", "MiscFeature", "MasVnrType"]

# Replace every NaN in those columns with the literal category "None".
# The filled block is assigned back explicitly: calling
# `df[col].fillna(..., inplace=True)` acts on a temporary column selection,
# which warns in pandas 2.x and leaves the frame unchanged under copy-on-write.
train[null_with_meaning] = train[null_with_meaning].fillna("None")
test[null_with_meaning] = test[null_with_meaning].fillna("None")

In [16]:
# Re-check the training set after replacing the meaningful NaNs with "None".
train_missing = miss_values_info(train)
print('train missing info: \n', train_missing)

In [17]:
# Re-check the test set after replacing the meaningful NaNs with "None".
test_missing = miss_values_info(test)
print('test missing info: \n', test_missing)

In [18]:
from sklearn.impute import SimpleImputer

def impute_missing_values(df):
    """Fill the remaining missing values in ``df`` and return it.

    ``float64`` columns are filled with the column mean and ``object``
    columns with the column's most frequent value. The imputers are fitted
    on ``df`` itself, so each frame passed in is imputed from its own
    statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after imputation.
    """
    # Columns are selected by dtype, whether or not they contain nulls.
    float_columns = df.select_dtypes(include=['float64']).columns.tolist()
    object_columns = df.select_dtypes(include=['object']).columns.tolist()

    # SimpleImputer rejects input with zero columns, so skip a dtype group
    # that selects nothing instead of letting the whole call fail.
    if float_columns:
        # Numeric gaps are filled with the column mean.
        float_imputer = SimpleImputer(strategy='mean')
        df[float_columns] = float_imputer.fit_transform(df[float_columns])

    if object_columns:
        # Categorical gaps are filled with the most frequent category (mode).
        object_imputer = SimpleImputer(strategy='most_frequent')
        df[object_columns] = object_imputer.fit_transform(df[object_columns])

    return df

In [19]:
# NOTE(review): each call fits its own imputers inside the function, so `test`
# is filled from its own column statistics rather than those learned on `train`.
train = impute_missing_values(train)
test = impute_missing_values(test)

In [20]:
# Re-check both splits after imputation.
train_missing = miss_values_info(train)
print('train missing info: \n', train_missing)
print('\n')
test_missing = miss_values_info(test)
print('test missing info: \n', test_missing)

In [21]:
# train = train.drop(columns=['SalePrice'], axis=1)

# Feature Engineering

In [22]:
# Add the natural log of the sale price as a new column of `train`.
# NOTE(review): `y_train` is rebound further down by the train/validation
# split; re-running this cell after that one would take the log of values that
# are already log-transformed.
train['LogPrice'] = np.log(y_train)

In [23]:
# One figure, two panels: raw target on the left, log-transformed on the right
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: histogram of y_train
ax_raw.hist(y_train, bins=40, color='blue', alpha=0.7)
ax_raw.set_title("Histogram of SalePrice")
ax_raw.set_xlabel("Value")
ax_raw.set_ylabel("Frequency")

# Right panel: histogram of train['LogPrice']
ax_log.hist(train['LogPrice'], bins=40, color='green', alpha=0.7)
ax_log.set_title("Histogram of log(SalePrice)")
ax_log.set_xlabel("Value")
ax_log.set_ylabel("Frequency")

# Keep the panels from overlapping, then render
fig.tight_layout()
plt.show()

In [24]:
# Data distribution of the log-transformed target.
# Measure train['LogPrice'] here: re-measuring the raw `y_train` would only
# repeat the numbers already printed before the transform.
skewness = train['LogPrice'].skew()
kurtosis = train['LogPrice'].kurt()

print('Data Distribution of LogPrice(Target)')
print(f'skewness: {skewness}')
print(f'kurtosis: {kurtosis}')

In [25]:
# Column dtypes and non-null counts after imputation and the new LogPrice column.
train.info()

In [26]:
from sklearn.preprocessing import LabelEncoder

def encode_categorical_columns(df):
    """Replace every ``object`` column of ``df`` with integer labels.

    Each column is encoded independently with ``LabelEncoder.fit_transform``,
    so the integer assigned to a category depends only on the values present
    in that column of this particular frame. Encoding two frames separately
    can therefore give the same category different codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with its ``object`` columns label-encoded.
    """
    # All object-dtype columns are treated as categorical.
    object_columns = df.select_dtypes(include=['object']).columns.tolist()

    # A single encoder instance is reused; fit_transform refits it per column.
    label_encoder = LabelEncoder()
    for column in object_columns:
        df[column] = label_encoder.fit_transform(df[column])

    return df

In [27]:
# Encode train and test together. The encoder is refit per column on whatever
# frame it is given, so encoding the two splits separately would map the same
# category to different integers whenever their category sets differ.
n_train = len(train)
combined = pd.concat([train, test], axis=0, ignore_index=True, sort=False)
combined = encode_categorical_columns(combined)

# Split the stacked frame back into its two parts, keeping each split's own
# columns (the target columns exist only in train).
encoded_train = combined.iloc[:n_train][train.columns].copy()
encoded_test = combined.iloc[n_train:][test.columns].reset_index(drop=True)

In [28]:
# Keep the log-price as the modelling target, then remove both target columns
# from the feature frame. `DataFrame.drop` returns a new frame, so the result
# has to be assigned; otherwise SalePrice and LogPrice stay among the features.
# NOTE: re-running only this cell raises KeyError because 'LogPrice' is already
# gone from `encoded_train`; re-run from the encoding cell instead.
y_train_log = encoded_train['LogPrice']
encoded_train = encoded_train.drop(columns=['LogPrice', 'SalePrice'])

In [29]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Scale exactly the columns the test set has, in the test set's order. This
# keeps any target column still present in `encoded_train` out of the feature
# matrix and guarantees the scaler sees the same columns at fit and transform.
feature_columns = encoded_test.columns

# Fit and transform on the training data
train_scaled = scaler.fit_transform(encoded_train[feature_columns])

# Transform the test data using the same scaler
test_scaled = scaler.transform(encoded_test[feature_columns])

In [30]:
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets (20% validation, fixed seed).
# NOTE(review): this rebinds `y_train`, which until now held the raw SalePrice
# column, to the log-price training split; earlier cells that read `y_train`
# behave differently if re-run after this one.
# NOTE(review): X_train / X_val / y_train / y_val are not read again in the
# cells visible below; model selection uses train_scaled / y_train_log directly.
X_train, X_val, y_train, y_val = train_test_split(train_scaled, y_train_log, test_size=0.2, random_state=42)

# Selecting Model

In [31]:
from pycaret import *

In [34]:
from pycaret.regression import *

# `setup` and `compare_models` come from the wildcard import above.
# Fix the session seed so the internal split and the model ranking are
# reproducible across runs (same seed as the manual split uses).
exp_1 = setup(data = train_scaled,  target = y_train_log, session_id = 42)
best_model = compare_models()

In [35]:
# Predict on the scaled test features. The model was trained against
# `y_train_log`, so these predictions are on the log-price scale.
y_pred = best_model.predict(test_scaled)

In [36]:
# Undo the log transform to bring the predictions back to SalePrice units.
sub_pred = np.exp(y_pred)

In [None]:
# Create a DataFrame for submission.
# Use the back-transformed predictions (`sub_pred`): `y_pred` is still on the
# log scale and would submit log-prices as if they were sale prices.
submission_df = pd.DataFrame({'Id': test['Id'], 'SalePrice': sub_pred})

# Save the submission DataFrame to a CSV file
submission_df.to_csv('sample_submission.csv', index=False)