In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the raw housing CSV; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="AmesHousing_before.csv")

In [None]:
# Show the freshly loaded frame as this cell's output.
data

In [None]:
# Print per-column dtypes and non-null counts to spot the columns with missing values.
data.info()

In [None]:
# Discard these four columns entirely instead of imputing them
# (presumably because they are almost entirely empty - confirm against data.info()).
dropped_columns = ["Pool QC", "Fence", "Alley", "Misc Feature"]
data = data.drop(labels=dropped_columns, axis="columns")

In [None]:
# Impute missing values in text (object-dtype) columns only; numeric columns are
# handled in a separate cell.
# Rule: when a column's most common value occurs more than MODE_MIN_COUNT times,
# blanks take that value; otherwise they become the literal string "None", on the
# assumption that the value is missing for a reason.
MODE_MIN_COUNT = 100

for col in data.select_dtypes(include="object").columns:
    if not data[col].isna().any():
        continue  # nothing to fill in this column
    modes = data[col].mode()  # NaN is ignored; ties come back sorted
    if modes.empty:
        # Column is entirely missing, so there is no mode to look up.
        fill_value = "None"
    else:
        mode_val = modes.iloc[0]
        mode_count = (data[col] == mode_val).sum()
        fill_value = mode_val if mode_count > MODE_MIN_COUNT else "None"
    data[col] = data[col].fillna(fill_value)

In [None]:
# Numeric columns in which a blank is replaced by 0.
zeros_fill_cols = [
    'Garage Yr Blt', 'Garage Area', 'Garage Cars',
    'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2',
    'Bsmt Unf SF', 'Total Bsmt SF', 'Bsmt Full Bath', 'Bsmt Half Bath',
]
# Fill all of them in one vectorised assignment.
data[zeros_fill_cols] = data[zeros_fill_cols].fillna(0)

# Lot Frontage is estimated with the column median rather than 0.
lot_frontage_median = data['Lot Frontage'].median()
data['Lot Frontage'] = data['Lot Frontage'].fillna(lot_frontage_median)

In [None]:
# Remove "Order" and "PID" (presumably record identifiers with no predictive value - confirm).
data = data.drop(labels=["Order", "PID"], axis="columns")

In [None]:
# Exploratory data analysis: look at the distributions before modelling.
# Work on a copy so that the imputed `data` frame is not modified by later cells.
cleaned_data = data.copy(deep=True)
# One histogram per numeric column. figsize is (width, height); it is enlarged
# because the default size was hard to read.
cleaned_data.hist(figsize=(20, 15))

In [None]:
# Pairwise Pearson correlation between every pair of numeric columns (the full matrix,
# not only the target's row). Values run from +1 (both rise together) through
# 0 (no linear relationship) to -1 (one rises as the other falls).
numeric_view = cleaned_data.select_dtypes(include="number")
numeric_view.corr(method="pearson")

In [None]:
# Annotated correlation heatmap of the numeric columns.
# The figure is sized from the number of columns so that each annotated cell stays
# legible; on a default-size figure the numbers overlap.
corr_matrix = cleaned_data.select_dtypes(include="number").corr()
n_corr_cols = len(corr_matrix.columns)
fig, ax = plt.subplots(figsize=(max(8, 0.8 * n_corr_cols), max(6, 0.65 * n_corr_cols)))
# annot=True writes the correlation value in each cell; fmt keeps every label two decimals wide.
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="YlGnBu", ax=ax)
ax.set_title("Pearson correlation between numeric features")
plt.show()

In [None]:
# --- Age features derived from the sale year ---------------------------------
cleaned_data['House Age'] = cleaned_data['Yr Sold'] - cleaned_data['Year Built']
cleaned_data['Since Remod'] = cleaned_data['Yr Sold'] - cleaned_data['Year Remod/Add']
# A negative value means the sale year precedes the build/remodel year, i.e. an error
# in our dataset, so those rows are dropped. To inspect them before dropping:
#   cleaned_data[cleaned_data['House Age'] < 0][['Yr Sold', 'Year Built', 'House Age']]
cleaned_data = cleaned_data[(cleaned_data['House Age'] >= 0) & (cleaned_data['Since Remod'] >= 0)].copy()
cleaned_data['Was Remod'] = (cleaned_data['Year Built'] != cleaned_data['Year Remod/Add']).astype(int)

# --- Garage features ----------------------------------------------------------
has_garage = cleaned_data['Garage Area'] > 0
cleaned_data['HAS Garage'] = has_garage.astype(int)
# The numeric imputation cell replaced missing garage years with 0. A non-positive
# year is therefore a placeholder, not a real year: using it directly would give a
# house that HAS a garage a 'Garage Age' of roughly 2000 years.
garage_year = cleaned_data['Garage Yr Blt'].where(cleaned_data['Garage Yr Blt'] > 0)
# Garage present but year unknown -> fall back to the house's construction year.
# No garage at all -> keep NaN here so both columns get the -1 sentinel below.
garage_year = garage_year.fillna(cleaned_data['Year Built']).where(has_garage)
cleaned_data['Garage Age'] = (cleaned_data['Yr Sold'] - garage_year).fillna(-1)
cleaned_data['Garage Yr Blt'] = garage_year.fillna(-1)

# --- Distribution plots for every numeric column ------------------------------
# 1. Grab all numerical columns from cleaned_data
num_features = cleaned_data.select_dtypes(include='number').columns

# 2. Melt the dataframe to long format (one row per column/value pair) for FacetGrid
melted = pd.melt(cleaned_data, value_vars=sorted(num_features))

# 3. One histogram (with KDE overlay) per variable, each with its own axis ranges
g = sns.FacetGrid(melted, col='variable', col_wrap=4, sharex=False, sharey=False, height=3)
g.map(sns.histplot, 'value', kde=True, bins=30)

plt.tight_layout()
plt.show()

In [None]:
# Log-transform the skewed columns in place: x -> log(x + 1).
# 'Bsmt Unf SF' is intentionally left on its original scale; it only gets a
# presence flag below.
log_columns = ['SalePrice', 'Lot Frontage', 'Lot Area', '1st Flr SF', 'Gr Liv Area', 'Total Bsmt SF']
for log_col in log_columns:
    cleaned_data[log_col] = np.log(cleaned_data[log_col] + 1)

# MS SubClass is a categorical feature stored as numbers, so convert it to text
# and let the one-hot step below encode it.
cleaned_data['MS SubClass'] = cleaned_data['MS SubClass'].astype(str)

# Presence flags: 1 when the source column is greater than 0, else 0.
# Mapping is {new flag column: source column}; the dict order fixes the column order.
presence_flags = {
    'HAS Bsmt Unf': 'Bsmt Unf SF',
    'HAS Enclosed Porch': 'Enclosed Porch',
    'HAS 3Ssn Porch': '3Ssn Porch',
    'HAS Screen Porch': 'Screen Porch',
    'HAS Open Porch SF': 'Open Porch SF',
    'HAS Wood Deck SF': 'Wood Deck SF',
    'HAS Pool Area': 'Pool Area',
    'HAS Misc Val': 'Misc Val',
    'HAS Mas Vnr': 'Mas Vnr Area',
    'HAS BsmtFin 1': 'BsmtFin SF 1',
    'HAS BsmtFin 2': 'BsmtFin SF 2',
    'HAS 2nd Flr': '2nd Flr SF',
    'HAS Low Qual Fin': 'Low Qual Fin SF',
    'HAS Bsmt Full Bath': 'Bsmt Full Bath',
    'HAS Bsmt Half Bath': 'Bsmt Half Bath',
}
for flag_name, source_col in presence_flags.items():
    cleaned_data[flag_name] = (cleaned_data[source_col] > 0).astype(int)

# One-hot encode every remaining text column; drop_first removes one level per
# feature so the dummy columns are not perfectly collinear.
categorical_cols = cleaned_data.select_dtypes(include='object').columns
cleaned_data = pd.get_dummies(cleaned_data, columns=categorical_cols, drop_first=True)

In [None]:
# Histogram of every numeric column in the current state of cleaned_data.
# Numeric column labels (also reused by a later cell through `num_features`).
num_features = cleaned_data.select_dtypes(include='number').columns

# Long format: one row per (variable, value) pair, variables in alphabetical order.
melted = cleaned_data.melt(value_vars=sorted(num_features))

# Four panels per row, independent axes per panel, 30-bin histogram with KDE overlay.
g = sns.FacetGrid(data=melted, col='variable', col_wrap=4, sharex=False, sharey=False, height=3)
g.map(sns.histplot, 'value', kde=True, bins=30)

plt.tight_layout()
plt.show()

In [None]:
# Show every column when a wide frame is displayed (this option applies kernel-wide).
pd.set_option('display.max_columns', None)

# Persist the preprocessed frame; the index is omitted from the file.
cleaned_data.to_csv("preprocessed_data.csv", index=False)

# Skewness of each numeric feature
skew_values = cleaned_data.select_dtypes(include='number').skew()

# Order by absolute skew so strongly negative skews rank next to strongly positive
# ones instead of sinking to the bottom of the list; the sign is kept in the output.
skew_sorted = skew_values.reindex(skew_values.abs().sort_values(ascending=False).index)

print(skew_sorted)

# Preview the frame. This must be the cell's last expression, otherwise nothing is displayed.
cleaned_data.head()

In [None]:
from sklearn.model_selection import train_test_split

# Scatter plot with a fitted regression line for every numeric feature against SalePrice.
# The feature list is recomputed here so the cell does not rely on a variable left
# behind by an earlier cell. SalePrice is excluded because it is already the id
# column; listing a column in both id_vars and value_vars is ambiguous for melt.
numeric_predictors = sorted(
    c for c in cleaned_data.select_dtypes(include='number').columns if c != 'SalePrice'
)
f = pd.melt(cleaned_data, id_vars=['SalePrice'], value_vars=numeric_predictors)
g = sns.FacetGrid(f, col='variable', col_wrap=4, sharex=False, sharey=False)
g = g.map(sns.regplot, 'value', 'SalePrice', scatter_kws={'alpha': 0.3})
# Rotate tick labels so long numbers do not overlap.
for ax in g.axes.flat:
    plt.setp(ax.get_xticklabels(), rotation=60)
g.fig.tight_layout()
plt.show()

# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X = cleaned_data.drop(columns=['SalePrice'])
y = cleaned_data['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Baseline model: ordinary least-squares linear regression from scikit-learn.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score  # evaluation metrics

# 1. Train: fit the regression on the training split.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# 2. Predict on the held-out rows.
y_pred = lr_model.predict(X_test)

# 3. Evaluate. The target was transformed to log(SalePrice + 1) before the split,
#    so these two metrics are on the log scale (smaller RMSE is better).
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("RMSE:", rmse)
print("R²:", r2)

# The same error expressed in dollars: undo the log(x + 1) transform first.
rmse_dollars = np.sqrt(np.mean((np.expm1(y_test) - np.expm1(y_pred)) ** 2))
print(f"RMSE (dollars): {rmse_dollars:,.0f}")

# Actual vs. predicted. Both axes hold log-transformed prices, so label them as such.
plt.scatter(y_test, y_pred, alpha=0.5)
plt.xlabel("Actual log(SalePrice + 1)")
plt.ylabel("Predicted log(SalePrice + 1)")
plt.title("Actual vs. Predicted log(SalePrice + 1)")
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linestyle='--')  # perfect prediction line
plt.show()

In [None]:
import joblib

# Write the fitted regression to disk.
joblib.dump(lr_model, filename="linear_model.pkl")

# Write the training column names, in order, so a prediction row can be built
# with exactly the layout the model was fitted on.
feature_names = list(X_train.columns)
joblib.dump(feature_names, filename="model_features.pkl")

In [None]:
import joblib

# Read back the fitted model and the ordered list of feature names it expects.
# joblib.load unpickles its input, so only use it on files you trust.
model, columns = (
    joblib.load(artifact_path)
    for artifact_path in ("linear_model.pkl", "model_features.pkl")
)

In [None]:
# Manually chosen feature values, on their natural scale (square feet, counts, years).
custom_values = {
    'Gr Liv Area': 1500,
    'Garage Cars': 2,
    'Overall Qual': 6,
    'Year Built': 2003,
}

# Features the model saw as log(x + 1) during training. Keep this in sync with the
# preprocessing cell; raw values must get the same transform before prediction.
LOG1P_FEATURES = {'Lot Frontage', 'Lot Area', '1st Flr SF', 'Gr Liv Area', 'Total Bsmt SF'}

# Report keys that are not model features; they are skipped, but no longer silently.
unknown_keys = [key for key in custom_values if key not in columns]
if unknown_keys:
    print("Ignoring values for features the model was not trained on:", unknown_keys)

# Start from an all-zero float row, then overlay the custom values.
# NOTE(review): 0 is not a realistic default for most features (e.g. 'Yr Sold',
# 'Lot Area'), so treat the result as a smoke test rather than a real estimate.
feature_row = {name: 0.0 for name in columns}
for key, val in custom_values.items():
    if key in feature_row:
        feature_row[key] = float(np.log1p(val)) if key in LOG1P_FEATURES else float(val)
input_data = pd.DataFrame([feature_row], columns=columns)

# Predict, then invert the target transform: the model was trained on
# log(SalePrice + 1), whose inverse is exp(x) - 1.
log_price = model.predict(input_data)[0]
price = np.expm1(log_price)
print(f"Predicted Sale Price: ${price:,.0f}")

In [None]:
# Show the model output before it is converted back from the log scale.
print("Raw log prediction:", log_price)

In [None]:
# Sanity checks: the prediction row must be one row with exactly the model's
# features, in the same order.
print("Input shape:", input_data.shape)
print("Expected shape:", (1, len(columns)))
# Compare as plain lists: an element-wise Index comparison raises when the lengths
# differ, which is the very situation this check is meant to report.
print("All columns match:", list(input_data.columns) == list(columns))