https://www.youtube.com/watch?v=Wqmtf9SA_kk

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the dataset from the CSV file (path is relative to the notebook's working directory)
data = pd.read_csv("nybolig_data.csv")


In [None]:
# Show the loaded DataFrame as the cell output
data

In [None]:
# Print column names, dtypes and non-null counts to spot missing values
data.info()

In [None]:
from sklearn.model_selection import train_test_split

# 'price' is the regression target; every other column is kept as a feature
y = data['price']
X = data.drop(columns = ['price'])

In [None]:
# Hold out 20% of the rows for testing. The fixed random_state makes the split
# (and therefore every statistic, plot and score computed below) reproducible
# between notebook runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
# Re-attach the target column to the training features (joined on the index)
# so features and price can be explored together
train_data = X_train.join(y_train)

In [None]:
# Histogram of every numerical column of the training data
train_data.hist(figsize = (15, 8))

In [None]:
# Correlation matrix of the numerical columns only (non-numerical columns
# cannot be correlated), drawn as an annotated heatmap
numeric_corr = train_data.select_dtypes(include = np.number).corr()
plt.figure(figsize = (15, 8))
sns.heatmap(numeric_corr, annot = True, cmap = "YlGnBu")

In [None]:
# Skewness of each numerical column of the training data
# (values far from 0 indicate an asymmetric distribution)
train_data.select_dtypes(include = np.number).skew()

In [None]:
from scipy.stats import boxcox

# Compare several candidate transformations of postal_code side by side.
# Each column of transformed_test holds one transformation, so the histograms
# (here) and the skew values (next cell) can be compared directly.
postal_code = train_data['postal_code']

transformed_test = pd.DataFrame()
transformed_test['original'] = postal_code
transformed_test['sqrt'] = np.sqrt(postal_code)
transformed_test['log'] = np.log(postal_code)
# boxcox returns the transformed values and the fitted lambda; only the values are kept
transformed_test['boxcox'], _ = boxcox(postal_code)
# This is the square of the value, not an exponential, so it is labelled accordingly
transformed_test['square'] = postal_code ** 2
transformed_test['reciprocal'] = 1 / postal_code
# log(1 / x) equals -log(x), so this column mirrors the 'log' column
transformed_test['log_on_reciprocal'] = np.log(1 / postal_code)

transformed_test.hist(figsize = (15, 8))

In [None]:
# Skewness of each candidate transformation of postal_code, one value per column
transformed_test.skew()

In [None]:
# Apply the chosen transformations to the exploration frame and re-plot the histograms:
# postal_code is shifted by 1 and squared; rooms and size are log-transformed
# (the +1 keeps the logarithm defined for zero values).
# NOTE(review): these transformations only modify train_data. The regression cells
# further down are fed from X_train / X_test via encodeData, so as far as this
# file shows the transformed columns never reach the models - confirm whether
# that is intended.
# NOTE(review): the comparison cell above squares postal_code without the +1 shift
# that is used here - confirm which variant is wanted.
train_data['postal_code'] = (train_data['postal_code'] + 1) ** 2
train_data['rooms'] = np.log(train_data['rooms'] + 1)
train_data['size'] = np.log(train_data['size'] + 1)
train_data.hist(figsize = (15, 8))

In [None]:
# Number of training rows per house type
train_data['type'].value_counts()

# Regression 

Before performing regression, we encode labels for the "Type of house" and the "Energy Label". 


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import warnings
from sklearn import linear_model
from sklearn.model_selection import cross_val_score

# Silence FutureWarning messages so they do not clutter the cell output
warnings.simplefilter(action='ignore', category=FutureWarning)
# Print NumPy floats in fixed-point notation instead of scientific notation
np.set_printoptions(suppress=True)

In [None]:
"""
The house type and the energy_label are categorical variables,
and we need to encode them as numerical values.
We can use the LabelEncoder from sklearn to do this.
"""
def encodeData(x_data: pd.DataFrame, scaling: bool, fitted: "dict | None" = None) -> pd.DataFrame:
    """
    Turn a raw feature frame into a purely numerical one.

    Steps:
      1. drop the 'address' and 'url' columns,
      2. label-encode the categorical columns 'type' and 'energy_label',
      3. fill a missing 'year_rebuilt' with 'year_built' and a missing
         'basement_size' with 0,
      4. verify that no NaN values are left,
      5. optionally standardise all columns with a StandardScaler.

    The input frame is never modified; a new frame is returned.

    Parameters
    ----------
    x_data : pd.DataFrame
        Raw features. Must contain the columns 'type', 'energy_label',
        'address', 'url', 'year_built', 'year_rebuilt' and 'basement_size'.
    scaling : bool
        If True, standardise every column of the result.
    fitted : dict, optional
        Container used to share fitted transformers between calls. Pass the
        same (initially empty) dict first with the training data and then
        with the test data: the first call stores the fitted LabelEncoders
        and StandardScaler in it, later calls reuse them instead of
        re-fitting. This keeps category codes and scaling identical for
        both sets and avoids fitting anything on test data. If omitted,
        fresh transformers are fitted on x_data.

    Returns
    -------
    pd.DataFrame
        Encoded (and optionally scaled) features, carrying the same index
        as x_data.

    Raises
    ------
    ValueError
        If any row still contains NaN after the fill steps.
    """
    state = {} if fitted is None else fitted

    # Dropping returns a new frame; the explicit copy guarantees that the
    # column assignments below never write through to the caller's data.
    x_data = x_data.drop(columns=['address', 'url']).copy()

    # Encode the type and energy_label to numbers
    for column in ('type', 'energy_label'):
        encoder = state.get(column)
        if encoder is None:
            encoder = LabelEncoder()
            x_data[column] = encoder.fit_transform(x_data[column])
            state[column] = encoder
        else:
            # Reuse the codes learned earlier. A category that was not seen
            # while fitting has no code and is mapped to -1.
            codes = pd.Index(encoder.classes_).get_indexer(x_data[column])
            unseen = codes == -1
            if unseen.any():
                print("Column", column, "contains categories unseen during fitting:",
                      sorted(set(x_data.loc[unseen, column].astype(str))))
            x_data[column] = codes

    # If year_rebuilt is missing, set year_rebuilt = year_built
    x_data['year_rebuilt'] = x_data['year_rebuilt'].fillna(x_data['year_built']).astype(int)
    x_data['basement_size'] = x_data['basement_size'].fillna(0)

    # Check for any NaN values (vectorised) and report the first offending row
    rows_with_nan = x_data.isna().any(axis=1).to_numpy()
    if rows_with_nan.any():
        position = rows_with_nan.argmax()
        index = x_data.index[position]
        print("Row", index, "contains NaN values:")
        print(x_data.iloc[position])
        raise ValueError("NaN values in row", index)

    # Scale the dataset
    if scaling:
        scaler = state.get('scaler')
        if scaler is None:
            scaler = StandardScaler().fit(x_data)
            state['scaler'] = scaler
        # Keep the original index so the features stay aligned with the target
        x_data = pd.DataFrame(scaler.transform(x_data),
                              columns=x_data.columns,
                              index=x_data.index)
    return x_data


# Fit the encoders and the scaler on the training data only, then reuse them
# for the test data so that both sets share the same codes and scaling.
fitted_transformers = {}
encoded_X_train = encodeData(X_train, scaling=True, fitted=fitted_transformers)
encoded_X_test = encodeData(X_test, scaling=True, fitted=fitted_transformers)
display(encoded_X_train)

## Lasso and Ridge Regression 

In [None]:
def regression(model, x_train, y_train, x_test, y_test):
    """
    Fit a linear model, report its scores and coefficients, and plot its predictions.

    Prints the 5-fold cross-validation scores on the training data, the score
    on the test data, one coefficient per feature and the intercept. Then
    shows a scatter plot of predicted against true test values together with
    the line on which perfect predictions would lie.

    Parameters
    ----------
    model
        Estimator exposing fit, score, predict, coef_ and intercept_
        (for example linear_model.Lasso or linear_model.Ridge).
    x_train : pd.DataFrame
        Training features; the column names label the printed coefficients.
    y_train
        Training target values.
    x_test
        Test features.
    y_test
        Test target values; must provide min() and max().

    Returns
    -------
    None
    """
    model.fit(x_train, y_train)
    score = cross_val_score(model, x_train, y_train, cv = 5)
    print("Scores", score)
    print("Test score: ", model.score(x_test, y_test))
    print("\nCoefficients: ")
    for feature, coef in zip(x_train.columns, model.coef_):
        print(f"{feature}: {coef}")
    print("\nIntercept: ", model.intercept_)

    # Predicting the test set results
    y_pred = model.predict(x_test)

    # Labels are attached to the artists themselves, so the legend cannot mix
    # them up regardless of the order in which matplotlib collects its handles.
    plt.scatter(y_test, y_pred, label = 'Test values')
    plt.xlabel('True values')
    plt.ylabel('Predictions')
    # Perfect fit line: prediction equals true value over the observed range
    lowest, highest = y_test.min(), y_test.max()
    plt.plot([lowest, highest], [lowest, highest], c = 'r', label = 'Perfect fit')
    plt.legend()
    plt.title('True values vs Predictions')
    plt.show()

# Regularised linear models to evaluate. An unregularised baseline can be
# added to the tuple below as linear_model.LinearRegression().
lasso_model = linear_model.Lasso(alpha = 10)
ridge_model = linear_model.Ridge(alpha = 10)

# Fit, score and plot each model on the same encoded train/test split
for estimator in (lasso_model, ridge_model):
    regression(estimator, encoded_X_train, y_train, encoded_X_test, y_test)

For these results, we have the following (described for Lasso; the Ridge output has the same structure):
  1. Lasso Scores: These are cross-validation scores obtained using 5-fold cross-validation. They represent the R-squared values achieved by the Lasso model on different folds of the training data. Each score corresponds to one fold.
  2. Lasso test score: This is the R-squared score of the model on the held-out test set.
  3. Lasso coefficients: These are the weights assigned to each feature by the Lasso model. 
  4. Lasso intercept: This is the bias term of the model.