In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error,make_scorer,r2_score,explained_variance_score
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor 

In [None]:
# Read the housing dataset from its CSV file and preview the first ten rows.
housing_csv_path = './datasets/regression/housing.csv'
data = pd.read_csv(housing_csv_path)

first_rows = data.head(10)
print(first_rows)

In [None]:
# Print summary statistics of the loaded frame as plain text.
summary_statistics = data.describe()
print(summary_statistics)

In [None]:
# Count how often each 'ocean_proximity' value occurs and draw the counts as bars.
op_count = data.loc[:, 'ocean_proximity'].value_counts()

plt.figure(figsize=(10, 5))
sns.barplot(
    x=op_count.index,
    y=op_count.values,
    alpha=0.7,
)
plt.xlabel('Ocean Proximity', fontsize=12)
plt.ylabel('Number of Occurrences', fontsize=12)
plt.title('Ocean Proximity Summary')
plt.show()

In [None]:
# Target column and the feature columns used to predict it.
target = "median_house_value"
features = list(data.columns.drop(target))

# Cast boolean features to int BEFORE detecting dtypes. Doing the cast on
# `data` after `feature_df` was already sliced left the booleans out of every
# column list, so the ColumnTransformer silently dropped them.
bool_cols = data[features].select_dtypes(include=['bool']).columns.tolist()
if bool_cols:
    data[bool_cols] = data[bool_cols].astype(int)
feature_df = data[features]

# Frame that the final comparison cell passes to the best model's predict().
prediction_df = data

# Split the features by dtype. 'number' covers every integer/float width,
# whereas 'int'/'float' only match the platform-default widths.
numerical_cols = feature_df.select_dtypes(include=['number']).columns.tolist()
categorical_cols = feature_df.select_dtypes(include=['object']).columns.tolist()
text_cols = feature_df.select_dtypes(include=['string']).columns.tolist()

# Cardinality is estimated on at most 10,000 rows; a column with more distinct
# values than the threshold is treated as free text rather than a category.
sample_size = int(np.min([10000, data.shape[0]]))
unique_theshold = np.min([100, sample_size / 10])

# Draw one seeded sample so the column typing is the same on every run.
cardinality_sample = data.sample(sample_size, random_state=42)

# Move high-cardinality categorical columns to the text columns.
# Iterate over a copy: removing from a list while looping over it skips the
# element that follows each removal.
for col in list(categorical_cols):
    if cardinality_sample[col].nunique() > unique_theshold:
        text_cols.append(col)
        categorical_cols.remove(col)

# Move low-cardinality text columns to the categorical columns (copy for the
# same reason as above).
for col in list(text_cols):
    if cardinality_sample[col].nunique() < unique_theshold:
        categorical_cols.append(col)
        text_cols.remove(col)

print(numerical_cols)
print(categorical_cols)
print(text_cols)

# Numeric columns: fill missing values with the median, then scale with MinMaxScaler.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", MinMaxScaler()),
    ]
)

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode; categories unseen during fit are ignored at predict time.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Text columns: TF-IDF vectorisation.
text_transformer = Pipeline(
    steps=[
        ('text', TfidfVectorizer())
    ]
)

# Combine the per-type pipelines. Each text column gets its own entry and is
# passed as a single column name (not a list) to the TfidfVectorizer pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
        *[(f'text_{t_col}', text_transformer, t_col) for t_col in text_cols],
    ]
)


In [10]:
# Separate the target from the features, then hold out 20% of the rows for
# evaluation; the fixed seed keeps the split repeatable.
y = data[target]
X = data[features]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Accumulates one metrics dataframe per fitted model for the final comparison.
model_comparison_list = []

##### End of Data Processing Pipeline #####

In [None]:
# Hyperparameter grid; keys are prefixed with the pipeline step name.
# NOTE(review): np.arange(1, 15, 20) evaluates to [1] because the step exceeds
# the range, so every candidate is a single-tree forest - widen this range if
# that is not intended.
random_forest_regression_param_grid = {
    "random_forest_regression__n_estimators": np.arange(1, 15, 20),
    "random_forest_regression__max_depth": np.arange(5, 50, 10),
    "random_forest_regression__min_samples_leaf": np.arange(1, 50, 20),
}

# Create the pipeline: preprocessing followed by the regressor. The seed makes
# the fitted forest reproducible (same value as the train/test split seed).
random_forest_regression_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('random_forest_regression', RandomForestRegressor(random_state=42)),
])

# Create the grid search. GridSearchCV always maximises its score, so the error
# must be negated: scoring with a plain make_scorer(mean_squared_error) would
# pick (and refit) the candidate with the HIGHEST error.
random_forest_regression_grid_search = GridSearchCV(
    estimator=random_forest_regression_pipe,
    param_grid=random_forest_regression_param_grid,
    cv=5,
    scoring="neg_mean_squared_error",
    verbose=3,
)
random_forest_regression_grid_search.fit(X_train, y_train)


In [None]:
# Tabulate the cross-validation results recorded by the grid search.
random_forest_regression_search_results = pd.DataFrame(
    data=random_forest_regression_grid_search.cv_results_
)

# Keep a handle on the pipeline refit with the best-scoring parameters.
random_forest_regression_best_estimator = (
    random_forest_regression_grid_search.best_estimator_
)

print(random_forest_regression_search_results)

In [None]:
# Model metrics

# Generate predictions once and reuse them; the dataframe copy is kept under
# its original name for any code that expects it.
random_forest_regression_predictions = random_forest_regression_best_estimator.predict(X_test)
random_forest_regression_predictions_df = pd.DataFrame(random_forest_regression_predictions)

# Generate model metrics on the held-out split
random_forest_regression_r2_score = r2_score(y_test, random_forest_regression_predictions)
random_forest_regression_mean_squared_error = mean_squared_error(y_test, random_forest_regression_predictions)
random_forest_regression_explained_variance_score = explained_variance_score(y_test, random_forest_regression_predictions)

# Long-format table (model, metric, value) consumed by the model comparison
random_forest_regression_performance_metrics = pd.DataFrame(
    [
        ['random_forest_regression', 'r2_score', random_forest_regression_r2_score],
        ['random_forest_regression', 'mean_squared_error', random_forest_regression_mean_squared_error],
        ['random_forest_regression', 'explained_variance_score', random_forest_regression_explained_variance_score],
    ],
    columns=['model', 'metric', 'value'],
)

# Generate Actual vs Predicted Plot
random_forest_regression_actual_predicted_plot, random_forest_regression_actual_predicted_plot_ax = plt.subplots()
# The plot name is rebound to the scatter artist here (as before); the Figure
# remains reachable through the axes' `.figure` attribute.
random_forest_regression_actual_predicted_plot = random_forest_regression_actual_predicted_plot_ax.scatter(
    x=y_test, y=random_forest_regression_predictions, alpha=0.5
)
# Add diagonal line: points on it are perfect predictions
random_forest_regression_actual_predicted_plot_ax.plot(
    [y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', alpha=0.5
)
# Set axis labels and title
random_forest_regression_actual_predicted_plot_ax.set_xlabel('Actual')
random_forest_regression_actual_predicted_plot_ax.set_ylabel('Predicted')
random_forest_regression_actual_predicted_plot_ax.set_title('random_forest_regression Actual vs. Predicted')
plt.show(block=False)

In [None]:
# Generate Decile Lift Chart
# Bin edges are the 0th, 10th, ..., 100th percentiles of the PREDICTIONS.
# Eleven edges are required for ten deciles; stopping at the 90th percentile
# dropped the top decile from the chart.
random_forest_regression_deciles = np.percentile(random_forest_regression_predictions, np.arange(0, 101, 10))

# Positional array of the actual values so the boolean mask built from the
# predictions lines up row-for-row regardless of y_test's index.
random_forest_regression_actual_values = np.asarray(y_test)
random_forest_regression_last_bin = len(random_forest_regression_deciles) - 2

# Calculate the mean actual and predicted values for each decile
random_forest_regression_mean_actual = []
random_forest_regression_mean_predicted = []
for i in range(len(random_forest_regression_deciles) - 1):
    mask = random_forest_regression_predictions >= random_forest_regression_deciles[i]
    if i == random_forest_regression_last_bin:
        # Close the last bin on the right so the largest prediction is counted.
        mask &= random_forest_regression_predictions <= random_forest_regression_deciles[i + 1]
    else:
        mask &= random_forest_regression_predictions < random_forest_regression_deciles[i + 1]
    if not mask.any():
        # Tied predictions can make two edges equal and leave a bin empty;
        # the mean of an empty bin is NaN and int(NaN) would raise.
        continue
    random_forest_regression_mean_actual.append(int(np.mean(random_forest_regression_actual_values[mask])))
    random_forest_regression_mean_predicted.append(int(np.mean(random_forest_regression_predictions[mask])))

# Create a bar chart of the mean actual values with the mean predictions overlaid as a line
random_forest_regression_lift_plot, random_forest_regression_lift_plot_ax = plt.subplots()
random_forest_regression_lift_plot_ax.bar(np.arange(len(random_forest_regression_mean_actual)), random_forest_regression_mean_actual, label='Actual')
random_forest_regression_lift_plot_ax.plot(np.arange(len(random_forest_regression_mean_predicted)), random_forest_regression_mean_predicted, color='red', linewidth=2, label='Predicted')
random_forest_regression_lift_plot_ax.set_xlabel('Deciles')
random_forest_regression_lift_plot_ax.set_ylabel('Mean')
random_forest_regression_lift_plot_ax.set_title('random_forest_regression Decile Analysis Chart')
random_forest_regression_lift_plot_ax.legend()
plt.show(block=False)

In [None]:
# Register this model's metrics for the comparison below.
model_comparison_list.append(random_forest_regression_performance_metrics)
##### End of Model Pipeline for Random Forest Regression #####

##### Model Comparison #####

# Rank the models on R^2 only (higher is better). Filtering before sorting
# keeps the other metrics' values out of the ordering.
table = pd.concat(model_comparison_list)
table = table[table['metric'] == 'r2_score']
table = table.sort_values(by=['value'], ascending=False)
print(table)
best_model_name = table['model'].iloc[0]
print(f"The best model is {best_model_name} with {table['value'].iloc[0]} as {table['metric'].iloc[0]}")


# Predict using the best model. Each model cell stores its fitted pipeline in
# a variable named "<model>_best_estimator"; look it up by name in the notebook
# namespace instead of eval()-ing a constructed string.
# Note: prediction_df is the full `data` frame, so these are predictions for
# every row, not only for the held-out test split.
best_estimator_name = f"{best_model_name}_best_estimator"
test_predictions = globals()[best_estimator_name].predict(prediction_df)
print('Predictions from best model are stored in test_predictions')