#### Utility Method
This method flattens the correlation matrix of the numeric columns and returns the pairs of distinct features sorted by absolute correlation, strongest first.

In [25]:
def get_correlation(df):
    """Rank every pair of distinct numeric columns by absolute correlation.

    Args:
        df: DataFrame whose numeric columns are correlated with one another;
            non-numeric columns are ignored.

    Returns:
        A Series indexed by a two-level (column, column) index that holds the
        absolute correlation of each pair, largest first. Every pair shows up
        twice, once per ordering, because the full symmetric matrix is
        flattened and only the diagonal is removed.
    """
    numeric_only = df.select_dtypes(include=['number'])
    flattened = numeric_only.corr().unstack()

    # Keep only the entries whose two index levels name different columns;
    # this discards the diagonal (a column correlated with itself).
    first = flattened.index.get_level_values(0)
    second = flattened.index.get_level_values(1)
    off_diagonal = flattened[first != second]

    magnitudes = off_diagonal.abs()
    return magnitudes.sort_values(ascending=False)

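This method uses GridSearchCV (5-fold cross-validation) to train a model and tune its hyperparameters. No `scoring` argument is passed, so the search optimizes the estimator's default score rather than an explicit mean-squared-error scorer. It prints a short training report and returns the fitted GridSearchCV object together with the training duration in seconds.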

In [62]:
def train_and_validate_model(pipe, params, X_train, X_test, y_train, y_test):
    """Tune ``pipe`` with a 5-fold grid search, then score it on the test split.

    A short report (best regressor, training time, best parameters, best
    cross-validation score and test-set score) is printed along the way.

    Args:
        pipe: Pipeline to tune. It must contain a step named ``'regressor'``,
            because the report looks that step up by name.
        params: Parameter grid handed to ``GridSearchCV`` as ``param_grid``.
        X_train: Features the grid search is fitted on.
        X_test: Held-out features used for the final score.
        y_train: Targets the grid search is fitted on.
        y_test: Held-out targets used for the final score.

    Returns:
        A ``(grid_search, duration)`` tuple: the fitted ``GridSearchCV`` object
        and the time spent in ``fit``, in seconds.
    """
    # NOTE(review): no ``scoring`` is passed, so the figures printed as
    # "Accuracy" below come from the estimator's default scorer (presumably
    # R^2 for a regressor -- confirm), not from a mean-squared-error scorer.
    grid_search = GridSearchCV(pipe, param_grid=params, cv=5)

    # perf_counter is a monotonic, high-resolution clock, so the measured
    # duration cannot be skewed by system clock adjustments.
    start_time = time.perf_counter()
    grid_search.fit(X_train, y_train)
    duration = time.perf_counter() - start_time

    # Look the step up outside the f-string: reusing the enclosing quote
    # character inside a replacement field is a SyntaxError before Python 3.12.
    regressor = grid_search.best_estimator_.named_steps['regressor']
    print(f'## Regressor: {regressor} Training Report ##')
    print("Training time: ", duration, "seconds")
    print("Best Parameters: ", grid_search.best_params_)
    print("Best training Accuracy:", grid_search.best_score_)

    # Test the tuned model on the held-out set
    test_score = grid_search.score(X_test, y_test)
    print("Test Set Accuracy:", test_score)

    return grid_search, duration

This method takes already-split training and test data, trains a set of models (linear regression, Ridge regression, and Lasso regression), and evaluates the accuracy and performance of these models.

In [27]:
def validate_models(X_train, X_test, y_train, y_test):
    """Grid-search linear, Ridge and Lasso regressors and collect the CV results.

    Each model is wrapped in a ``PolynomialFeatures`` -> regressor pipeline and
    tuned with a 5-fold ``GridSearchCV``. The scorer is mean squared error with
    ``greater_is_better=False``, i.e. the reported scores are negated MSE, so
    values closer to zero are better.

    Args:
        X_train: Training features.
        X_test: Unused by this function; kept for interface compatibility.
        y_train: Training targets; flattened to one dimension before fitting.
        y_test: Unused by this function; kept for interface compatibility.

    Returns:
        A DataFrame that concatenates the ``cv_results_`` table of every grid
        search, with an added ``model`` column naming the regressor. The
        columns most useful for comparison are ``model``,
        ``param_poly__degree``, ``param_regressor__fit_intercept``,
        ``param_regressor__alpha``, ``mean_test_score``, ``std_test_score``
        and ``mean_train_score``. ``param_regressor__alpha`` is missing (NaN)
        for ``LinearRegression`` rows, because that grid has no ``alpha``.
    """
    import numpy as np

    # Define MSE scorer (negated, so that "greater is better" holds for the search)
    mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

    # Define models to compare
    models = {
        'LinearRegression': LinearRegression(),
        'Ridge': Ridge(),
        'Lasso': Lasso(),
    }

    # Define parameter grids for each model
    param_grids = {
        'LinearRegression': {
            'poly__degree': [1, 2],
            'regressor__fit_intercept': [True, False],
        },
        'Ridge': {
            'poly__degree': [1, 2],
            'regressor__alpha': [0.1, 1.0, 10.0],
            'regressor__fit_intercept': [True, False],
        },
        'Lasso': {
            'poly__degree': [1, 2],
            'regressor__alpha': [0.1, 0.5, 1.0],
            'regressor__fit_intercept': [True, False],
        },
    }

    # np.ravel accepts NumPy arrays, pandas Series and single-column
    # DataFrames alike, whereas calling ``.ravel()`` on the input only works
    # for objects that still provide that method.
    y_train_flat = np.ravel(y_train)

    # Loop over models and parameter grids
    results = []
    for model_name, model in models.items():
        print(f"\nRunning GridSearchCV for {model_name}...")

        # Pipeline: polynomial feature expansion followed by the regressor.
        # NOTE(review): no scaling step is applied here; add one explicitly if
        # standardized features are intended.
        pipeline = Pipeline([
            ('poly', PolynomialFeatures()),
            ('regressor', model),
        ])

        grid_search = GridSearchCV(
            pipeline,
            param_grids[model_name],
            cv=5,
            scoring=mse_scorer,
            return_train_score=True,
        )
        grid_search.fit(X_train, y_train_flat)

        # Store results, tagged with the model they belong to
        cv_results = pd.DataFrame(grid_search.cv_results_)
        cv_results['model'] = model_name
        results.append(cv_results)

    # Concatenate all results for comparison
    return pd.concat(results, ignore_index=True)