In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Helper function
def findMinMax(data):
  """Return the (lower, upper) outlier fences for `data`.

  The fences are placed 1.5 interquartile ranges below the first quartile
  and 1.5 interquartile ranges above the third quartile. `data` must
  provide a `quantile` method (e.g. a pandas Series).
  """
  first_quartile, third_quartile = data.quantile(0.25), data.quantile(0.75)
  # Distance from each quartile to its fence: 1.5 * IQR.
  whisker = 1.5 * (third_quartile - first_quartile)
  return first_quartile - whisker, third_quartile + whisker

# Function to help us visualize the outliers. Produce boxplot and print out outliers
def getOutliers(data, features):
  """Plot a boxplot per feature and print the values flagged as outliers.

  For each name in `features`, a boxplot of that column of `data` is drawn
  in its own panel of a single-row subplot grid. The column's fences are
  obtained from `findMinMax`; values strictly below the lower fence or
  strictly above the upper fence are printed (nothing is printed for a
  side that has no outliers). The figure is shown once all features have
  been processed. Nothing is returned.
  """
  for position, feature in enumerate(features, 1):
    # One row of panels, one panel per feature (subplot positions are 1-based).
    plt.subplot(1, len(features), position)
    data[[feature]].boxplot()

    column = data[feature]
    lower_fence, upper_fence = findMinMax(column)
    below = column < lower_fence
    above = column > upper_fence

    if below.any():
      print(feature, "- Lower outliers:\n", data.loc[below, feature])
    if above.any():
      print(feature, "- Upper outliers:\n", data.loc[above, feature])

  plt.show()
  

# Return a new set of data with outliers removed
def removeOutliers(data, features):
  """Return the rows of `data` that are not outliers in any of `features`.

  A row is dropped when, for at least one listed feature, its value lies
  strictly below the lower fence or strictly above the upper fence
  returned by `findMinMax` for that column. `data` itself is not modified.

  Parameters:
    data: pandas DataFrame containing every column named in `features`.
    features: iterable of column names to check for outliers.

  Returns:
    The subset of `data` (original index labels preserved) with the
    outlier rows removed.
  """
  # Build the mask on data's own index so the ORs below and the final .loc
  # selection line up row-for-row even when the index is not 0..n-1
  # (for example, a frame that has already been filtered once).
  removeIdx = pd.Series(False, index=data.index, dtype=bool)
  for feature in features:
    lower, upper = findMinMax(data[feature])
    is_outlier = (data[feature] < lower) | (data[feature] > upper)
    removeIdx = removeIdx | is_outlier

  # Select from the frame that was passed in, not from a module-level name.
  return data.loc[~removeIdx]
  


# Testing code
# read_csv already returns a DataFrame, so no extra DataFrame() wrapper is needed.
data = pd.read_csv('./SeoulBikeData.csv')
features = ['Rented Bike Count', 'Wind speed (m/s)']

# Boxplots and outlier listing for the data as loaded.
getOutliers(data, features)

# Filter with removeOutliers, then repeat the plots/listing on the result.
newData = removeOutliers(data, features)
getOutliers(newData, features)

In [None]:
from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

# Note: X must be modified as appropriate for each type of regression before calling, e.g. by adding a column of 1's or any other parameters as necessary
def perform_training(X, y, folds, test_size=0.25, model_type=None):
    """Fit one model per K-fold split and return the one with the lowest held-out MSE.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample. Any extra columns the chosen
        regression needs (a column of ones, expanded features, ...) must
        already be present. Rows are selected by position: via ``.iloc``
        when the object has it (pandas), otherwise via ``X[indices]``.
    y : array-like
        Targets, selected by position the same way as ``X``.
    folds : int
        Number of splits passed to ``KFold``.
    test_size : float, default 0.25
        Not used by this function; kept in the signature so existing
        call sites keep working.
    model_type : callable, optional
        Zero-argument factory returning an unfitted estimator that provides
        ``fit`` and ``predict``. Defaults to ``LinearRegression``.

    Returns
    -------
    mse : float
        Lowest mean squared error observed on a fold's held-out part.
    best_model : estimator
        The fitted model that achieved ``mse``.
    """
    if model_type is None:
        model_type = LinearRegression

    def _rows(values, indices):
        # KFold yields positional indices; pandas objects need .iloc for
        # that, while plain `values[indices]` handles NumPy-style inputs.
        if hasattr(values, "iloc"):
            return values.iloc[indices]
        return values[indices]

    kf = KFold(folds)

    # Per-fold results, kept in parallel so the best model can be looked up.
    mses = []
    models = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = _rows(X, train_index), _rows(X, test_index)
        y_train, y_test = _rows(y, train_index), _rows(y, test_index)

        # Fresh estimator for every fold so fits do not influence each other.
        model = model_type()
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        # scikit-learn's metric signature is (y_true, y_pred).
        mses.append(mean_squared_error(y_test, y_pred))
        models.append(model)

    # Pick the fold whose model had the smallest held-out error.
    min_error_index = int(np.argmin(mses))
    return mses[min_error_index], models[min_error_index]

# Testing Code


In [None]:
# Cell 2 Here

In [None]:
# Cell 3 Here