In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Helper function: lower/upper outlier bounds (Q1 - 1.5*IQR, Q3 + 1.5*IQR)
def findMinMax(data):
  """Return the (lower, upper) outlier bounds of `data`.

  The bounds sit 1.5 interquartile ranges below the first quartile and
  above the third quartile.

  data: any object exposing `.quantile(q)` (the callers pass a DataFrame column).
  Returns a `(lower, upper)` tuple.
  """
  first_quartile, third_quartile = (data.quantile(q) for q in (0.25, 0.75))
  iqr = third_quartile - first_quartile
  lower_bound = first_quartile - 1.5 * iqr
  upper_bound = third_quartile + 1.5 * iqr
  return lower_bound, upper_bound

# Visualize the outliers: draw a boxplot for each feature and print the outlying values
def getOutliers(data, features):
  """Draw one boxplot per feature and print each feature's outliers.

  data: DataFrame holding the columns named in `features`.
  features: sequence of column names; one subplot is drawn per name, all
    laid out in a single row.

  Values below the lower bound or above the upper bound returned by
  findMinMax are printed (only when at least one such value exists).
  The figure is shown once all features have been plotted. Returns None.
  """
  panel_count = len(features)
  for position, feature in enumerate(features, start=1):
    # Select panel `position` of a 1 x panel_count grid, then draw into it.
    plt.subplot(1, panel_count, position)
    data[[feature]].boxplot()

    column = data[feature]
    lower_bound, upper_bound = findMinMax(column)
    reports = (
      ("- Lower outliers:\n", column < lower_bound),
      ("- Upper outliers:\n", column > upper_bound),
    )
    for label, mask in reports:
      if mask.any():
        print(feature, label, data.loc[mask, feature])

  plt.show()
  

# Return the data set with outliers removed
def removeOutliers(data, features):
  """Return the rows of `data` that are not outliers in any of `features`.

  data: DataFrame holding the columns named in `features`.
  features: iterable of column names to screen; a row is dropped when its
    value in at least one of these columns lies outside the bounds returned
    by findMinMax for that column.

  Returns the selected rows of `data` with their original index labels;
  `data` itself is not modified.
  """
  # Build the mask on data.index so the ORs below stay row-aligned even when
  # the frame's index is not 0..n-1 (e.g. after rows were filtered out).
  removeIdx = pd.Series(False, index=data.index, dtype=bool)
  for feature in features:
    lower_bound, upper_bound = findMinMax(data[feature])
    column = data[feature]
    removeIdx = removeIdx | (column < lower_bound) | (column > upper_bound)

  # Select from the argument, not from a module-level variable.
  return data.loc[~removeIdx]
  


# Testing code: exercise the outlier helpers on the raw CSV.
data = pd.DataFrame(pd.read_csv('./SeoulBikeData.csv'))
# NOTE(review): the preprocessing cell further down refers to this column as
# 'Wind speed' — confirm which header the CSV actually uses.
features = ['Rented Bike Count', 'Wind speed (m/s)']

# Boxplots and printed outliers before removal
getOutliers(data, features)

# Same report after dropping the outlying rows
newData = removeOutliers(data, features)
getOutliers(newData, features)

In [None]:
from sklearn.preprocessing import StandardScaler

# Preprocessing: filter rows, drop unused columns, one-hot encode, remove
# outliers, then standardize the numerical columns.
data = pd.DataFrame(pd.read_csv('./SeoulBikeData.csv'))

# Keep only rows recorded on a functioning day (no bikes are rented otherwise)
functioningDay = data['Functioning Day'] == 'Yes'
data = data.loc[functioningDay]

# Dropping some features:
# Date: can't be processed as-is, and we already have the holiday feature
# Dew point temperature: not relevant
# Functioning Day: already handled by the row filter above
data = data.drop(columns=['Date', 'Dew point temperature', 'Functioning Day'])


# One-hot encode the categorical features
# TODO: Hour should be categorical too. Not sure how to handle it yet
data = pd.get_dummies(data, columns=['Seasons'], dtype=int)
# drop_first=True drops the first Holiday level's column; the feature list
# further down expects only 'Holiday_No Holiday' to remain.
data = pd.get_dummies(data, columns=['Holiday'], dtype=int, drop_first=True)


getOutliers(data, ['Rented Bike Count', 'Temperature', 'Humidity', 'Wind speed', 'Visibility', 'Solar Radiation', 'Rainfall', 'Snowfall'])

# It seems like every rainy or snowy day is counted as an outlier because the weather is normal most of the time.
# Therefore, we are not going to remove outliers for Rainfall and Snowfall.

# There are a lot of outliers for Solar Radiation. We can test this out with our models. For now, outliers are not removed for this one.

# Around 150 outliers for Rented Bike Count and Wind speed. Remove outliers for now.

data = removeOutliers(data, ['Rented Bike Count', 'Temperature', 'Humidity', 'Wind speed', 'Visibility'])
# Reset to a 0..n-1 index: the frame rebuilt from the scaler output below gets
# a fresh default index, and pd.concat(axis=1) joins on index labels.
data = data.reset_index(drop=True)
print(data)


# Split the data into categorical and numerical sets; only the numerical set is standardized
categoricalFeatures = ['Hour', 'Seasons_Autumn', 'Seasons_Spring', 'Seasons_Summer', 'Seasons_Winter', 'Holiday_No Holiday']
numericalFeatures = ['Rented Bike Count', 'Temperature', 'Humidity', 'Wind speed', 'Visibility', 'Solar Radiation', 'Rainfall', 'Snowfall']
categoricalValues = data[categoricalFeatures]
# Everything that is not listed as categorical is treated as numerical.
# NOTE(review): this includes the target 'Rented Bike Count', so the target is
# standardized together with the features — confirm that is intended.
standardizedData = data.drop(columns=categoricalFeatures)

scaler = StandardScaler()
scaler.fit(standardizedData)
standardizedData = scaler.transform(standardizedData)
# The transformed values are wrapped in a new frame and the column names are
# re-attached by hand; this relies on the columns left after the drop above
# being in the same order as numericalFeatures.
standardizedData = pd.DataFrame(standardizedData)
standardizedData.columns = numericalFeatures

# Re-attach the untouched categorical columns next to the standardized ones
standardizedData = pd.concat([standardizedData, categoricalValues], axis=1)
print(standardizedData)

In [None]:
from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

# Note: X may need to be modified for the different types of regression, e.g. by adding a column of 1's or any other parameters as necessary
def _make_ridge_sgd(alpha):
    """Build the unfitted SGD regressor with an L2 (ridge) penalty of strength `alpha`."""
    # `loss` is left at its default, which is the squared error; the explicit
    # name changed between scikit-learn releases ('squared_loss' -> 'squared_error').
    return SGDRegressor(
        penalty='l2',
        alpha=alpha,
        max_iter=100,
        eta0=0.01,
        random_state=42,
    )


def perform_training(X, y, folds=5, test_size=0.25, degrees=[1, 2], alphas=[0.0001, 0.001, 0.01]):
    """Grid-search polynomial degree and L2 strength with K-fold cross-validation.

    For every (degree, alpha) pair the features are expanded with
    PolynomialFeatures, an SGD ridge regressor is trained on each training
    fold, and the mean squared error on the held-out folds is averaged. The
    pair with the lowest average MSE wins.

    Parameters
    ----------
    X : array-like or DataFrame, one row per sample. Features are expected to
        be standardized by the caller; no scaling is done here.
    y : array-like or Series of targets, one per row of X.
    folds : number of cross-validation folds (default 5).
    test_size : currently unused; kept so existing callers keep working.
    degrees : polynomial degrees to try.
    alphas : L2 regularization strengths to try.

    Returns
    -------
    (best_model, best_degree, best_alpha)
        best_model is refit on all rows using the winning configuration. It
        expects input expanded with PolynomialFeatures(degree=best_degree).

    Raises
    ------
    ValueError
        If no configuration yields a finite average MSE (for example when
        `degrees` or `alphas` is empty).
    """
    # Work on plain arrays so fold indices select rows by position, whatever
    # index labels a DataFrame / Series argument carries.
    X = np.asarray(X)
    y = np.asarray(y).ravel()

    # Define number of folds for cross-validation
    kf = KFold(folds)

    best_mse = np.inf
    best_degree = None
    best_alpha = None

    for degree in degrees:
        # Create polynomial features once per degree; shared by every alpha.
        X_poly = PolynomialFeatures(degree=degree).fit_transform(X)

        for alpha in alphas:
            # MSE of each fold for this (degree, alpha) configuration
            fold_mses = []

            for train_index, test_index in kf.split(X_poly):
                # Split the expanded features (not the raw X) for this fold
                X_train, X_test = X_poly[train_index], X_poly[test_index]
                y_train, y_test = y[train_index], y[test_index]

                # Stochastic gradient descent with L2, aka ridge regression
                model = _make_ridge_sgd(alpha)
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)

                fold_mses.append(mean_squared_error(y_test, y_pred))

            # Average MSE across the folds for the current degree and alpha
            avg_mse = np.mean(fold_mses)

            # Keep the configuration with the lowest cross-validated error
            if avg_mse < best_mse:
                best_mse = avg_mse
                best_degree = degree
                best_alpha = alpha

    if best_degree is None:
        raise ValueError(
            "no (degree, alpha) configuration produced a finite cross-validated MSE"
        )

    # Refit on every row so the returned model is not tied to whichever fold
    # happened to be trained last.
    X_best = PolynomialFeatures(degree=best_degree).fit_transform(X)
    best_model = _make_ridge_sgd(best_alpha).fit(X_best, y)

    print(f"Best Degree: {best_degree}")
    print(f"Best Alpha: {best_alpha}")
    print(f"Best Mean Squared Error: {best_mse}")

    return best_model, best_degree, best_alpha

# Testing Code



In [None]:
# Train on the output of the preprocessing cell.
# NOTE(review): assumes `standardizedData` (standardized numerical columns plus
# the categorical columns) is the intended input — confirm.
data = standardizedData

# Separate the target column from the features
X = data.drop('Rented Bike Count', axis=1)
y = data['Rented Bike Count']

# Pass the number of cross-validation folds explicitly.
best_model, best_degree, best_alpha = perform_training(X, y, folds=5)
