In [1]:
# import basic packages for data analysis
import numpy as np
import pandas as pd
import statsmodels.api as sm 
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge

In [2]:
# Load the cleaned dataset into a DataFrame.
# NOTE(review): '/path/' looks like a placeholder; point it at the actual CSV before running.
clean_set = pd.read_csv('/path/')

Use forward selection and backward elimination to determine which predictors to keep in the linear regression model.

In [3]:
def forward_selection(X_train, y_train, significance_level=0.05):
    """Greedy forward stepwise selection driven by OLS p-values.

    Starting from an empty model, each round fits one OLS regression (with an
    intercept) per remaining candidate and adds the candidate whose own
    coefficient has the smallest p-value, provided that p-value is below
    ``significance_level``. Selection stops as soon as no candidate qualifies
    or every column has been selected.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Candidate predictors, one column per feature.
    y_train : array-like
        Response values aligned with the rows of ``X_train``.
    significance_level : float, default 0.05
        A candidate's p-value must be strictly below this to be added.

    Returns
    -------
    tuple[list, int]
        The selected column names in the order they were added, and how many
        were selected.
    """
    candidates = X_train.columns.tolist()
    best_features = []

    while True:
        # Preserve the DataFrame's column order so that ties between equal
        # p-values are broken the same way on every run (a set difference
        # would make the order depend on string hashing).
        remaining_features = [col for col in candidates if col not in best_features]
        if not remaining_features:
            # Every column has been selected; nothing left to try.
            break

        # Explicit float dtype keeps min()/idxmin() on a numeric Series.
        new_pval = pd.Series(index=remaining_features, dtype=float)

        for new_column in remaining_features:
            # Fit the current model extended by exactly one candidate and
            # record the p-value of that candidate's coefficient.
            design = sm.add_constant(X_train[best_features + [new_column]])
            model = sm.OLS(y_train, design).fit()
            new_pval[new_column] = model.pvalues[new_column]

        min_p_value = new_pval.min()
        if min_p_value < significance_level:
            best_features.append(new_pval.idxmin())
        else:
            # Also reached when every p-value is NaN (the comparison is False).
            break

    return best_features, len(best_features)

In [4]:
def backward_elimination(X_train, y_train, significance_level=0.05):
    """Backward stepwise elimination driven by OLS p-values.

    Starting from the model with every column of ``X_train``, each round fits
    an OLS regression (with an intercept) and removes the feature with the
    largest p-value, as long as that p-value is at or above
    ``significance_level``. Elimination stops once every remaining feature is
    below the threshold or no features are left.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Candidate predictors, one column per feature.
    y_train : array-like
        Response values aligned with the rows of ``X_train``.
    significance_level : float, default 0.05
        A feature is dropped while its p-value is greater than or equal to
        this value.

    Returns
    -------
    tuple[list, int]
        The surviving column names in their original order, and how many
        survived.
    """
    features = X_train.columns.tolist()

    while len(features) > 0:
        design = sm.add_constant(X_train[features])
        fitted = sm.OLS(y_train, design).fit()

        # Select the feature p-values by label rather than slicing off the
        # first row: add_constant does not prepend an intercept when the data
        # already contains a constant column, in which case a positional
        # [1:] would silently discard a real feature's p-value.
        p_values = fitted.pvalues[features]
        max_p_value = p_values.max()

        if max_p_value >= significance_level:
            # Drop the least significant feature and refit.
            features.remove(p_values.idxmax())
        else:
            # Also reached when every p-value is NaN (the comparison is False).
            break

    return features, len(features)

In [5]:
# Separate the response (air temperature) from the predictors.
target = clean_set['airTemp']
features = clean_set.drop(columns=['airTemp'])  # 11 variables

# Hold out 20% of the rows for testing and train on the other 80%;
# the fixed random_state makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=141,
)

In [6]:
# Forward selection on the training split (default significance level of 0.05)
selected_features_forward, count_forward = forward_selection(X_train, y_train)
print("Selected features by forward selection:", selected_features_forward)
print("Number of features selected:", count_forward)

# Backward elimination on the same training split, for comparison
selected_features_backward, count_backward = backward_elimination(X_train, y_train)
print("Selected features by backward elimination:", selected_features_backward)
print("Number of features selected:", count_backward)

Selected features by forward selection: ['atmosPStatn', 'tPrec', 'date_time', 'airHum', 'long', 'lat', 'windSp', 'rad', 'prevHrMinTemp', 'dpTemp', 'windDir']
Number of features selected: 11
Selected features by backward elimination: ['tPrec', 'atmosPStatn', 'rad', 'dpTemp', 'prevHrMinTemp', 'airHum', 'windDir', 'windSp', 'lat', 'long', 'date_time']
Number of features selected: 11


Both forward selection and backward elimination retain all 11 variables at the 0.05 significance level, which suggests that each variable in our dataset contributes meaningfully to the prediction of air temperature.

In [7]:
# Function to train and evaluate a model
def train_and_evaluate(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train
        Training predictors and response, passed to ``model.fit``.
    X_test, y_test
        Held-out predictors (passed to ``model.predict``) and the true
        response values the predictions are scored against.

    Returns
    -------
    tuple
        ``(mse, mae, r2)``: mean squared error, mean absolute error and
        R-squared of the test-set predictions.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Every metric takes (truth, prediction); keep the order MSE, MAE, R2.
    scorers = (mean_squared_error, mean_absolute_error, r2_score)
    return tuple(scorer(y_test, predictions) for scorer in scorers)

In [8]:
# Ordinary least squares: the unregularised baseline
linear_model = LinearRegression()

# Ridge regression with regularisation strength alpha = 1.0
ridge_model = Ridge(alpha=1.0, random_state=141)

# Lasso regression with the same alpha and seed, for a like-for-like comparison
lasso_model = Lasso(alpha=1.0, random_state=141)

In [9]:
# Score all three models on the same train/test partition so the metrics are comparable.
eval_split = (X_train, X_test, y_train, y_test)

# Linear regression
linear_mse, linear_mae, linear_r2 = train_and_evaluate(linear_model, *eval_split)

# Ridge regression
ridge_mse, ridge_mae, ridge_r2 = train_and_evaluate(ridge_model, *eval_split)

# Lasso regression
lasso_mse, lasso_mae, lasso_r2 = train_and_evaluate(lasso_model, *eval_split)

In [10]:
# One report row per model; each row is printed exactly as a print() argument list.
for report_row in (
    ("Linear Regression - MSE:", linear_mse, "MAE:", linear_mae, "R2:", linear_r2),
    ("Ridge Regression - MSE:", ridge_mse, "MAE:", ridge_mae, "R2:", ridge_r2),
    ("Lasso Regression - MSE:", lasso_mse, "MAE:", lasso_mae, "R2:", lasso_r2),
):
    print(*report_row)

Linear Regression - MSE: 0.34643680583595826 MAE: 0.4366569177176155 R2: 0.9729379484547314
Ridge Regression - MSE: 0.34643680598159887 MAE: 0.436656910150719 R2: 0.9729379484433546
Lasso Regression - MSE: 0.5992948937419601 MAE: 0.5605001171396021 R2: 0.9531858364005911


The MSE is lower for both Linear and Ridge Regression compared to Lasso Regression. A lower MSE indicates better performance, as it means the predictions are closer to the actual values.

Similar to MSE, the MAE is lower for Linear and Ridge Regression compared to Lasso Regression. Lower MAE means the average magnitude of errors in the predictions is smaller.

The R-squared value is a measure of how well the independent variables explain the variance in the dependent variable. Higher R-squared values indicate better model performance. Here, both Linear and Ridge Regression have higher R-squared values compared to Lasso Regression.

Linear and Ridge Regression perform almost identically, and noticeably better than Lasso Regression, on the held-out test set, as indicated by their lower MSE and MAE and higher R-squared values.

The similarity in performance between Linear and Ridge Regression suggests that the dataset might not have significant multicollinearity issues or that the effect of regularization in Ridge Regression is minimal at the chosen alpha value.

Given these results, Linear or Ridge Regression may be the preferred model for this dataset.