In [1]:
import pandas
import numpy
import warnings
from sklearn.feature_selection import f_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from statsmodels.stats.outliers_influence import variance_inflation_factor

warnings.simplefilter('ignore')

In [13]:
# ------------------------------------ HELPER FUNCTIONS ------------------------------------
# - The multicollinearity function returns the VIF values of the features
# - The TrainedModel function trains a linear regression model on the provided features and labels
# - The find_most_correlated_columns function does what the name suggests, it takes in a correlation
#   matrix as the input.
# - The relevantFeatures function prints the features with their F-statistics, sorted by relevance

def multicollinearity(X, y, threshold=5):
    """Compute the variance inflation factor (VIF) of every column of X.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; each column is scored against the remaining columns.
    y : any
        Accepted for signature compatibility; not used by the computation.
    threshold : numeric, default 5
        Accepted for signature compatibility; not used by the computation.

    Returns
    -------
    pandas.Series
        One VIF value per column, indexed by the column labels of X.
    """
    design = X.values
    column_count = X.shape[1]
    scores = [variance_inflation_factor(design, position) for position in range(column_count)]
    return pandas.Series(scores, index=X.columns)

def TrainedModel (X, y) :
  """Fit a fresh LinearRegression on features X and labels y and return the fitted estimator."""
  # fit() hands back the estimator itself, so construction and training chain into one expression.
  return LinearRegression().fit(X, y)

def find_most_correlated_columns(corr_matrix, threshold=0.6):
    """List the column pairs whose absolute correlation reaches the threshold.

    Parameters
    ----------
    corr_matrix : pandas.DataFrame
        Square correlation table that can be looked up by column label on both axes.
    threshold : float, default 0.6
        Minimum absolute correlation (inclusive) for a pair to be reported.

    Returns
    -------
    list of tuple
        (first_column, second_column, absolute_correlation) entries, strongest
        first. Each unordered pair appears once; a column is never paired with itself.
    """
    labels = corr_matrix.columns
    width = len(labels)

    # Walk only the upper triangle so every pair is visited exactly once.
    candidates = (
        (labels[a], labels[b], abs(corr_matrix.loc[labels[a], labels[b]]))
        for a in range(width)
        for b in range(a + 1, width)
    )

    # A NaN correlation fails the >= comparison and is therefore left out.
    strong_pairs = [pair for pair in candidates if pair[2] >= threshold]
    strong_pairs.sort(key=lambda pair: pair[2], reverse=True)
    return strong_pairs

# In order to select the most relevant features, a linear regression is run between each feature
# and the label, and the F-statistic is recorded (the p-value is computed as well but not used).
# The features are then sorted by F-statistic, highest first.

def relevantFeatures (X, y) :
  """Print each feature of X with its univariate F-statistic against y, highest first.

  Parameters
  ----------
  X : pandas.DataFrame
      Candidate features, one per column.
  y : array-like
      Label values aligned with the rows of X.

  Returns
  -------
  None
      The ranking is only printed, one "<feature>: <F-statistic>" line per
      column with the statistic rounded to two decimals.
  """
  # Nothing to rank; keep the "prints nothing" behaviour instead of letting
  # f_regression reject a feature table with zero columns.
  if len(X.columns) == 0:
    return

  # f_regression scores every column independently of the others, so one call
  # on the whole table replaces a separate call per column. The p-values it
  # also returns are not needed for the ranking.
  f_scores, _ = f_regression(X, y)

  ranked = sorted(zip(X.columns, f_scores), key=lambda pair: pair[1], reverse=True)
  for feature, f_score in ranked:
    print(f"{feature}: {f_score:.2f}")

In [None]:
# After importing the data, some unnecessary columns are dropped. The helper functions defined
# above are used to find out which features are most relevant and which ones are most correlated
# with the label, count. The three selected features are registered, casual and temp+atemp (temp
# and atemp are combined because there is high multicollinearity between them).

columns_to_exclude = ['instant', 'dteday']
columns_to_encode = ['season', 'weekday', 'month']

# Read the CSV, remove the excluded columns, then one-hot encode the listed
# columns (the first level of each is dropped).
data = pandas.get_dummies(
    pandas.read_csv('Data.csv').drop(columns=columns_to_exclude),
    columns=columns_to_encode,
    drop_first=True,
)

# Strongly correlated column pairs, strongest first, followed by a blank line.
for correlated_pair in find_most_correlated_columns(data.corr()):
  print(correlated_pair)
print()

# F-statistic ranking of every remaining column against the label.
relevantFeatures(data.drop(columns=['count']), data['count'])

In [15]:
# Using the selected features, a multiple regression model is trained.
#
# NOTE(review): the evaluation recorded in this notebook (R2 = 1.0, MSE ~ 7e-24) shows that
# `count` is reproduced exactly by a linear combination of these features - presumably
# count = registered + casual (confirm against the dataset). If so, this is target leakage
# and the score says nothing about real predictive power.

X = pandas.DataFrame({
    'overallTemp': (data['temp'] + data['atemp']) / 2,
    'registered': data['registered'],
    'casual': data['casual'],
})
y = data['count']

# A fixed seed keeps the train/test split, and therefore the reported metrics,
# identical across "Restart & Run All" executions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Fit the regression on the training split, predict the held-out split, and
# report the R-squared and mean squared error of those predictions.

model = TrainedModel(X_train, y_train)
y_pred = model.predict(X_test)

print(
    f'R2: {r2_score(y_test, y_pred)}',
    f'MSE: {mean_squared_error(y_test, y_pred)}',
    sep='\n',
)

R2: 1.0
MSE: 7.343352544145842e-24
