The following code is used to perfom EDA , conversion of string to numerical values and doing encoding for categorical columns.

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm

# Read the CSV file
df = pd.read_csv('day.csv')

#EDA steps  : -

# Convert numeric values with specific labels into categorical string values
df['season'] = df['season'].astype(str).replace({'1': 'spring', '2': 'summer', '3': 'fall', '4': 'winter'})
# Similarly, convert other numeric columns as required

# Perform one-hot encoding for categorical columns
df = pd.get_dummies(df, columns=['season', 'weathersit'])  # Adjust columns as needed

# Define features (X) and target variable (y)
X = df.drop(['cnt', 'dteday'], axis=1)  # Drop 'cnt' and 'dteday' as we can't use them as features
y = df['cnt']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Fit scaler to training data and transform training and testing data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize RFE
model = LinearRegression()
rfe = RFE(model, n_features_to_select=5)  # Select top 5 features
rfe = rfe.fit(X_train_scaled, y_train)

# Creating X_train dataframe with RFE selected variables
X_train_rfe = X_train.loc[:, rfe.support_]
X_train_rfe = sm.add_constant(X_train_rfe)

# Running the linear model with statsmodels
lm = sm.OLS(y_train, X_train_rfe).fit()

# Let's see the summary of our linear model
print(lm.summary())


                            OLS Regression Results                            
Dep. Variable:                    cnt   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 8.552e+31
Date:                Thu, 02 May 2024   Prob (F-statistic):               0.00
Time:                        08:22:36   Log-Likelihood:                 14829.
No. Observations:                 584   AIC:                        -2.965e+04
Df Residuals:                     578   BIC:                        -2.962e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       3.126e-12    3.3e-13      9.484      0.0

This code will calculate VIF and perform Residual Analysis and  Make predictions .

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import seaborn as sns
import matplotlib.pyplot as plt

# Read the CSV file
df = pd.read_csv('day.csv')

# Map categorical variables to numerical values
season_map = {1: 'spring', 2: 'summer', 3: 'fall', 4: 'winter'}
weather_map = {1: 'clear', 2: 'misty', 3: 'light rain/snow', 4: 'heavy rain/snow'}
df['season'] = df['season'].map(season_map)
df['weathersit'] = df['weathersit'].map(weather_map)

# Create dummy variables for categorical columns
df = pd.get_dummies(df, columns=['season', 'weathersit'], drop_first=True)

# Split the data into training and testing sets
df_train, df_test = train_test_split(df, train_size=0.7, test_size=0.3, random_state=100)

# Apply MinMax scaling to numeric columns
scaler = MinMaxScaler()
num_vars = ['temp', 'atemp', 'hum', 'windspeed', 'casual', 'registered', 'cnt']
df_train[num_vars] = scaler.fit_transform(df_train[num_vars])

# Divide into X and Y sets for model building
y_train = df_train.pop('cnt')
X_train = df_train

# Drop the 'dteday' column
X_train = X_train.drop('dteday', axis=1)

# Running RFE with 10 variables
lm = LinearRegression()

# Running RFE with the output number of the variable equal to 10
rfe = RFE(lm)
rfe = rfe.fit(X_train, y_train)

# Selecting the columns supported by RFE
col = X_train.columns[rfe.support_]

# Building the model using statsmodels
X_train_rfe = X_train[col]
X_train_rfe = sm.add_constant(X_train_rfe)


X_train_rfe = X_train_rfe.apply(pd.to_numeric)

X_train_rfe = sm.add_constant(X_train_rfe)


print(X_train_rfe.dtypes)

# Convert non-numeric columns to numeric data types
X_train_rfe = X_train_rfe.apply(pd.to_numeric, errors='coerce')

# Drop rows with missing values
X_train_rfe = X_train_rfe.dropna()

# Add constant variable
X_train_rfe = sm.add_constant(X_train_rfe)


lm = sm.OLS(y_train, X_train_rfe).fit()

# Print the summary of the model
print(lm.summary())

# Drop insignificant variables based on p-values
X_train_new = X_train_rfe.drop(['holiday', 'weekday', 'workingday', 'season_summer', 'season_fall'], axis=1)

# Rebuild the model without insignificant variables
X_train_lm = sm.add_constant(X_train_new)
lm = sm.OLS(y_train, X_train_lm).fit()

# Print the summary of the updated model
print(lm.summary())

# Drop the constant column
X_train_new.drop(['const'], axis=1, inplace=True)

# Calculate VIF for the new model
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.DataFrame()
X = X_train_new
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(vif)

# Residual Analysis
y_train_price = lm.predict(X_train_lm)
residuals = y_train - y_train_price
sns.distplot(residuals)

# Apply scaling to test data
df_test[num_vars] = scaler.transform(df_test[num_vars])

# Divide into X_test and y_test
y_test = df_test.pop('cnt')
X_test = df_test

# Create X_test_new dataframe with selected columns
X_test_new = X_test[X_train_new.columns]
X_test_new = sm.add_constant(X_test_new)

# Make predictions
y_pred = lm.predict(X_test_new)

# Model Evaluation
plt.scatter(y_test, y_pred)
plt.xlabel('Actual Count')
plt.ylabel('Predicted Count')
plt.title('Actual Count vs Predicted Count')
plt.show()
