### Importing required dependencies

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_absolute_error, r2_score

### Loading train and test datasets

In [None]:
# Read the training and testing CSVs from the author's local folder.
train = pd.read_csv(r'D:\AI\Education\PGD in Ai and ML\dataset\ml\train.csv')  # my local path
test = pd.read_csv(r'D:\AI\Education\PGD in Ai and ML\dataset\ml\test.csv')  # my local path

# Preview the first rows of each frame, training data first.
for heading, frame in (("Training Data Sample:", train), ("\nTesting Data Sample:", test)):
    print(heading)
    print(frame.head())

### Getting shape of train and test datasets

In [None]:
# Capture the (rows, columns) tuple of each frame, then report both.
train_shape, test_shape = train.shape, test.shape

print("Training Data Shape:", train_shape)
print("Testing Data Shape:", test_shape)

### Datatypes of each column in train and test

In [None]:
# DataFrame.info() writes its report directly to stdout and returns None, so
# printing its return value only emits a misleading "None". Print the heading
# first and let info() produce the report itself.
print("Training Datatypes:")
train.info()
# Keep the name bound to something useful (the per-column dtypes) instead of None.
train_dtype = train.dtypes

print("=====================================================================================")

print("Testing Datatypes:")
test.info()
test_dtype = test.dtypes

### Find missing values in train and test datasets

In [None]:
# Count the null entries per column for both frames in one pass.
train_missing_values, test_missing_values = (
    frame.isna().sum() for frame in (train, test)
)

print("Missing Values in Training Data:")
print(train_missing_values)

print("\nMissing Values in Testing Data:")
print(test_missing_values)

### Creating new columns date, hour, weekDay, month from datetime column

In [None]:
# Apply the same datetime expansion to both frames (train first, then test):
# parse the raw column, derive the calendar parts, and preview the result.
for frame in (train, test):
    frame['datetime'] = pd.to_datetime(frame['datetime'])

    parts = frame['datetime'].dt
    frame['date'] = parts.date
    frame['hour'] = parts.hour
    frame['weekDay'] = parts.dayofweek  # Monday=0 ... Sunday=6
    frame['month'] = parts.month

    print(frame[['datetime', 'date', 'hour', 'weekDay', 'month']].head())

### Coercing the datatype of season, holiday, workingday, and weather to category

In [None]:
category_columns = ['season', 'holiday', 'workingday', 'weather']

# Convert the four coded columns to pandas' category dtype in both frames.
for frame in (train, test):
    frame[category_columns] = frame[category_columns].astype('category')

# Report the resulting dtypes and a preview, training data first.
reports = (
    ("Data Types in Training Data:", "\nUpdated Training Data Head:", train),
    ("Data Types in Test Data:", "\nUpdated Test Data Head:", test),
)
for dtype_heading, head_heading, frame in reports:
    print(dtype_heading)
    print(frame.dtypes)

    print(head_heading)
    print(frame.head())

### Dropping the 'datetime' column

In [None]:
# The raw timestamp has been expanded into date/hour/weekDay/month, so drop it
# from both frames.
train, test = (frame.drop(columns='datetime') for frame in (train, test))

### Printing all columns

In [None]:
# List the remaining column labels of each frame, training data first.
for frame in (train, test):
    print(frame.columns)

### Outlier Analysis:  Box plots across various features

In [None]:
sns.set(style="whitegrid")

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))

# One (feature, title) pair per panel, laid out row by row on the 2x2 grid.
boxplot_panels = [
    ('season', 'Box Plot of Total Booking vs. Season'),
    ('hour', 'Box Plot of Total Booking vs. Hour of the Day'),
    ('workingday', 'Box Plot of Total Booking vs. Working Day'),
    ('weather', 'Box Plot of Total Booking vs. Weather'),
]
for ax, (feature, title) in zip(axes.flat, boxplot_panels):
    sns.boxplot(x=feature, y='Total_booking', data=train, ax=ax)
    ax.set_title(title)

# Keep only the first occurrence of any repeated column label.
train = train.loc[:, ~train.columns.duplicated()]

plt.tight_layout()

plt.show()

I observed outliers in the relationship between season and total bookings: they appear above roughly 800 bookings in fall, 400 in spring, 775 in summer, and 680 in winter. These outliers suggest that cab demand patterns vary throughout the year.

In the analysis of hourly data for cab bookings, several outliers were identified in specific time intervals. Outliers were observed during the hours of 10:00 AM to 3:00 PM, as well as at 11:00 AM and 12:00 PM, and during the early morning hours from 1:00 AM to 3:00 AM. These outliers indicate unusual patterns in cab booking demand during these time slots.

In the analysis of cab bookings, it was observed that on non-working days there were outliers with total bookings exceeding 700, while on working days outliers occurred when total bookings surpassed 600.

The absence of outliers in holiday-related data, coupled with the presence of outliers in non-holiday data, suggests that there may be distinctive patterns in cab bookings between holidays and regular days. It's possible that during holidays, the demand for cab services remains relatively consistent and doesn't exhibit extreme variations, leading to the absence of outliers. On the other hand, during non-holidays, various factors such as work commutes, events, or other activities may lead to fluctuations in cab demand, resulting in the identification of outliers.

### Outlier Analysis:  Removing the outliers

In [None]:
def remove_outliers_iqr(data, column, factor=1.5):
    """Return the rows of ``data`` whose ``column`` value lies within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``data[column]``. Both fences are
    inclusive. Rows whose value is NaN fail the comparison and are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the numeric column used to compute the fences.
    factor : float, default 1.5
        Multiplier applied to the IQR; larger values keep more rows.

    Returns
    -------
    pandas.DataFrame
        The subset of rows inside the fences, with the original index labels.
    """
    values = data[column]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - factor * IQR
    upper_bound = Q3 + factor * IQR
    # Series.between is inclusive on both ends by default, matching >= / <=.
    return data[values.between(lower_bound, upper_bound)]

columns_to_remove_outliers = ['temp', 'atemp', 'humidity', 'windspeed']

# Filter one feature at a time; each pass works on the rows that survived the
# previous pass, so later fences are computed on already-trimmed data.
for feature in columns_to_remove_outliers:
    train = remove_outliers_iqr(data=train, column=feature)

print("Shape of the Dataset after Removing Outliers:", train.shape)

### Correlation Analysis:  Plot a correlation plot between "total booking" and ["temp", "atemp", "humidity", "windspeed"]

In [None]:
selected_columns = ["Total_booking", "temp", "atemp", "humidity", "windspeed"]

# Pairwise correlations between the target and the continuous weather features.
subset_train = train[selected_columns]
correlation_matrix = subset_train.corr()

# Draw the matrix as an annotated heatmap on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Correlation Heatmap")
plt.show()

## Analysis and Inference from Correlation Heatmap:

### Total Booking vs. Temperature (temp and atemp):
There is a moderate positive correlation between Total_booking and both temp and atemp (0.39 and 0.39, respectively). This suggests that as the temperature increases, there is a tendency for an increase in cab bookings. The correlation between temp and atemp is very high (0.99), indicating that these two variables are essentially measuring the same thing.

### Total Booking vs. Humidity:
There is a moderate negative correlation between Total_booking and humidity (-0.32). This implies that as humidity increases, cab bookings tend to decrease. High humidity might deter people from using cab services.

### Total Booking vs. Windspeed:
There is a weak positive correlation between Total_booking and windspeed (0.09). The correlation is not very strong, suggesting that windspeed has only a slight influence on cab bookings. Other factors likely play a more significant role.

## Inference:

### Weather Impact:
The positive correlation between temperature (temp and atemp) and cab bookings indicates that people are more inclined to book cabs on warmer days. Conversely, the negative correlation with humidity suggests that people are less likely to book cabs on humid days, possibly because humid weather discourages people from travelling.

### Windspeed's Limited Impact:
Windspeed shows a weak positive correlation with cab bookings, suggesting that it has a relatively minor impact. Other factors, such as temperature and humidity, likely have a more substantial influence on demand.

### Multicollinearity:
temp and atemp exhibit a very high correlation, indicating multicollinearity. When building predictive models, it might be beneficial to choose one of these variables to avoid redundancy.


### Data Visualization: total_booking column and plotting the probability distribution plot

In [None]:
# Histogram (with a KDE overlay) to visualize the distribution of the total booking column
plt.figure(figsize=(10, 6))
sns.histplot(train["Total_booking"], kde=True, color="blue")
plt.title("Distribution of Total Booking")
plt.xlabel("Total Booking")
plt.ylabel("Frequency")
plt.show()

# Probability distribution plot using a kernel density estimate.
# `fill=True` replaces the deprecated `shade=True` argument of sns.kdeplot.
plt.figure(figsize=(8, 4))
sns.kdeplot(train["Total_booking"], fill=True, color="green")
plt.title("Probability Distribution of Total Booking")
plt.xlabel("Total Booking")
plt.ylabel("Probability Density")
plt.show()

### Visualizing total_booking vs (Month, Season, Hour, Weekday, Usertype)

In [None]:
sns.set(style="whitegrid")

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 10))
flat_axes = axes.ravel()

# (feature column, panel title, x-axis label) for each bar plot, in grid order.
bar_panels = [
    ('month', 'Total Booking vs. Month', 'Month'),
    ('season', 'Total Booking vs. Season', 'Season'),
    ('hour', 'Total Booking vs. Hour of the Day', 'Hour'),
    ('weekDay', 'Total Booking vs. Weekday', 'Weekday'),
]

for ax, (feature, title, xlabel) in zip(flat_axes, bar_panels):
    sns.barplot(x=feature, y='Total_booking', data=train, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Total Booking')

# Remove every subplot that received no plot. The grid has six slots but only
# four panels, so both trailing axes must go (previously one empty frame stayed).
for unused_ax in flat_axes[len(bar_panels):]:
    fig.delaxes(unused_ax)

plt.tight_layout()

plt.show()


### Using Histograms to plot all the continuous variables in the data

In [None]:
continuous_variables = ["temp", "atemp", "humidity", "windspeed"]

sns.set(style="whitegrid")

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))

# One histogram (with KDE overlay) per continuous feature, filled row by row.
for ax, var in zip(axes.ravel(), continuous_variables):
    sns.histplot(train[var], ax=ax, kde=True, color="blue")
    ax.set_title(f'Histogram of {var}')
    ax.set_xlabel(var)
    # The y-axis of a histogram is the number of observations per bin, not the
    # booking total, so label it accordingly.
    ax.set_ylabel('Count')

plt.tight_layout()

plt.show()

### Converting the categorical variables into one hot vector using pd.get_dummies

In [None]:
categorical_variables = ['season', 'holiday', 'workingday', 'weather']

# One-hot encode the categorical columns; drop_first avoids a redundant
# indicator per feature.
train = pd.get_dummies(train, columns=categorical_variables, drop_first=True)

test = pd.get_dummies(test, columns=categorical_variables, drop_first=True)

# The two frames are encoded independently, so a category level present in only
# one of them would yield different dummy columns. Align `test` to the column
# layout of `train`: missing indicator columns are added as all-zero and the
# column order is made identical. The target column is kept in `test` only if
# the test frame actually contains it.
aligned_columns = [
    col for col in train.columns
    if col != 'Total_booking' or col in test.columns
]
test = test.reindex(columns=aligned_columns, fill_value=0)

In [None]:
# Preview both encoded frames, training data first.
for frame in (train, test):
    print(frame.head())

### Splitting data into train and test

In [None]:
# The split works on the prepared `train` frame. The raw CSV was previously
# re-read here into an unused variable; that dead load has been removed.

# Separate the feature columns from the target column.
X = train.drop(columns=['Total_booking'])
y = train['Total_booking']

# Hold out 20% of the rows for validation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

### Fitting Random Forest Regressor, Ada Boost Regressor, Bagging Regressor, SVR, and K-Neighbors Regressor

In [None]:
# Show the column names, non-null counts and dtypes of the encoded training
# frame before fitting the models.
train.info()

In [None]:
# Remove the 'date' column from both feature splits before fitting.
X_train, X_test = (split.drop(columns='date') for split in (X_train, X_test))

In [None]:
# Candidate regressors keyed by display name. The ensemble models are seeded
# (same seed as the train/test split) so the reported metrics are reproducible
# from run to run; SVR and K-Neighbors are left at their defaults.
# mean_absolute_error and r2_score come from the notebook's import cell.
models = {
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "Ada Boost Regressor": AdaBoostRegressor(random_state=42),
    "Bagging Regressor": BaggingRegressor(random_state=42),
    "SVR": SVR(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
}

# Fit every model on the training split and score it on the hold-out split.
for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # Mean absolute error
    mae = mean_absolute_error(y_test, y_pred)
    print(f"{model_name} MAE: {mae}")

    # Coefficient of determination (R2)
    r2 = r2_score(y_test, y_pred)
    print(f"{model_name} R2 Score: {r2}")

    print("===========================================================")
    
    

The Random Forest Regressor outperforms all other models with the lowest MAE of 43.25 and the highest R2 Score of 0.8679. This indicates that the Random Forest model has the smallest prediction errors and explains a significant portion of the variance in the data. It is the top-performing model in terms of both accuracy and fit.

### Displaying a factor plot to visualize the RMSE values achieved by different modeling algorithms

In [None]:
# Compute the hold-out RMSE of every model. The models in `models` were already
# fitted by the preceding comparison cell, so they are scored as-is rather than
# refitted (refitting doubled the training time and, for unseeded models, could
# yield numbers inconsistent with the MAE/R2 printed earlier).
rmse_values = [
    (model_name, np.sqrt(mean_squared_error(y_test, model.predict(X_test))))
    for model_name, model in models.items()
]

rmse_df = pd.DataFrame(rmse_values, columns=['Model', 'RMSE'])

# One colour per model; sizing the palette to the data avoids unused-colour warnings.
colors = sns.color_palette("Set2", n_colors=len(rmse_df))

# Model names are unordered categories, so use a categorical ("factor") bar plot
# instead of a line plot. `palette` only takes effect together with `hue`;
# dodge=False keeps each bar centred on its tick and the legend would merely
# repeat the x-axis labels.
sns.catplot(
    data=rmse_df,
    x='Model',
    y='RMSE',
    hue='Model',
    palette=colors,
    kind='bar',
    dodge=False,
    legend=False,
    height=6,
    aspect=8 / 6,
)
plt.title('RMSE by Modeling Algorithm')
plt.xlabel('Model')
plt.ylabel('RMSE')
plt.xticks(rotation=45)

plt.show()

###  Hyper-parameter tuning on the best model using GridSearchCV and Printing best parameters

In [None]:
# Seeded base estimator that the grid search clones for every candidate.
model = RandomForestRegressor(random_state=42)

# Search space: 3 x 4 x 3 x 3 parameter combinations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 5-fold cross-validated search scored by negative MSE, using all CPU cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

# Score the refitted best estimator on the hold-out split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE using best model:", rmse)

###  Prediction on the test set and print the mean_squared_log_error

In [None]:
# Use the tuned estimator found by the grid search. Creating a fresh default
# RandomForestRegressor here would silently discard the tuning results.
best_model = grid_search.best_estimator_

# The prepared test frame is `test`; no `test_data` variable is defined in this
# notebook. Select exactly the feature columns the model was trained on, in
# the same order; a feature column absent from `test` is filled with 0.
test_features = test.reindex(columns=X_train.columns, fill_value=0)
test_pred = best_model.predict(test_features)

if 'Total_booking' in test.columns:
    # Labels are available: report the mean squared log error on the test set.
    msle = mean_squared_log_error(test['Total_booking'], test_pred)
    print("Mean Squared Log Error (MSLE):", msle)
else:
    # NOTE(review): the test frame carries no 'Total_booking' labels, so MSLE
    # cannot be computed on it; the hold-out split is scored instead. Confirm
    # where the test labels are meant to come from.
    msle = mean_squared_log_error(y_test, best_model.predict(X_test))
    print("Mean Squared Log Error (MSLE) on the hold-out split:", msle)