In [None]:
# Bike Rental Demand Prediction: Data Exploration
# Purpose:
# - Load and inspect datasets
# - Understand feature types and data quality
# - Identify preprocessing steps needed before modeling

In [2]:
import pandas as pd

In [None]:
# Load datasets.
# File locations are relative paths, resolved against the notebook's working directory.
SAMPLE_SUBMISSION_CSV = '../data/samSub.csv'
TRAIN_CSV = '../data/raw/train.csv'
TEST_CSV = '../data/raw/test.csv'

df1 = pd.read_csv(SAMPLE_SUBMISSION_CSV)  # sample submission
df2 = pd.read_csv(TRAIN_CSV)              # training data
df3 = pd.read_csv(TEST_CSV)               # test data

In [5]:
# Display the last 25 rows of the training dataset to get a quick overview
df2.tail(25)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
10861,2012-12-18 23:00:00,4,0,1,1,13.94,17.425,49,6.0032,1,80,81
10862,2012-12-19 00:00:00,4,0,1,1,12.3,15.91,61,0.0,6,35,41
10863,2012-12-19 01:00:00,4,0,1,1,12.3,15.91,65,6.0032,1,14,15
10864,2012-12-19 02:00:00,4,0,1,1,11.48,15.15,65,6.0032,1,2,3
10865,2012-12-19 03:00:00,4,0,1,1,10.66,13.635,75,8.9981,0,5,5
10866,2012-12-19 04:00:00,4,0,1,1,9.84,12.12,75,8.9981,1,6,7
10867,2012-12-19 05:00:00,4,0,1,1,10.66,14.395,75,6.0032,2,29,31
10868,2012-12-19 06:00:00,4,0,1,1,9.84,12.88,75,6.0032,3,109,112
10869,2012-12-19 07:00:00,4,0,1,1,10.66,13.635,75,8.9981,3,360,363
10870,2012-12-19 08:00:00,4,0,1,1,9.84,12.88,87,7.0015,13,665,678


In [7]:
# Summary statistics for the numeric columns of the training dataset:
# count, mean, std, min, 25%, 50%, 75%, and max.
# The datetime column does not appear in the result because it is not numeric.
df2.describe()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
count,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0
mean,2.506614,0.028569,0.680875,1.418427,20.23086,23.655084,61.88646,12.799395,36.021955,155.552177,191.574132
std,1.116174,0.166599,0.466159,0.633839,7.79159,8.474601,19.245033,8.164537,49.960477,151.039033,181.144454
min,1.0,0.0,0.0,1.0,0.82,0.76,0.0,0.0,0.0,0.0,1.0
25%,2.0,0.0,0.0,1.0,13.94,16.665,47.0,7.0015,4.0,36.0,42.0
50%,3.0,0.0,1.0,1.0,20.5,24.24,62.0,12.998,17.0,118.0,145.0
75%,4.0,0.0,1.0,2.0,26.24,31.06,77.0,16.9979,49.0,222.0,284.0
max,4.0,1.0,1.0,4.0,41.0,45.455,100.0,56.9969,367.0,886.0,977.0


In [8]:
# Number of missing values in each column of the training dataset
df2.isna().sum()

datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
casual        0
registered    0
count         0
dtype: int64

In [9]:
# Number of missing values in each column of the testing dataset
df3.isna().sum()

datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
dtype: int64

In [10]:
# Data type of each column in the training dataset.
# Note: datetime is loaded as a generic object column rather than datetime64,
# so it needs parsing (e.g. pd.to_datetime) before date/time features can be derived.
df2.dtypes

datetime       object
season          int64
holiday         int64
workingday      int64
weather         int64
temp          float64
atemp         float64
humidity        int64
windspeed     float64
casual          int64
registered      int64
count           int64
dtype: object

In [11]:
# Columns present in the testing dataset, i.e. the features available at prediction time.
# Unlike the training data, it has no casual, registered, or count columns.
df3.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed'],
      dtype='object')

In [12]:
# The features `registered` and `casual` would cause data leakage: they are not available at prediction time in real-life data (and in the rows previewed above, casual + registered equals the target `count`), so we must drop them from the training dataset.
# This decision is further justified by the absence of these two features from the testing dataset.

In [14]:
# Compare typical temperature across seasons to find the coldest and the hottest one.
# The same per-season grouping of `temp` feeds both aggregates.
season_temp_groups = df2.groupby('season')['temp']
temp_mean_season = season_temp_groups.mean()
temp_median_season = season_temp_groups.median()

print("Temp mean in different seasons", temp_mean_season)
print("Temp median in different seasons", temp_median_season)

Temp mean in different seasons season
1    12.530491
2    22.823483
3    28.789111
4    16.649239
Name: temp, dtype: float64
Temp median in different seasons season
1    12.30
2    22.96
3    28.70
4    16.40
Name: temp, dtype: float64


In [None]:
# So the order of seasons in increasing order of temperature is 1 < 4 < 2 < 3 (by both mean and median)

In [16]:
# Compare typical temperature across weather categories.
# The same per-weather grouping of `temp` feeds both aggregates.
weather_temp_groups = df2.groupby('weather')['temp']
temp_mean_weather = weather_temp_groups.mean()
temp_median_weather = weather_temp_groups.median()

print("Temp mean in different weathers", temp_mean_weather)
print("Temp median in different weathers", temp_median_weather)

Temp mean in different weathers weather
1    20.557122
2    19.614608
3    19.546356
4     8.200000
Name: temp, dtype: float64
Temp median in different weathers weather
1    20.50
2    18.86
3    19.68
4     8.20
Name: temp, dtype: float64


In [None]:
# So the order of weather categories in increasing order of mean temperature is 4 < 3 < 2 < 1 (by median, categories 2 and 3 swap: 4 < 2 < 3 < 1)

In [None]:
# Consider dropping either season or weather, as they are highly similar in their trends to each other and to the temperature features.
# Although season and weather both represent climatic conditions, they capture different aspects of variability (long-term vs. short-term). Keeping both may introduce some redundancy, but regularization methods (Ridge/Lasso) are designed to handle such multicollinearity by shrinking or eliminating less useful coefficients. Thus, we retain both features to demonstrate the effect of regularization.