In [None]:
!pip install catboost


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
pd.set_option('display.max_rows',None)
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from datetime import datetime as dt
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings("ignore")

Load the train and test CSV files into DataFrames.

In [None]:
# Read the training and test CSV files from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Data Fields

**datetime** - hourly date + timestamp  

**season** - 1 = spring, 2 = summer, 3 = fall, 4 = winter

**holiday** - whether the day is considered a holiday

**workingday** - whether the day is neither a weekend nor holiday

**weather** -  
1: Clear, Few clouds, Partly cloudy  
2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist  
3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds  
4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog

**temp** - temperature in Celsius

**atemp** - "feels like" temperature in Celsius

**humidity** - relative humidity

**windspeed** - wind speed

**casual** - number of non-registered user rentals initiated

**registered** - number of registered user rentals initiated

**count** - number of total rentals



In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Report the (rows, columns) shape of both raw frames.
for label, frame in (('Shape of Train: ', train), ('Shape of Test: ', test)):
    print(label, frame.shape)


Two columns of the training set, `casual` and `registered`, are missing from the test set, so I will drop them from the training set.
`count` is our target column.

In [None]:
# Remove the two rental-breakdown columns from the training frame.
breakdown_columns = ['casual', 'registered']
train = train.drop(columns=breakdown_columns)

In [None]:
# Per-column count of missing values for each frame.
for label, frame in (('Null values in Train: \n', train), ('Null values in Test: \n', test)):
    print(label, frame.isnull().sum())

No missing values

combine train and test

In [None]:
# Stack train on top of test and renumber the rows 0..n-1 in one step.
# Later cells tell the two apart by whether 'count' is null.
df = pd.concat([train, test], ignore_index=True)

In [None]:
# Preview the first rows of the combined frame.
df.head()

Let's split `datetime` into hour, year and month features.

In [None]:
# Parse the timestamp once, derive the calendar features from it, and
# discard the raw 'datetime' column afterwards.
timestamps = pd.to_datetime(df['datetime'])
df = df.assign(
    hour=timestamps.dt.hour,
    year=timestamps.dt.year,
    month=timestamps.dt.month,
).drop(columns=['datetime'])

In [None]:
# Preview the frame after the hour / year / month columns replaced 'datetime'.
df.head()

Plot the continuous variables (and the target) on a correlation heatmap.

In [None]:
# Continuous features plus the target, used for the correlation heatmap.
continuous_and_target = ['temp', 'atemp', 'humidity', 'windspeed', 'count']
df0 = df.loc[:, continuous_and_target]

In [None]:
# Pairwise correlations of the selected columns, drawn as an annotated heatmap.
correlation_matrix = df0.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=.5,
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.show()

Let's plot the categorical variables against count.

In [None]:
# Candidate categorical features to plot against 'count'.
# Only keep the ones that actually exist in df: the datetime cell derives
# hour / year / month but no 'weekday', and handing a missing column to
# sns.barplot raises an error.
categorical_candidates = ['season', 'weather', 'holiday', 'workingday', 'month', 'weekday', 'hour']
categorical_variable = [name for name in categorical_candidates if name in df.columns]

In [None]:
# One bar chart per categorical feature, with 'count' on the y axis.
plt.figure(figsize=(30,30))
for index,feature in enumerate(categorical_variable):
  plt.subplot(5, 4, index+1)
  sns.barplot(x=feature, y='count', data=df)
  plt.xlabel(f'{feature}')
  # The y axis shows the target, not the categorical variable.
  plt.ylabel('count')
  plt.title(f'{feature} vs count')
plt.show()

Convert year, month, hour, season and weather into one-hot (dummy) columns.

In [None]:
# One-hot encode the listed columns, dropping the first level of each.
# NOTE(review): the modelling cells in this notebook read `df`, not
# `df_with_dummies` - confirm which frame is meant to feed the models.
one_hot_columns = ['year', 'month', 'hour', 'season', 'weather']
df_with_dummies = pd.get_dummies(df, columns=one_hot_columns, drop_first=True)

Plot the continuous variables against count.

In [None]:
# Scatter each continuous feature against the target.
continous_variable=['temp','humidity','windspeed']
plt.figure(figsize=(30,30))
for index,feature in enumerate(continous_variable):
  plt.subplot(5, 4, index+1)
  plt.scatter(df[feature], df['count'])
  plt.xlabel(f'{feature}')
  plt.ylabel('count')
  # Name the plotted feature in the title (same pattern as the bar-plot cell)
  # instead of repeating one generic title on every subplot.
  plt.title(f'{feature} vs count')
plt.show()

In [None]:
# Summarise column dtypes and non-null counts of the combined frame.
df.info()

Let's also look at the feature importance.

In [None]:
# Rows with a known 'count' are training rows; rows without one are test rows.
train_data = df[df['count'].notnull()]
test_data = df[df['count'].isnull()]

X_train = train_data.drop(columns=['count'])
y_train = train_data['count']
X_test = test_data.drop(columns=['count'])

# Fit a CatBoost regressor with default settings, only to rank the features.
cb_rgr = CatBoostRegressor(silent=True)
cb_rgr.fit(X_train, y_train)

feature_importance = cb_rgr.feature_importances_

# Build the importance table and sort it once, most important feature first.
feature_importance_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importance})
    .sort_values(by='Importance', ascending=False)
)

plt.figure(figsize=(10, 8))
# The frame is already sorted, so its 'Feature' column is the bar order.
sns.barplot(x='Importance', y='Feature', data=feature_importance_df, order=feature_importance_df['Feature'])
# The importances come from the CatBoost model fitted above.
plt.title('Feature Importance - CatBoost')
plt.xlabel('Importance Score')
plt.ylabel('Features')
plt.show()

This is a regression problem, so let's use the following models with grid search:


*   XGBoost
*   CatBoost
*   AdaBoost (listed here, but the grid-search cell below only sets up XGBoost and CatBoost)




In [None]:
# Shapes of the combined, train and test frames.
for label, frame in (('df_final shape:', df), ('df_train shape:', train), ('df_test shape:', test)):
    print(label, frame.shape)

In [None]:
# df was built as concat([train, test]), so the first len(train) rows are the
# training rows. Derive the split point from the data instead of hard-coding
# the row count, so the cell still works if the input file changes.
n_train_rows = len(train)
# .copy() gives independent frames, so later in-place edits never touch df.
X_Train = df.iloc[:n_train_rows].copy()
X_Test = df.iloc[n_train_rows:].copy()
Y_Train = train['count']

In [None]:
# Print the shapes of the split pieces so they can be compared by eye.
print('\nCheck that the datasets are consistent:\n')
for label, data in (('X_train shape', X_Train), ('Y_train shape:', Y_Train), ('X_test shape:', X_Test)):
    print(label, data.shape)


In [None]:
# Remove the target column from both feature frames, in place.
for feature_frame in (X_Train, X_Test):
    feature_frame.drop(columns=['count'], inplace=True)

In [None]:
# Fit the scaler on the training features only, then apply it to both sets.
scaler = StandardScaler().fit(X_Train)
X_train_scaled, X_test_scaled = (scaler.transform(frame) for frame in (X_Train, X_Test))

In [None]:
# Hold out 10% of the scaled training rows for validation (fixed seed).
split_options = dict(train_size=0.9, test_size=0.1, random_state=0)
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X_train_scaled,
    Y_Train,
    **split_options,
)

In [None]:


# Candidate regressors and the hyper-parameter grid searched for each one.
models = {
    'XGBoost': XGBRegressor(),
    'CatBoost': CatBoostRegressor(silent=True),
}

param_grids = {
    'XGBoost': {
        'n_estimators': [100, 150, 200, 250, 300],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [4, 5, 6]
    },
    'CatBoost': {
        'iterations': [100, 150, 200, 250, 300],
        'learning_rate': [0.01, 0.05, 0.1],
        'depth': [5, 6, 7]
    }
}

results = {}          # model name -> validation RMSLE
best_estimators = {}  # model name -> tuned estimator from its own grid search
for model_name, model in models.items():
    print(f"Training {model_name}...")
    grid_search = GridSearchCV(model, param_grids[model_name], scoring='neg_mean_squared_error', cv=6)
    grid_search.fit(X_train, Y_train)
    # Keep every model's tuned estimator; `grid_search` is overwritten on the
    # next iteration and would otherwise only remember the last model.
    best_estimators[model_name] = grid_search.best_estimator_

    # Floor negative predictions at 0, the same treatment the submission cell
    # applies, so the validation score reflects what is actually submitted.
    # (mean_squared_log_error also rejects negative values.)
    y_pred = np.clip(best_estimators[model_name].predict(X_valid), 0, None)
    rmsle = np.sqrt(metrics.mean_squared_log_error(Y_valid, y_pred))
    results[model_name] = rmsle
    print(f"{model_name}: RMSLE = {rmsle}")

# Pick the estimator that belongs to the lowest-RMSLE model.
best_model_name = min(results, key=results.get)
best_model = best_estimators[best_model_name]
print(f"Best model: {best_model_name}")
# No extra fit is needed: GridSearchCV (refit=True by default) has already
# refitted best_estimator_ on all of X_train / Y_train.




In [None]:
# Rental counts cannot be negative, so floor the raw predictions at 0.
predictions = np.clip(best_model.predict(X_test_scaled), 0, None)
# `test` is the frame read from test.csv and still has its 'datetime' column.
predictions_df = pd.DataFrame({'datetime': test['datetime'], 'count': predictions})
predictions_df.to_csv('predictions.csv', index=False)
