In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import binom

In [None]:
# Download the bike-sharing dataset (bike_sharing.csv) that the next cell loads.
! gdown https://d2beiqkhq929f0.cloudfront.net/public_assets/assets/000/001/428/original/bike_sharing.csv

In [None]:
# Load the downloaded CSV. A relative path is used instead of the Colab-only
# absolute path '/content/bike_sharing.csv' so the notebook also runs outside
# Colab. NOTE(review): this assumes the download cell saves the file into the
# kernel's working directory (which is /content on Colab) — confirm.
df = pd.read_csv('bike_sharing.csv')

In [None]:
# Preview the first rows so the reader can see the columns before the summaries below.
df.head()

In [None]:
# Dataset dimensions as a (rows, columns) tuple.
df.shape

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

b. Identifying missing values

In [None]:
# Number of missing values in each column.
df.isnull().sum()

* *There are no missing values in the data set.*

c. Identifying duplicate values

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

* *There are no duplicate values in the data set.*

d. Analyzing the distribution of Numerical & Categorical variables.

In [None]:
# Distribution of numerical variables
numerical_cols = df.select_dtypes(include=[np.number]).columns
print("\nNumerical columns:", numerical_cols)

# Derive the subplot grid from the number of columns instead of hard-coding
# 4x3: DataFrame.hist raises a ValueError when the layout has fewer cells than
# there are columns to plot. For up to 12 columns this yields the same 4x3
# grid and 15x10 figure as before.
HIST_GRID_COLS = 3
MIN_HIST_GRID_ROWS = 4
ROW_HEIGHT_IN = 2.5
needed_rows = -(-len(numerical_cols) // HIST_GRID_COLS)  # ceiling division
hist_grid_rows = max(MIN_HIST_GRID_ROWS, needed_rows)

df[numerical_cols].hist(
    bins=15,
    figsize=(15, ROW_HEIGHT_IN * hist_grid_rows),
    layout=(hist_grid_rows, HIST_GRID_COLS),
)
plt.tight_layout()
plt.show()

In [None]:
# Distribution of categorical variables
# Selecting by dtype is unreliable here: the `datetime` column is still raw
# text at this point (it is only parsed in a later cell), so it would be
# plotted with one bar per distinct timestamp, while columns that later cells
# treat as groups (`season`, `weather`) are presumably stored as numeric codes
# and would be skipped — TODO confirm against df.info(). Columns are therefore
# selected by their number of distinct values instead.
MAX_CATEGORY_LEVELS = 20  # heuristic cut-off — adjust if a real category has more levels
categorical_cols = pd.Index(
    [col for col in df.columns if df[col].nunique() <= MAX_CATEGORY_LEVELS]
)
print("\nCategorical columns:", categorical_cols)

for col in categorical_cols:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(x=col, data=df, ax=ax)
    ax.set_title(f'Distribution of {col}')
    plt.show()

e. Checking for Outliers and Handling Them Appropriately

In [None]:
def plot_boxplots(frame, cols, title_suffix=''):
    """Show one boxplot per column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to plot.
    cols : iterable of str
        Names of the columns of `frame` to draw, one figure each.
    title_suffix : str, optional
        Text appended to each 'Boxplot of <col>' title.
    """
    for col in cols:
        fig, ax = plt.subplots(figsize=(10, 5))
        sns.boxplot(x=frame[col], ax=ax)
        ax.set_title(f'Boxplot of {col}{title_suffix}')
        plt.show()


# Boxplots of every numeric column before any treatment.
plot_boxplots(df, numerical_cols)

# Clip to the 1st and 99th percentile, but only for continuous measurements.
# Numeric columns with just a handful of distinct values are presumably
# category codes or flags (later cells group by `season` and `weather`);
# percentile-clipping those would silently relabel rare categories instead of
# taming outliers, so they are left untouched.
MAX_CATEGORY_LEVELS = 20  # heuristic cut-off between "coded category" and "continuous"
LOWER_Q, UPPER_Q = 0.01, 0.99
clip_cols = [col for col in numerical_cols if df[col].nunique() > MAX_CATEGORY_LEVELS]

for col in clip_cols:
    lower_bound = df[col].quantile(LOWER_Q)
    upper_bound = df[col].quantile(UPPER_Q)
    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

# Boxplots of the columns that were actually clipped.
plot_boxplots(df, clip_cols, ' after clipping')


2. Establishing Relationship Between Dependent and Independent Variables

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
numerical_df = df.select_dtypes(include=[np.number])
correlation_matrix = numerical_df.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

3. Checking whether there is any significant difference between the number of bike rides on Weekdays and Weekends

In [None]:
from scipy.stats import ttest_ind

# Parse the timestamp and derive the calendar features used for the split.
df['datetime'] = pd.to_datetime(df['datetime'])
df['day_of_week'] = df['datetime'].dt.day_name()

# Weekend flag (1 = Saturday/Sunday, 0 = any other day), computed with a
# vectorised membership test rather than a per-row lambda.
df['is_weekend'] = df['day_of_week'].isin(['Saturday', 'Sunday']).astype(int)

weekday_rentals = df.loc[df['is_weekend'] == 0, 'count']
weekend_rentals = df.loc[df['is_weekend'] == 1, 'count']

# H0: mean rentals are equal on weekdays and weekends.
# Welch's t-test (equal_var=False) is used because the two groups differ in
# size and nothing here establishes that their variances are equal; it remains
# valid when they happen to be equal.
t_stat, p_value = ttest_ind(weekday_rentals, weekend_rentals, equal_var=False)

# Set significance level (also reused by the tests in the following cells)
alpha = 0.05

print(f"T-statistic: {t_stat}, P-value: {p_value}")

if p_value <= alpha:
    print("Reject the null hypothesis: There is a significant difference between the number of bike rides on weekdays and weekends.")
else:
    print("Fail to reject the null hypothesis: There is no significant difference between the number of bike rides on weekdays and weekends.")


4. Demand for Bicycles Based on Weather Conditions

In [None]:
from scipy.stats import f_oneway

# Build one sample of 'count' values per distinct value of the 'weather' column.
weather_conditions = df['weather'].unique()
weather_groups = [
    df.loc[df['weather'] == level, 'count'] for level in weather_conditions
]

# One-way ANOVA across the weather samples; `alpha` comes from the t-test cell.
f_stat, p_value = f_oneway(*weather_groups)

print(f"F-statistic: {f_stat}, P-value: {p_value}")

print(
    "Reject the null hypothesis: There is a significant difference in bike rentals across different weather conditions."
    if p_value <= alpha
    else "Fail to reject the null hypothesis: There is no significant difference in bike rentals across different weather conditions."
)


5. Demand for Bicycles Based on Seasons

In [None]:
# Build one sample of 'count' values per distinct value of the 'season' column.
season_conditions = df['season'].unique()
season_groups = [
    df.loc[df['season'] == level, 'count'] for level in season_conditions
]

# One-way ANOVA across the season samples; `f_oneway` and `alpha` come from earlier cells.
f_stat, p_value = f_oneway(*season_groups)

print(f"F-statistic: {f_stat}, P-value: {p_value}")

print(
    "Reject the null hypothesis: There is a significant difference in bike rentals across different seasons."
    if p_value <= alpha
    else "Fail to reject the null hypothesis: There is no significant difference in bike rentals across different seasons."
)


6. Weather Conditions Across Different Seasons

In [None]:
from scipy.stats import chi2_contingency

# Observed frequencies of each weather value within each season.
contingency_table = pd.crosstab(df['weather'], df['season'])

# H0: weather and season are independent. `alpha` comes from the t-test cell.
chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)

print(f"Chi-square statistic: {chi2_stat}, P-value: {p_value}")
print(f"Degrees of freedom: {dof}")

# The chi-square approximation is commonly considered reliable only when the
# expected count in every cell is at least 5; report it when that does not hold
# so the p-value is not over-interpreted.
MIN_EXPECTED_COUNT = 5
n_sparse_cells = int((expected < MIN_EXPECTED_COUNT).sum())
if n_sparse_cells:
    print(
        f"Warning: {n_sparse_cells} of {expected.size} cells have an expected count "
        f"below {MIN_EXPECTED_COUNT}; the chi-square approximation may be unreliable."
    )

if p_value <= alpha:
    print("Reject the null hypothesis: Weather conditions are significantly different during different seasons.")
else:
    print("Fail to reject the null hypothesis: Weather conditions are not significantly different during different seasons.")


**Step 1: Exploratory Data Analysis (EDA)**

**A. Examine dataset structure, characteristics, and statistical summary.**

**Insights**:

The dataset contains detailed information on bike rentals, including various attributes such as date, weather, season, and rental counts.
The dataset appears to have no missing values or duplicate records, ensuring data quality.

**Recommendations**:

Continue to ensure data quality by regularly checking for missing values and duplicates. This will help in maintaining accurate and reliable data for future analysis.

Implement automated data quality checks in the data pipeline to detect and rectify any anomalies promptly.

**B. Identify missing values and perform imputation using an appropriate method.**

**Insights**:

No missing values were found in the dataset.

**Recommendations**:

Maintain consistent and complete data entry practices to avoid missing values in future datasets.

If missing values do occur, establish a protocol for imputation using appropriate methods to ensure data integrity.

**C. Identify and remove duplicate records.**

**Insights**:

No duplicate records were found in the dataset.

**Recommendations**:

Implement validation checks to prevent duplicate entries in the system. This ensures the accuracy of usage statistics and demand forecasting.

**D. Analyze the distribution of Numerical & Categorical variables**

**Insights**:

Numerical variables show varying distributions, some of which may be skewed.
Categorical variables have different levels of frequency, indicating varied usage patterns.

**Recommendations**:

* Consider transforming skewed numerical variables to normalize the data. This can improve the performance of predictive models.
* For imbalanced categorical variables, consider strategies like targeted marketing or promotions to balance usage across different categories.

**E. Check for Outliers and deal with them accordingly.**

**Insights**:

Outliers were inspected with boxplots and handled by clipping each numerical variable to its 1st and 99th percentiles (no rows were removed).

**Recommendations**:

* Investigate the causes of outliers to determine if they indicate data issues or true anomalies. Implement policies to address data entry errors.


**Step 2: Establish a Relationship between the Dependent and Independent Variables**

**Insights**:

The correlation heatmap indicates some highly correlated numerical variables.

**Recommendations**:

* Remove or combine highly correlated variables to prevent multicollinearity, which can skew the results of predictive models.
*  Consider using techniques like Principal Component Analysis (PCA) to reduce the dimensionality of the data while preserving important information.

**Step 3: Significant difference between the number of bike rides on Weekdays and Weekends**

**Insights**:

A significant difference in bike rides between Weekdays and Weekends was found.

**Recommendations**:

* Increase bike availability during weekends to meet higher demand.
* Develop specific marketing strategies for weekdays to boost rentals, such as weekday promotions or partnerships with businesses for commuter benefits.

**Step 4: Demand for bicycles on rent for different Weather conditions**

**Insights**:

The demand for bicycles varies significantly with weather conditions.

**Recommendations**:

* Adjust the fleet size and maintenance schedules based on weather forecasts to ensure availability during favorable conditions.
* Provide weather-appropriate gear (e.g., raincoats, umbrellas) to customers to encourage bike rentals during less favorable weather conditions.

**Step 5: Demand for bicycles on rent for different Seasons**

**Insights**:

The demand for bicycles varies significantly with seasons.

**Recommendations**:

* Plan for increased maintenance and resource allocation during peak seasons to ensure that the fleet is in optimal condition.
*  Launch seasonal marketing campaigns to capitalize on higher demand periods, such as summer or spring.

**Step 6: Weather conditions during different Seasons**

**Insights**:

Weather conditions significantly differ across seasons.

**Recommendations**:

* Use historical weather patterns to predict bike rental demand and adjust operations accordingly.
* Optimize inventory levels based on expected seasonal weather conditions to avoid under- or over-supply of bikes.


**SUMMARY AND RECOMMENDATIONS**

Based on the analysis, Yulu can enhance its bike-sharing service by implementing the following strategies:

* Maintain data quality and consistency to ensure reliable analysis.
* Address data anomalies and manage outliers to improve the accuracy of insights.
* Optimize bike availability and maintenance schedules based on demand patterns influenced by day of the week, weather, and seasons.
* Develop targeted marketing campaigns to boost rentals during low-demand periods and capitalize on high-demand times.
* Use advanced analytics techniques to manage multicollinearity and dimensionality in the data for better predictive modeling.

By leveraging these insights and recommendations, Yulu can improve its operational efficiency, customer satisfaction, and overall business performance.







                                                               
**THANK YOU**