In [1]:
# Task 1: Import and Clean Data

In [2]:
# Import relevant Python libraries for data manipulation and numerical operations:
# pandas, numpy, matplotlib, seaborn
import pandas as pd
import numpy as np
# `plt` is the conventional alias for the pyplot interface. Aliasing the top-level
# `matplotlib` package as `plt` would make calls such as plt.subplots() or
# plt.show() fail, because those functions live in matplotlib.pyplot.
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the dataset into a Pandas DataFrame from a CSV file. Filename: FloridaBikeRentals.csv
import pandas as pd
df = pd.read_csv(filepath_or_buffer='FloridaBikeRentals.csv')
# Preview the first eight records (last expression in the cell, so it is displayed).
df.head(n=8)

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature,Humidity,Wind speed,Visibility,Dew point temperature,Solar Radiation,Rainfall,Snowfall,Seasons,Holiday,Functioning Day
0,01-12-2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,01-12-2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,01-12-2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,01-12-2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,01-12-2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes
5,01-12-2017,100,5,-6.4,37,1.5,2000,-18.7,0.0,0.0,0.0,Winter,No Holiday,Yes
6,01-12-2017,181,6,-6.6,35,1.3,2000,-19.5,0.0,0.0,0.0,Winter,No Holiday,Yes
7,01-12-2017,460,7,-7.4,38,0.9,2000,-19.3,0.0,0.0,0.0,Winter,No Holiday,Yes


In [4]:
# Inspect the data:
# View the shape, column names, and data types
# A notebook cell only echoes the value of its LAST bare expression, so every
# intermediate inspection result is printed explicitly instead of being discarded.
df.info()  # info() prints its own report
print("Shape (rows, columns):", df.shape)
print("Columns:", df.columns.tolist())

# Identify missing values and inconsistencies
missing_per_column = df.isnull().sum()
print("Missing values per column:")
print(missing_per_column)
print("Columns containing missing values:",
      missing_per_column[missing_per_column > 0].index.tolist())

# Handle missing values and data inconsistencies:
# Report missing values and suggest appropriate handling techniques (e.g., fill with mean, drop rows, etc.)
# Check for duplicate records and remove them if necessary
duplicates = df.duplicated()
print("Duplicate rows found:", int(duplicates.sum()))
if duplicates.any():
    # Keep the first occurrence of each record and rebuild a gap-free index.
    df = df.drop_duplicates().reset_index(drop=True)

# Comment on data types and suggest optimizations for memory efficiency. Focus on columns such as Temperature, Humidity(%), Wind speed (m/s)
# Convert 'Date' column to datetime
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')

# Integer columns: shrink from 64-bit to the smallest type the data range allows.
integer_dtypes = {
    'Hour': 'uint8',
    'Humidity': 'uint8',
    'Visibility': 'int16',
    'Rented Bike Count': 'int16',
}
# Guard the downcast: casting an out-of-range integer to a narrower type can wrap
# around silently, so fail loudly if the data does not fit the target type.
for column, dtype in integer_dtypes.items():
    limits = np.iinfo(dtype)
    if not df[column].between(limits.min, limits.max).all():
        raise ValueError(
            f"Column {column!r} has missing values or values outside the "
            f"{dtype} range [{limits.min}, {limits.max}]; refusing to downcast."
        )

# Continuous measurements: float32 halves the memory of float64 at the cost of
# precision (about 7 significant digits), e.g. 39.4 is stored as 39.400002.
float_dtypes = {
    column: 'float32'
    for column in ['Temperature', 'Wind speed', 'Dew point temperature',
                   'Solar Radiation', 'Rainfall', 'Snowfall']
}
## TODO: research https://pandas.pydata.org/docs/reference/api/pandas.to_numeric.html - do them automatically in best format

# Convert low-cardinality text columns from object to category to save memory
category_dtypes = {column: 'category' for column in ['Seasons', 'Holiday', 'Functioning Day']}

# Apply every conversion in a single pass.
df = df.astype({**integer_dtypes, **float_dtypes, **category_dtypes})

print(df.head())
df.info()  # run this again, make sure it's good and look at memory usage
## TODO: consider renaming df to cleaned_data; `df` is kept because the later cells use it
# df is the cleaned DataFrame from here on


# Export the cleaned data to JSON format as bike_rental_cleaned.json
# (file name matches the task requirement exactly: singular "rental")
df.to_json("bike_rental_cleaned.json", orient="records", date_format="iso")

# TODO: Write a short report summarizing observations about the data


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Date                   8760 non-null   object 
 1   Rented Bike Count      8760 non-null   int64  
 2   Hour                   8760 non-null   int64  
 3   Temperature            8760 non-null   float64
 4   Humidity               8760 non-null   int64  
 5   Wind speed             8760 non-null   float64
 6   Visibility             8760 non-null   int64  
 7   Dew point temperature  8760 non-null   float64
 8   Solar Radiation        8760 non-null   float64
 9   Rainfall               8760 non-null   float64
 10  Snowfall               8760 non-null   float64
 11  Seasons                8760 non-null   object 
 12  Holiday                8760 non-null   object 
 13  Functioning Day        8760 non-null   object 
dtypes: float64(6), int64(4), object(4)
memory usage: 958.3+ 

In [5]:
# Task 2: Data Processing and Statistical Analysis
# Snapshot of df as it is when this cell runs, kept so the transformations can be
# shown against the untransformed values.
# CAUTION: df is overwritten in place below and the snapshot is taken from df, so
# re-running this cell scales Temperature by 10 again and replaces the snapshot
# with already-transformed data. Re-run from the load cell to start clean.
raw_df = df.copy()

# Perform transformations:
# Multiply Temperature by 10 for standardization
df['Temperature'] = raw_df['Temperature'].mul(10)
print(df['Temperature'])

# Scale Visibility to a range between 0 and 1 using MinMax scaling
visibility = raw_df['Visibility']
lowest, highest = visibility.min(), visibility.max()
df['Visibility'] = (visibility - lowest) / (highest - lowest)
print(df['Visibility'])
# The printed series shows the observed visibility rescaled so that the highest
# value is 1 and the lowest value is 0.
# The data is now normalized, which can make later computations better behaved.




0      -52.0
1      -55.0
2      -60.0
3      -62.0
4      -60.0
        ... 
8755    42.0
8756    34.0
8757    26.0
8758    21.0
8759    19.0
Name: Temperature, Length: 8760, dtype: float32
0       1.000000
1       1.000000
2       1.000000
3       1.000000
4       1.000000
          ...   
8755    0.946275
8756    1.000000
8757    0.983781
8758    0.928535
8759    0.953877
Name: Visibility, Length: 8760, dtype: float64


In [6]:
# Conduct basic statistical analysis:
# Descriptive statistics for the key columns, first in the transformed dataset
# (df) and then in the raw snapshot (raw_df).
key_columns = ['Temperature', 'Humidity', 'Rented Bike Count']

for heading, frame in (
    ("Transformed Data Statistics:", df),
    ("\nRaw Data Statistics:", raw_df),
):
    print(heading)
    print(frame[key_columns].describe())
# The output shows Temperature before and after the x10 transformation, because
# the pre-transformation data was saved to raw_df.
# Humidity and Rented Bike Count were not transformed, so their statistics are
# the same in both tables.


Transformed Data Statistics:
       Temperature     Humidity  Rented Bike Count
count  8760.000000  8760.000000        8760.000000
mean    128.829224    58.226256         704.602055
std     119.448181    20.362413         644.997468
min    -178.000000     0.000000           0.000000
25%      35.000000    42.000000         191.000000
50%     137.000000    57.000000         504.500000
75%     225.000000    74.000000        1065.250000
max     394.000000    98.000000        3556.000000

Raw Data Statistics:
       Temperature     Humidity  Rented Bike Count
count  8760.000000  8760.000000        8760.000000
mean     12.882922    58.226256         704.602055
std      11.944813    20.362413         644.997468
min     -17.799999     0.000000           0.000000
25%       3.500000    42.000000         191.000000
50%      13.700000    57.000000         504.500000
75%      22.500000    74.000000        1065.250000
max      39.400002    98.000000        3556.000000


In [7]:
# Identify columns that are not suitable for statistical analysis and recommend possible datatype changes
# TODO: this section has not been written yet.
#
#
# Export the processed data to a CSV file named bike_rental_processed.csv
# (index=False keeps the row index out of the file)
df.to_csv(path_or_buf='bike_rental_processed.csv', index=False)

# Prepare a short report on statistical observations and insights
# TODO: this report has not been written yet.
#
#
#

In [8]:
# Task 3: Data Analysis with Pandas

# Identify categorical and numerical variables
# Focus on columns such as Seasons, Holiday, and Functioning Day
import numpy as np
import pandas as pd

# Columns treated as categorical in this dataset:
categorical_cols = ['Seasons', 'Holiday', 'Functioning Day']

# Everything else is numerical, except the Date column.
non_numeric = set(categorical_cols) | {'Date'}
numeric_cols = [name for name in df.columns if name not in non_numeric]

print("Categorical Columns:", categorical_cols)
print("Numeric Columns:", numeric_cols)

Categorical Columns: ['Seasons', 'Holiday', 'Functioning Day']
Numeric Columns: ['Rented Bike Count', 'Hour', 'Temperature', 'Humidity', 'Wind speed', 'Visibility', 'Dew point temperature', 'Solar Radiation', 'Rainfall', 'Snowfall']


In [9]:
# Perform pivoting operations on the dataset based on categorical columns.
def _mean_rentals_by(group_col):
    """Return the mean 'Rented Bike Count' for each observed level of `group_col` in df."""
    return df.groupby(group_col, observed=True)['Rented Bike Count'].mean()


# Group by Seasons and calculate the average rented bike count
avg_bike_by_season = _mean_rentals_by('Seasons')
print("Average bike rentals per season:")
print(avg_bike_by_season)

# Analyze trends across Holiday and Functioning Day
avg_bike_by_holiday = _mean_rentals_by('Holiday')
print("Average bike rentals by whether it was a holiday:")
print(avg_bike_by_holiday)

avg_bike_by_funct = _mean_rentals_by('Functioning Day')
print("Average bike rentals by Functioning Day:")
print(avg_bike_by_funct)


Average bike rentals per season:
Seasons
Autumn     819.597985
Spring     730.031250
Summer    1034.073370
Winter     225.541204
Name: Rented Bike Count, dtype: float64
Average bike rentals by whether it was a holiday:
Holiday
Holiday       499.756944
No Holiday    715.228026
Name: Rented Bike Count, dtype: float64
Average bike rentals by Functioning Day:
Functioning Day
No       0.000000
Yes    729.156999
Name: Rented Bike Count, dtype: float64


In [10]:
# Create distribution tables:
# mean Temperature and mean Rented Bike Count by Hour (rows) and Seasons (columns).
# Note: Temperature here is the x10-scaled column produced in Task 2.
hour_by_season = dict(index='Hour', columns='Seasons', aggfunc='mean', observed=True)
pivot_temp = df.pivot_table(values='Temperature', **hour_by_season)
pivot_rented = df.pivot_table(values='Rented Bike Count', **hour_by_season)

print("\nTemperature Distribution by Hour and Seasons (Mean):")
print(pivot_temp)
print("\nRented Bike Count Distribution by Hour and Seasons (Mean):")
print(pivot_rented)

# Encode categorical variables and save data as "Rental_Bike_Data_Dummy.csv"
# drop_first=True omits the first level of each categorical column.
df_dummies = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)

# Export the encoded DataFrame to a CSV file.
df_dummies.to_csv(path_or_buf='Rental_Bike_Data_Dummy.csv', index=False)
print("\nEncoded dataset saved as 'Rental_Bike_Data_Dummy.csv'")


Temperature Distribution by Hour and Seasons (Mean):
Seasons      Autumn      Spring      Summer     Winter
Hour                                                  
0        126.296700  110.641304  247.521744 -36.099998
1        122.120880  106.782608  243.858688 -38.911110
2        118.087914  103.260872  240.978256 -41.744446
3        114.186813  100.119568  238.010864 -43.633335
4        111.186813   97.250000  235.206528 -45.644444
5        108.318680   94.782608  232.630432 -48.033333
6        105.659340   92.858696  231.152176 -50.311111
7        104.120880   94.065216  234.413040 -52.488888
8        108.318680  103.369568  245.358688 -53.266666
9        123.516487  117.858696  258.271729 -48.022221
10       142.373627  133.184784  270.608704 -33.144444
11       158.626373  147.641312  281.228271 -18.433332
12       170.450546  159.326080  291.065216  -5.666667
13       178.615387  168.293472  297.836945   2.166667
14       184.406601  173.913040  302.347839   8.711111
15       18

In [11]:
# Task 4: Data Visualization

# Import visualization libraries (matplotlib, seaborn)
# Select appropriate visualization techniques for the data:
# Bar plot for average rentals by Seasons
# Line plot showing hourly rentals throughout the day
# Heatmap showing correlation among numerical variables
# Box plot to identify outliers in Temperature and Rented Bike Count
# Record observations and insights from visualizations
# Save plots and observations for reporting purposes