In [1]:
import os
import polars as pl

# List of folder paths
folder_paths = [
    r"C:\Users\Sowjanya\OneDrive\Desktop\Capstone\DATA_2020"
]

# Create an empty list to store DataFrames
dataframes = []

# Loop through each folder
for folder_path in folder_paths:
    # Get a list of all CSV files in the folder
    file_list = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
    
    # Read each file and append to the list
    for file in file_list:
        file_path = os.path.join(folder_path, file)
        df = pl.read_csv(file_path, infer_schema_length=1000)  # Increase schema inference
        dataframes.append(df.with_columns([
            pl.col(col).cast(pl.Utf8) for col in df.columns  # Convert all columns to string
        ]))

# Concatenate all DataFrames into one
df_final = pl.concat(dataframes, how="vertical_relaxed")  # Allow mismatched schemas

# Display the first few rows
print(df_final.head())

shape: (5, 13)
┌──────────────────┬───────────────┬────────────┬────────────┬───┬───────────┬─────────┬──────────┬───────────────┐
│ ride_id          ┆ rideable_type ┆ started_at ┆ ended_at   ┆ … ┆ start_lng ┆ end_lat ┆ end_lng  ┆ member_casual │
│ ---              ┆ ---           ┆ ---        ┆ ---        ┆   ┆ ---       ┆ ---     ┆ ---      ┆ ---           │
│ str              ┆ str           ┆ str        ┆ str        ┆   ┆ str       ┆ str     ┆ str      ┆ str           │
╞══════════════════╪═══════════════╪════════════╪════════════╪═══╪═══════════╪═════════╪══════════╪═══════════════╡
│ A847FADBBC638E45 ┆ docked_bike   ┆ 2020-04-26 ┆ 2020-04-26 ┆ … ┆ -87.661   ┆ 41.9322 ┆ -87.6586 ┆ member        │
│                  ┆               ┆ 17:45:14   ┆ 18:12:03   ┆   ┆           ┆         ┆          ┆               │
│ 5405B80E996FF60D ┆ docked_bike   ┆ 2020-04-17 ┆ 2020-04-17 ┆ … ┆ -87.7154  ┆ 41.9306 ┆ -87.7238 ┆ member        │
│                  ┆               ┆ 17:08:54   ┆ 17:17:0

In [3]:
df = df_final.clone()

In [5]:
df

ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
str,str,str,str,str,str,str,str,str,str,str,str,str
"""A847FADBBC638E45""","""docked_bike""","""2020-04-26 17:45:14""","""2020-04-26 18:12:03""","""Eckhart Park""","""86""","""Lincoln Ave & Diversey Pkwy""","""152""","""41.8964""","""-87.661""","""41.9322""","""-87.6586""","""member"""
"""5405B80E996FF60D""","""docked_bike""","""2020-04-17 17:08:54""","""2020-04-17 17:17:03""","""Drake Ave & Fullerton Ave""","""503""","""Kosciuszko Park""","""499""","""41.9244""","""-87.7154""","""41.9306""","""-87.7238""","""member"""
"""5DD24A79A4E006F4""","""docked_bike""","""2020-04-01 17:54:13""","""2020-04-01 18:08:36""","""McClurg Ct & Erie St""","""142""","""Indiana Ave & Roosevelt Rd""","""255""","""41.8945""","""-87.6179""","""41.8679""","""-87.623""","""member"""
"""2A59BBDF5CDBA725""","""docked_bike""","""2020-04-07 12:50:19""","""2020-04-07 13:02:31""","""California Ave & Division St""","""216""","""Wood St & Augusta Blvd""","""657""","""41.903""","""-87.6975""","""41.8992""","""-87.6722""","""member"""
"""27AD306C119C6158""","""docked_bike""","""2020-04-18 10:22:59""","""2020-04-18 11:15:54""","""Rush St & Hubbard St""","""125""","""Sheridan Rd & Lawrence Ave""","""323""","""41.8902""","""-87.6262""","""41.9695""","""-87.6547""","""casual"""
…,…,…,…,…,…,…,…,…,…,…,…,…
"""4631EE956BCEA738""","""classic_bike""","""2020-12-19 13:59:33""","""2020-12-19 14:03:21""","""Rhodes Ave & 32nd St""","""13215""","""Indiana Ave & 31st St""","""TA1308000036""","""41.836208""","""-87.613533""","""41.838842""","""-87.621857""","""casual"""
"""D17CBEDEA8DBDFF6""","""electric_bike""","""2020-12-20 15:48:41""","""2020-12-20 15:52:14""","""Rhodes Ave & 32nd St""","""13215""","""Indiana Ave & 31st St""","""TA1308000036""","""41.836723166666665""","""-87.61336483333334""","""41.838722833333335""","""-87.6218535""","""member"""
"""447A6C67E9AF962E""","""docked_bike""","""2020-12-02 16:59:58""","""2020-12-02 17:08:28""","""Rhodes Ave & 32nd St""","""13215""","""Indiana Ave & 31st St""","""TA1308000036""","""41.836208""","""-87.613533""","""41.838842""","""-87.621857""","""member"""
"""F558C17E95751C62""","""electric_bike""","""2020-12-20 18:06:52""","""2020-12-20 18:09:53""","""Rhodes Ave & 32nd St""","""13215""","""Indiana Ave & 31st St""","""TA1308000036""","""41.836734166666666""","""-87.61339383333333""","""41.8387495""","""-87.621862""","""member"""


In [7]:
# Step 1: Count missing values in each column
missing_values = df.null_count()
print(missing_values)

shape: (1, 13)
┌─────────┬───────────────┬────────────┬──────────┬───┬───────────┬─────────┬─────────┬───────────────┐
│ ride_id ┆ rideable_type ┆ started_at ┆ ended_at ┆ … ┆ start_lng ┆ end_lat ┆ end_lng ┆ member_casual │
│ ---     ┆ ---           ┆ ---        ┆ ---      ┆   ┆ ---       ┆ ---     ┆ ---     ┆ ---           │
│ u32     ┆ u32           ┆ u32        ┆ u32      ┆   ┆ u32       ┆ u32     ┆ u32     ┆ u32           │
╞═════════╪═══════════════╪════════════╪══════════╪═══╪═══════════╪═════════╪═════════╪═══════════════╡
│ 0       ┆ 0             ┆ 0          ┆ 0        ┆ … ┆ 0         ┆ 4254    ┆ 4254    ┆ 0             │
└─────────┴───────────────┴────────────┴──────────┴───┴───────────┴─────────┴─────────┴───────────────┘


In [9]:
# Step 2: Identify and store records where both 'end_lat' and 'end_lng' are null
missing_end_lat_lng = df.filter(pl.col("end_lat").is_null() & pl.col("end_lng").is_null()).clone()

# Step 3: Remove these records from the original DataFrame
df_cleaned = df.drop_nulls(subset=["end_lat", "end_lng"])

# Display results
print(f"Number of removed records: {missing_end_lat_lng.shape[0]}")
print("\nRecords removed (stored separately):")
print(missing_end_lat_lng)

print("\nUpdated DataFrame after removal:")
print(df_cleaned)

Number of removed records: 4254

Records removed (stored separately):
shape: (4_254, 13)
┌───────────────┬───────────────┬───────────────┬───────────────┬───┬───────────────┬─────────┬─────────┬──────────────┐
│ ride_id       ┆ rideable_type ┆ started_at    ┆ ended_at      ┆ … ┆ start_lng     ┆ end_lat ┆ end_lng ┆ member_casua │
│ ---           ┆ ---           ┆ ---           ┆ ---           ┆   ┆ ---           ┆ ---     ┆ ---     ┆ l            │
│ str           ┆ str           ┆ str           ┆ str           ┆   ┆ str           ┆ str     ┆ str     ┆ ---          │
│               ┆               ┆               ┆               ┆   ┆               ┆         ┆         ┆ str          │
╞═══════════════╪═══════════════╪═══════════════╪═══════════════╪═══╪═══════════════╪═════════╪═════════╪══════════════╡
│ 5E2BD03BCA180 ┆ docked_bike   ┆ 2020-04-07    ┆ 2020-04-07    ┆ … ┆ -87.6347      ┆ null    ┆ null    ┆ member       │
│ FBA           ┆               ┆ 11:53:08      ┆ 12:28:35      

In [104]:
df_cleaned = df_cleaned.with_columns(
    pl.col("started_at").str.split(" ").alias("split_col")
).with_columns(
    pl.col("split_col").list.get(0).alias("start_date"),
    pl.col("split_col").list.get(1).alias("start_time")
).drop("split_col")  # Optional: Remove the temporary column

In [105]:
# Split 'started_at' into 'start_date' and 'start_time'
df_cleaned = df_cleaned.with_columns(
    pl.col("started_at").str.split(" ").alias("split_start")
).with_columns(
    pl.col("split_start").list.get(0).alias("start_date"),
    pl.col("split_start").list.get(1).alias("start_time")
).drop("split_start")  # Optional cleanup

# Remove milliseconds from 'start_time'
df_cleaned = df_cleaned.with_columns(
    pl.col("start_time").str.split(".").list.get(0)
)

# Convert 'start_date' and 'start_time' to datetime
df_cleaned = df_cleaned.with_columns(
    (pl.col("start_date") + " " + pl.col("start_time")).str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S").alias("started_at")
)

# Split 'ended_at' into 'end_date' and 'end_time'
df_cleaned = df_cleaned.with_columns(
    pl.col("ended_at").str.split(" ").alias("split_end")
).with_columns(
    pl.col("split_end").list.get(0).alias("end_date"),
    pl.col("split_end").list.get(1).alias("end_time")
).drop("split_end")  # Optional cleanup

# Remove milliseconds from 'end_time'
df_cleaned = df_cleaned.with_columns(
    pl.col("end_time").str.split(".").list.get(0)
)


In [110]:
# Convert 'start_date' and 'start_time' to datetime
df_cleaned = df_cleaned.with_columns(
    (pl.col("end_date") + " " + pl.col("end_time")).str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S").alias("ended_at")
)

In [126]:
df_cleaned

ride_id,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,start_date,start_time,end_date,end_time,duration
str,str,datetime[μs],datetime[μs],str,str,str,str,str,str,str,str,str,str,str,duration[μs]
"""A847FADBBC638E45""","""docked_bike""",2020-04-26 17:45:14,2020-04-26 18:12:03,"""Eckhart Park""","""Lincoln Ave & Diversey Pkwy""","""41.8964""","""-87.661""","""41.9322""","""-87.6586""","""member""","""2020-04-26""","""17:45:14""","""2020-04-26""","""18:12:03""",26m 49s
"""5405B80E996FF60D""","""docked_bike""",2020-04-17 17:08:54,2020-04-17 17:17:03,"""Drake Ave & Fullerton Ave""","""Kosciuszko Park""","""41.9244""","""-87.7154""","""41.9306""","""-87.7238""","""member""","""2020-04-17""","""17:08:54""","""2020-04-17""","""17:17:03""",8m 9s
"""5DD24A79A4E006F4""","""docked_bike""",2020-04-01 17:54:13,2020-04-01 18:08:36,"""McClurg Ct & Erie St""","""Indiana Ave & Roosevelt Rd""","""41.8945""","""-87.6179""","""41.8679""","""-87.623""","""member""","""2020-04-01""","""17:54:13""","""2020-04-01""","""18:08:36""",14m 23s
"""2A59BBDF5CDBA725""","""docked_bike""",2020-04-07 12:50:19,2020-04-07 13:02:31,"""California Ave & Division St""","""Wood St & Augusta Blvd""","""41.903""","""-87.6975""","""41.8992""","""-87.6722""","""member""","""2020-04-07""","""12:50:19""","""2020-04-07""","""13:02:31""",12m 12s
"""27AD306C119C6158""","""docked_bike""",2020-04-18 10:22:59,2020-04-18 11:15:54,"""Rush St & Hubbard St""","""Sheridan Rd & Lawrence Ave""","""41.8902""","""-87.6262""","""41.9695""","""-87.6547""","""casual""","""2020-04-18""","""10:22:59""","""2020-04-18""","""11:15:54""",52m 55s
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""4631EE956BCEA738""","""classic_bike""",2020-12-19 13:59:33,2020-12-19 14:03:21,"""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""","""41.836208""","""-87.613533""","""41.838842""","""-87.621857""","""casual""","""2020-12-19""","""13:59:33""","""2020-12-19""","""14:03:21""",3m 48s
"""D17CBEDEA8DBDFF6""","""electric_bike""",2020-12-20 15:48:41,2020-12-20 15:52:14,"""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""","""41.836723166666665""","""-87.61336483333334""","""41.838722833333335""","""-87.6218535""","""member""","""2020-12-20""","""15:48:41""","""2020-12-20""","""15:52:14""",3m 33s
"""447A6C67E9AF962E""","""docked_bike""",2020-12-02 16:59:58,2020-12-02 17:08:28,"""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""","""41.836208""","""-87.613533""","""41.838842""","""-87.621857""","""member""","""2020-12-02""","""16:59:58""","""2020-12-02""","""17:08:28""",8m 30s
"""F558C17E95751C62""","""electric_bike""",2020-12-20 18:06:52,2020-12-20 18:09:53,"""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""","""41.836734166666666""","""-87.61339383333333""","""41.8387495""","""-87.621862""","""member""","""2020-12-20""","""18:06:52""","""2020-12-20""","""18:09:53""",3m 1s


In [114]:
df_cleaned = df_cleaned.drop(["start_station_id", "end_station_id"])


In [115]:
df_cleaned = df_cleaned.with_columns(
    (pl.col("ended_at") - pl.col("started_at")).alias("duration")
)


In [130]:
df_filtered = df_cleaned.filter(pl.col("duration") <= pl.duration(minutes=2))

print(df_filtered)


shape: (83_268, 16)
┌───────────────┬───────────────┬───────────────┬──────────────┬───┬────────────┬────────────┬──────────┬──────────────┐
│ ride_id       ┆ rideable_type ┆ started_at    ┆ ended_at     ┆ … ┆ start_time ┆ end_date   ┆ end_time ┆ duration     │
│ ---           ┆ ---           ┆ ---           ┆ ---          ┆   ┆ ---        ┆ ---        ┆ ---      ┆ ---          │
│ str           ┆ str           ┆ datetime[μs]  ┆ datetime[μs] ┆   ┆ str        ┆ str        ┆ str      ┆ duration[μs] │
╞═══════════════╪═══════════════╪═══════════════╪══════════════╪═══╪════════════╪════════════╪══════════╪══════════════╡
│ 9E350A69C5FED ┆ docked_bike   ┆ 2020-04-08    ┆ 2020-04-08   ┆ … ┆ 15:01:51   ┆ 2020-04-08 ┆ 15:03:20 ┆ 1m 29s       │
│ EF3           ┆               ┆ 15:01:51      ┆ 15:03:20     ┆   ┆            ┆            ┆          ┆              │
│ 27E8A3FD802C4 ┆ docked_bike   ┆ 2020-04-21    ┆ 2020-04-21   ┆ … ┆ 16:25:19   ┆ 2020-04-21 ┆ 16:26:41 ┆ 1m 22s       │
│ C08       

In [132]:


df_filtered = df_cleaned.filter(pl.col("duration") <= pl.duration(minutes=2))

print(df_filtered)



shape: (83_268, 16)
┌───────────────┬───────────────┬───────────────┬──────────────┬───┬────────────┬────────────┬──────────┬──────────────┐
│ ride_id       ┆ rideable_type ┆ started_at    ┆ ended_at     ┆ … ┆ start_time ┆ end_date   ┆ end_time ┆ duration     │
│ ---           ┆ ---           ┆ ---           ┆ ---          ┆   ┆ ---        ┆ ---        ┆ ---      ┆ ---          │
│ str           ┆ str           ┆ datetime[μs]  ┆ datetime[μs] ┆   ┆ str        ┆ str        ┆ str      ┆ duration[μs] │
╞═══════════════╪═══════════════╪═══════════════╪══════════════╪═══╪════════════╪════════════╪══════════╪══════════════╡
│ 9E350A69C5FED ┆ docked_bike   ┆ 2020-04-08    ┆ 2020-04-08   ┆ … ┆ 15:01:51   ┆ 2020-04-08 ┆ 15:03:20 ┆ 1m 29s       │
│ EF3           ┆               ┆ 15:01:51      ┆ 15:03:20     ┆   ┆            ┆            ┆          ┆              │
│ 27E8A3FD802C4 ┆ docked_bike   ┆ 2020-04-21    ┆ 2020-04-21   ┆ … ┆ 16:25:19   ┆ 2020-04-21 ┆ 16:26:41 ┆ 1m 22s       │
│ C08       

In [117]:
df_filtered = df_cleaned.with_columns(
    pl.when(pl.col("duration") <= pl.duration(minutes=2))
    .then(pl.lit("Suspicious"))
    .otherwise(pl.lit("Fair"))
    .alias("ride_validity")
)

print(df_filtered)


shape: (3_110_542, 17)
┌───────────────┬──────────────┬──────────────┬──────────────┬───┬────────────┬──────────┬──────────────┬──────────────┐
│ ride_id       ┆ rideable_typ ┆ started_at   ┆ ended_at     ┆ … ┆ end_date   ┆ end_time ┆ duration     ┆ ride_validit │
│ ---           ┆ e            ┆ ---          ┆ ---          ┆   ┆ ---        ┆ ---      ┆ ---          ┆ y            │
│ str           ┆ ---          ┆ datetime[μs] ┆ datetime[μs] ┆   ┆ str        ┆ str      ┆ duration[μs] ┆ ---          │
│               ┆ str          ┆              ┆              ┆   ┆            ┆          ┆              ┆ str          │
╞═══════════════╪══════════════╪══════════════╪══════════════╪═══╪════════════╪══════════╪══════════════╪══════════════╡
│ A847FADBBC638 ┆ docked_bike  ┆ 2020-04-26   ┆ 2020-04-26   ┆ … ┆ 2020-04-26 ┆ 18:12:03 ┆ 26m 49s      ┆ Fair         │
│ E45           ┆              ┆ 17:45:14     ┆ 18:12:03     ┆   ┆            ┆          ┆              ┆              │
│ 5405B80

In [124]:
df_filtered


ride_id,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,start_date,start_time,end_date,end_time,duration,ride_validity
str,str,datetime[μs],datetime[μs],str,str,str,str,str,str,str,str,str,str,str,duration[μs],str
"""A847FADBBC638E45""","""docked_bike""",2020-04-26 17:45:14,2020-04-26 18:12:03,"""Eckhart Park""","""Lincoln Ave & Diversey Pkwy""","""41.8964""","""-87.661""","""41.9322""","""-87.6586""","""member""","""2020-04-26""","""17:45:14""","""2020-04-26""","""18:12:03""",26m 49s,"""Fair"""
"""5405B80E996FF60D""","""docked_bike""",2020-04-17 17:08:54,2020-04-17 17:17:03,"""Drake Ave & Fullerton Ave""","""Kosciuszko Park""","""41.9244""","""-87.7154""","""41.9306""","""-87.7238""","""member""","""2020-04-17""","""17:08:54""","""2020-04-17""","""17:17:03""",8m 9s,"""Fair"""
"""5DD24A79A4E006F4""","""docked_bike""",2020-04-01 17:54:13,2020-04-01 18:08:36,"""McClurg Ct & Erie St""","""Indiana Ave & Roosevelt Rd""","""41.8945""","""-87.6179""","""41.8679""","""-87.623""","""member""","""2020-04-01""","""17:54:13""","""2020-04-01""","""18:08:36""",14m 23s,"""Fair"""
"""2A59BBDF5CDBA725""","""docked_bike""",2020-04-07 12:50:19,2020-04-07 13:02:31,"""California Ave & Division St""","""Wood St & Augusta Blvd""","""41.903""","""-87.6975""","""41.8992""","""-87.6722""","""member""","""2020-04-07""","""12:50:19""","""2020-04-07""","""13:02:31""",12m 12s,"""Fair"""
"""27AD306C119C6158""","""docked_bike""",2020-04-18 10:22:59,2020-04-18 11:15:54,"""Rush St & Hubbard St""","""Sheridan Rd & Lawrence Ave""","""41.8902""","""-87.6262""","""41.9695""","""-87.6547""","""casual""","""2020-04-18""","""10:22:59""","""2020-04-18""","""11:15:54""",52m 55s,"""Fair"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""4631EE956BCEA738""","""classic_bike""",2020-12-19 13:59:33,2020-12-19 14:03:21,"""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""","""41.836208""","""-87.613533""","""41.838842""","""-87.621857""","""casual""","""2020-12-19""","""13:59:33""","""2020-12-19""","""14:03:21""",3m 48s,"""Fair"""
"""D17CBEDEA8DBDFF6""","""electric_bike""",2020-12-20 15:48:41,2020-12-20 15:52:14,"""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""","""41.836723166666665""","""-87.61336483333334""","""41.838722833333335""","""-87.6218535""","""member""","""2020-12-20""","""15:48:41""","""2020-12-20""","""15:52:14""",3m 33s,"""Fair"""
"""447A6C67E9AF962E""","""docked_bike""",2020-12-02 16:59:58,2020-12-02 17:08:28,"""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""","""41.836208""","""-87.613533""","""41.838842""","""-87.621857""","""member""","""2020-12-02""","""16:59:58""","""2020-12-02""","""17:08:28""",8m 30s,"""Fair"""
"""F558C17E95751C62""","""electric_bike""",2020-12-20 18:06:52,2020-12-20 18:09:53,"""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""","""41.836734166666666""","""-87.61339383333333""","""41.8387495""","""-87.621862""","""member""","""2020-12-20""","""18:06:52""","""2020-12-20""","""18:09:53""",3m 1s,"""Fair"""


In [120]:
df_cleaned = df_cleaned.with_columns([
    df_cleaned["started_at"].dt.truncate("1s").alias("started_at"),
    df_cleaned["ended_at"].dt.truncate("1s").alias("ended_at")
])

In [29]:
df = df_filtered.clone()

In [31]:
df

ride_id,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,start_date,start_time,end_date,end_time,duration,ride_validity
str,str,datetime[μs],datetime[μs],str,str,str,str,str,str,str,str,str,str,str,duration[μs],str
"""A847FADBBC638E45""","""docked_bike""",2020-04-26 17:45:14,2020-04-26 18:12:03,"""Eckhart Park""","""Lincoln Ave & Diversey Pkwy""","""41.8964""","""-87.661""","""41.9322""","""-87.6586""","""member""","""2020-04-26""","""17:45:14""","""2020-04-26""","""18:12:03""",26m 49s,"""Fair"""
"""5405B80E996FF60D""","""docked_bike""",2020-04-17 17:08:54,2020-04-17 17:17:03,"""Drake Ave & Fullerton Ave""","""Kosciuszko Park""","""41.9244""","""-87.7154""","""41.9306""","""-87.7238""","""member""","""2020-04-17""","""17:08:54""","""2020-04-17""","""17:17:03""",8m 9s,"""Fair"""
"""5DD24A79A4E006F4""","""docked_bike""",2020-04-01 17:54:13,2020-04-01 18:08:36,"""McClurg Ct & Erie St""","""Indiana Ave & Roosevelt Rd""","""41.8945""","""-87.6179""","""41.8679""","""-87.623""","""member""","""2020-04-01""","""17:54:13""","""2020-04-01""","""18:08:36""",14m 23s,"""Fair"""
"""2A59BBDF5CDBA725""","""docked_bike""",2020-04-07 12:50:19,2020-04-07 13:02:31,"""California Ave & Division St""","""Wood St & Augusta Blvd""","""41.903""","""-87.6975""","""41.8992""","""-87.6722""","""member""","""2020-04-07""","""12:50:19""","""2020-04-07""","""13:02:31""",12m 12s,"""Fair"""
"""27AD306C119C6158""","""docked_bike""",2020-04-18 10:22:59,2020-04-18 11:15:54,"""Rush St & Hubbard St""","""Sheridan Rd & Lawrence Ave""","""41.8902""","""-87.6262""","""41.9695""","""-87.6547""","""casual""","""2020-04-18""","""10:22:59""","""2020-04-18""","""11:15:54""",52m 55s,"""Fair"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""4631EE956BCEA738""","""classic_bike""",2020-12-19 13:59:33,2020-12-19 14:03:21,"""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""","""41.836208""","""-87.613533""","""41.838842""","""-87.621857""","""casual""","""2020-12-19""","""13:59:33""","""2020-12-19""","""14:03:21""",3m 48s,"""Fair"""
"""D17CBEDEA8DBDFF6""","""electric_bike""",2020-12-20 15:48:41,2020-12-20 15:52:14,"""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""","""41.836723166666665""","""-87.61336483333334""","""41.838722833333335""","""-87.6218535""","""member""","""2020-12-20""","""15:48:41""","""2020-12-20""","""15:52:14""",3m 33s,"""Fair"""
"""447A6C67E9AF962E""","""docked_bike""",2020-12-02 16:59:58,2020-12-02 17:08:28,"""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""","""41.836208""","""-87.613533""","""41.838842""","""-87.621857""","""member""","""2020-12-02""","""16:59:58""","""2020-12-02""","""17:08:28""",8m 30s,"""Fair"""
"""F558C17E95751C62""","""electric_bike""",2020-12-20 18:06:52,2020-12-20 18:09:53,"""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""","""41.836734166666666""","""-87.61339383333333""","""41.8387495""","""-87.621862""","""member""","""2020-12-20""","""18:06:52""","""2020-12-20""","""18:09:53""",3m 1s,"""Fair"""


In [33]:
data_1 = df.clone()

In [35]:
data_1

ride_id,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,start_date,start_time,end_date,end_time,duration,ride_validity
str,str,datetime[μs],datetime[μs],str,str,str,str,str,str,str,str,str,str,str,duration[μs],str
"""A847FADBBC638E45""","""docked_bike""",2020-04-26 17:45:14,2020-04-26 18:12:03,"""Eckhart Park""","""Lincoln Ave & Diversey Pkwy""","""41.8964""","""-87.661""","""41.9322""","""-87.6586""","""member""","""2020-04-26""","""17:45:14""","""2020-04-26""","""18:12:03""",26m 49s,"""Fair"""
"""5405B80E996FF60D""","""docked_bike""",2020-04-17 17:08:54,2020-04-17 17:17:03,"""Drake Ave & Fullerton Ave""","""Kosciuszko Park""","""41.9244""","""-87.7154""","""41.9306""","""-87.7238""","""member""","""2020-04-17""","""17:08:54""","""2020-04-17""","""17:17:03""",8m 9s,"""Fair"""
"""5DD24A79A4E006F4""","""docked_bike""",2020-04-01 17:54:13,2020-04-01 18:08:36,"""McClurg Ct & Erie St""","""Indiana Ave & Roosevelt Rd""","""41.8945""","""-87.6179""","""41.8679""","""-87.623""","""member""","""2020-04-01""","""17:54:13""","""2020-04-01""","""18:08:36""",14m 23s,"""Fair"""
"""2A59BBDF5CDBA725""","""docked_bike""",2020-04-07 12:50:19,2020-04-07 13:02:31,"""California Ave & Division St""","""Wood St & Augusta Blvd""","""41.903""","""-87.6975""","""41.8992""","""-87.6722""","""member""","""2020-04-07""","""12:50:19""","""2020-04-07""","""13:02:31""",12m 12s,"""Fair"""
"""27AD306C119C6158""","""docked_bike""",2020-04-18 10:22:59,2020-04-18 11:15:54,"""Rush St & Hubbard St""","""Sheridan Rd & Lawrence Ave""","""41.8902""","""-87.6262""","""41.9695""","""-87.6547""","""casual""","""2020-04-18""","""10:22:59""","""2020-04-18""","""11:15:54""",52m 55s,"""Fair"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""4631EE956BCEA738""","""classic_bike""",2020-12-19 13:59:33,2020-12-19 14:03:21,"""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""","""41.836208""","""-87.613533""","""41.838842""","""-87.621857""","""casual""","""2020-12-19""","""13:59:33""","""2020-12-19""","""14:03:21""",3m 48s,"""Fair"""
"""D17CBEDEA8DBDFF6""","""electric_bike""",2020-12-20 15:48:41,2020-12-20 15:52:14,"""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""","""41.836723166666665""","""-87.61336483333334""","""41.838722833333335""","""-87.6218535""","""member""","""2020-12-20""","""15:48:41""","""2020-12-20""","""15:52:14""",3m 33s,"""Fair"""
"""447A6C67E9AF962E""","""docked_bike""",2020-12-02 16:59:58,2020-12-02 17:08:28,"""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""","""41.836208""","""-87.613533""","""41.838842""","""-87.621857""","""member""","""2020-12-02""","""16:59:58""","""2020-12-02""","""17:08:28""",8m 30s,"""Fair"""
"""F558C17E95751C62""","""electric_bike""",2020-12-20 18:06:52,2020-12-20 18:09:53,"""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""","""41.836734166666666""","""-87.61339383333333""","""41.8387495""","""-87.621862""","""member""","""2020-12-20""","""18:06:52""","""2020-12-20""","""18:09:53""",3m 1s,"""Fair"""


##  Polars is optimized for performance, we can avoid slow row-wise operations by using a vectorized approach and efficient nearest-neighbor searches.

## Instead of iterating row by row, we can use Scipy's KDTree for fast nearest-neighbor lookups. Here’s how you can implement it in Polars:

In [38]:
import polars as pl
import numpy as np
from scipy.spatial import cKDTree  # Fast nearest-neighbor search

def find_nearest_place_polars(df: pl.DataFrame) -> pl.DataFrame:
    """
    Assigns the nearest place name to rows where 'start_station_name' is missing.
    Uses KDTree for fast nearest-neighbor lookup.
    """

    # Convert to Pandas for KDTree operations (only for lat/lng processing)
    df_pandas = df.to_pandas()

    # Extract valid and missing place data
    valid_places = df_pandas[df_pandas["start_station_name"].notna()]
    missing_places = df_pandas[df_pandas["start_station_name"].isna()]

    # Build KDTree for fast nearest neighbor search
    if not valid_places.empty:
        valid_coords = np.array(list(zip(valid_places["start_lat"], valid_places["start_lng"])))
        tree = cKDTree(valid_coords)

        # Find nearest valid station for each missing station
        if not missing_places.empty:
            missing_coords = np.array(list(zip(missing_places["start_lat"], missing_places["start_lng"])))
            _, nearest_indices = tree.query(missing_coords)

            # Assign nearest station names
            missing_places["start_station_name"] = valid_places.iloc[nearest_indices]["start_station_name"].values

            # Merge back the updated missing places
            df_pandas.loc[df_pandas["start_station_name"].isna(), "start_station_name"] = missing_places["start_station_name"]

    # Convert back to Polars
    return pl.from_pandas(df_pandas)

# Example usage
try:
    df_cleaned = pl.DataFrame(data_1)  # Ensure data_1 is defined
    df_cleaned = find_nearest_place_polars(df_cleaned)
    print(df_cleaned)
except NameError:
    print("Error: 'data_1' is not defined. Make sure your dataset exists.")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_places["start_station_name"] = valid_places.iloc[nearest_indices]["start_station_name"].values


shape: (3_110_542, 17)
┌───────────────┬──────────────┬──────────────┬──────────────┬───┬────────────┬──────────┬──────────────┬──────────────┐
│ ride_id       ┆ rideable_typ ┆ started_at   ┆ ended_at     ┆ … ┆ end_date   ┆ end_time ┆ duration     ┆ ride_validit │
│ ---           ┆ e            ┆ ---          ┆ ---          ┆   ┆ ---        ┆ ---      ┆ ---          ┆ y            │
│ str           ┆ ---          ┆ datetime[μs] ┆ datetime[μs] ┆   ┆ str        ┆ str      ┆ duration[μs] ┆ ---          │
│               ┆ str          ┆              ┆              ┆   ┆            ┆          ┆              ┆ str          │
╞═══════════════╪══════════════╪══════════════╪══════════════╪═══╪════════════╪══════════╪══════════════╪══════════════╡
│ A847FADBBC638 ┆ docked_bike  ┆ 2020-04-26   ┆ 2020-04-26   ┆ … ┆ 2020-04-26 ┆ 18:12:03 ┆ 26m 49s      ┆ Fair         │
│ E45           ┆              ┆ 17:45:14     ┆ 18:12:03     ┆   ┆            ┆          ┆              ┆              │
│ 5405B80

In [92]:
# Count missing values in start_station_name
missing_count = df_cleaned["start_station_name"].is_null().sum()
print(f"Missing values in start_station_name: {missing_count}")

Missing values in start_station_name: 0


In [44]:
import polars as pl
import numpy as np
from scipy.spatial import cKDTree  # Fast nearest-neighbor search

def find_nearest_place_polars(df: pl.DataFrame) -> pl.DataFrame:
    """
    Assigns the nearest place name to rows where 'start_station_name' and 'end_station_name' are missing.
    Uses KDTree for fast nearest-neighbor lookup.
    """

    # Convert Polars to Pandas for KDTree operations
    df_pandas = df.to_pandas()

    def fill_missing_station(df, station_col, lat_col, lng_col):
        """
        Helper function to fill missing station names based on nearest valid locations.
        """
        valid_places = df[df[station_col].notna()]
        missing_places = df[df[station_col].isna()]

        # Build KDTree for fast nearest neighbor search
        if not valid_places.empty:
            valid_coords = np.array(list(zip(valid_places[lat_col], valid_places[lng_col])))
            tree = cKDTree(valid_coords)

            # Find nearest valid station for each missing station
            if not missing_places.empty:
                missing_coords = np.array(list(zip(missing_places[lat_col], missing_places[lng_col])))
                _, nearest_indices = tree.query(missing_coords)

                # Assign nearest station names
                missing_places[station_col] = valid_places.iloc[nearest_indices][station_col].values

                # Merge back the updated missing places
                df.loc[df[station_col].isna(), station_col] = missing_places[station_col]

    # Fill missing values for start and end stations
    fill_missing_station(df_pandas, "start_station_name", "start_lat", "start_lng")
    fill_missing_station(df_pandas, "end_station_name", "end_lat", "end_lng")

    # Convert back to Polars
    return pl.from_pandas(df_pandas)

# Example usage
try:
    df_cleaned = pl.DataFrame(data_1)  # Ensure data_1 is defined
    df_cleaned = find_nearest_place_polars(df_cleaned)
    print(df_cleaned)
except NameError:
    print("Error: 'data_1' is not defined. Make sure your dataset exists.")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_places[station_col] = valid_places.iloc[nearest_indices][station_col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_places[station_col] = valid_places.iloc[nearest_indices][station_col].values


shape: (3_110_542, 17)
┌───────────────┬──────────────┬──────────────┬──────────────┬───┬────────────┬──────────┬──────────────┬──────────────┐
│ ride_id       ┆ rideable_typ ┆ started_at   ┆ ended_at     ┆ … ┆ end_date   ┆ end_time ┆ duration     ┆ ride_validit │
│ ---           ┆ e            ┆ ---          ┆ ---          ┆   ┆ ---        ┆ ---      ┆ ---          ┆ y            │
│ str           ┆ ---          ┆ datetime[μs] ┆ datetime[μs] ┆   ┆ str        ┆ str      ┆ duration[μs] ┆ ---          │
│               ┆ str          ┆              ┆              ┆   ┆            ┆          ┆              ┆ str          │
╞═══════════════╪══════════════╪══════════════╪══════════════╪═══╪════════════╪══════════╪══════════════╪══════════════╡
│ A847FADBBC638 ┆ docked_bike  ┆ 2020-04-26   ┆ 2020-04-26   ┆ … ┆ 2020-04-26 ┆ 18:12:03 ┆ 26m 49s      ┆ Fair         │
│ E45           ┆              ┆ 17:45:14     ┆ 18:12:03     ┆   ┆            ┆          ┆              ┆              │
│ 5405B80

In [46]:
# Count missing values in start_station_name
missing_count = df_cleaned["end_station_name"].is_null().sum()
print(f"Missing values in end_station_name: {missing_count}")

Missing values in end_station_name: 0


In [48]:
df_cleaned = df_cleaned.with_columns(
    df_cleaned["duration"].dt.total_seconds().cast(pl.Float64).alias("duration_seconds")
)

In [50]:
df_cleaned.schema

Schema([('ride_id', String),
        ('rideable_type', String),
        ('started_at', Datetime(time_unit='us', time_zone=None)),
        ('ended_at', Datetime(time_unit='us', time_zone=None)),
        ('start_station_name', String),
        ('end_station_name', String),
        ('start_lat', String),
        ('start_lng', String),
        ('end_lat', String),
        ('end_lng', String),
        ('member_casual', String),
        ('start_date', String),
        ('start_time', String),
        ('end_date', String),
        ('end_time', String),
        ('duration', Duration(time_unit='us')),
        ('ride_validity', String),
        ('duration_seconds', Float64)])

In [52]:
df_cleaned

ride_id,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,start_date,start_time,end_date,end_time,duration,ride_validity,duration_seconds
str,str,datetime[μs],datetime[μs],str,str,str,str,str,str,str,str,str,str,str,duration[μs],str,f64
"""A847FADBBC638E45""","""docked_bike""",2020-04-26 17:45:14,2020-04-26 18:12:03,"""Eckhart Park""","""Lincoln Ave & Diversey Pkwy""","""41.8964""","""-87.661""","""41.9322""","""-87.6586""","""member""","""2020-04-26""","""17:45:14""","""2020-04-26""","""18:12:03""",26m 49s,"""Fair""",1609.0
"""5405B80E996FF60D""","""docked_bike""",2020-04-17 17:08:54,2020-04-17 17:17:03,"""Drake Ave & Fullerton Ave""","""Kosciuszko Park""","""41.9244""","""-87.7154""","""41.9306""","""-87.7238""","""member""","""2020-04-17""","""17:08:54""","""2020-04-17""","""17:17:03""",8m 9s,"""Fair""",489.0
"""5DD24A79A4E006F4""","""docked_bike""",2020-04-01 17:54:13,2020-04-01 18:08:36,"""McClurg Ct & Erie St""","""Indiana Ave & Roosevelt Rd""","""41.8945""","""-87.6179""","""41.8679""","""-87.623""","""member""","""2020-04-01""","""17:54:13""","""2020-04-01""","""18:08:36""",14m 23s,"""Fair""",863.0
"""2A59BBDF5CDBA725""","""docked_bike""",2020-04-07 12:50:19,2020-04-07 13:02:31,"""California Ave & Division St""","""Wood St & Augusta Blvd""","""41.903""","""-87.6975""","""41.8992""","""-87.6722""","""member""","""2020-04-07""","""12:50:19""","""2020-04-07""","""13:02:31""",12m 12s,"""Fair""",732.0
"""27AD306C119C6158""","""docked_bike""",2020-04-18 10:22:59,2020-04-18 11:15:54,"""Rush St & Hubbard St""","""Sheridan Rd & Lawrence Ave""","""41.8902""","""-87.6262""","""41.9695""","""-87.6547""","""casual""","""2020-04-18""","""10:22:59""","""2020-04-18""","""11:15:54""",52m 55s,"""Fair""",3175.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""4631EE956BCEA738""","""classic_bike""",2020-12-19 13:59:33,2020-12-19 14:03:21,"""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""","""41.836208""","""-87.613533""","""41.838842""","""-87.621857""","""casual""","""2020-12-19""","""13:59:33""","""2020-12-19""","""14:03:21""",3m 48s,"""Fair""",228.0
"""D17CBEDEA8DBDFF6""","""electric_bike""",2020-12-20 15:48:41,2020-12-20 15:52:14,"""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""","""41.836723166666665""","""-87.61336483333334""","""41.838722833333335""","""-87.6218535""","""member""","""2020-12-20""","""15:48:41""","""2020-12-20""","""15:52:14""",3m 33s,"""Fair""",213.0
"""447A6C67E9AF962E""","""docked_bike""",2020-12-02 16:59:58,2020-12-02 17:08:28,"""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""","""41.836208""","""-87.613533""","""41.838842""","""-87.621857""","""member""","""2020-12-02""","""16:59:58""","""2020-12-02""","""17:08:28""",8m 30s,"""Fair""",510.0
"""F558C17E95751C62""","""electric_bike""",2020-12-20 18:06:52,2020-12-20 18:09:53,"""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""","""41.836734166666666""","""-87.61339383333333""","""41.8387495""","""-87.621862""","""member""","""2020-12-20""","""18:06:52""","""2020-12-20""","""18:09:53""",3m 1s,"""Fair""",181.0


In [54]:
df_cleaned = df_cleaned.drop("duration_seconds")


In [56]:
df_cleaned = df_cleaned.with_columns([
    df_cleaned["started_at"].cast(pl.Utf8).alias("started_at"),
    df_cleaned["ended_at"].cast(pl.Utf8).alias("ended_at"),
    df_cleaned["duration"].dt.total_seconds().cast(pl.Utf8).alias("duration")  # Convert to seconds, then string
])


In [57]:
df_cleaned

ride_id,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,start_date,start_time,end_date,end_time,duration,ride_validity
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""A847FADBBC638E45""","""docked_bike""","""2020-04-26 17:45:14.000000""","""2020-04-26 18:12:03.000000""","""Eckhart Park""","""Lincoln Ave & Diversey Pkwy""","""41.8964""","""-87.661""","""41.9322""","""-87.6586""","""member""","""2020-04-26""","""17:45:14""","""2020-04-26""","""18:12:03""","""1609""","""Fair"""
"""5405B80E996FF60D""","""docked_bike""","""2020-04-17 17:08:54.000000""","""2020-04-17 17:17:03.000000""","""Drake Ave & Fullerton Ave""","""Kosciuszko Park""","""41.9244""","""-87.7154""","""41.9306""","""-87.7238""","""member""","""2020-04-17""","""17:08:54""","""2020-04-17""","""17:17:03""","""489""","""Fair"""
"""5DD24A79A4E006F4""","""docked_bike""","""2020-04-01 17:54:13.000000""","""2020-04-01 18:08:36.000000""","""McClurg Ct & Erie St""","""Indiana Ave & Roosevelt Rd""","""41.8945""","""-87.6179""","""41.8679""","""-87.623""","""member""","""2020-04-01""","""17:54:13""","""2020-04-01""","""18:08:36""","""863""","""Fair"""
"""2A59BBDF5CDBA725""","""docked_bike""","""2020-04-07 12:50:19.000000""","""2020-04-07 13:02:31.000000""","""California Ave & Division St""","""Wood St & Augusta Blvd""","""41.903""","""-87.6975""","""41.8992""","""-87.6722""","""member""","""2020-04-07""","""12:50:19""","""2020-04-07""","""13:02:31""","""732""","""Fair"""
"""27AD306C119C6158""","""docked_bike""","""2020-04-18 10:22:59.000000""","""2020-04-18 11:15:54.000000""","""Rush St & Hubbard St""","""Sheridan Rd & Lawrence Ave""","""41.8902""","""-87.6262""","""41.9695""","""-87.6547""","""casual""","""2020-04-18""","""10:22:59""","""2020-04-18""","""11:15:54""","""3175""","""Fair"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""4631EE956BCEA738""","""classic_bike""","""2020-12-19 13:59:33.000000""","""2020-12-19 14:03:21.000000""","""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""","""41.836208""","""-87.613533""","""41.838842""","""-87.621857""","""casual""","""2020-12-19""","""13:59:33""","""2020-12-19""","""14:03:21""","""228""","""Fair"""
"""D17CBEDEA8DBDFF6""","""electric_bike""","""2020-12-20 15:48:41.000000""","""2020-12-20 15:52:14.000000""","""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""","""41.836723166666665""","""-87.61336483333334""","""41.838722833333335""","""-87.6218535""","""member""","""2020-12-20""","""15:48:41""","""2020-12-20""","""15:52:14""","""213""","""Fair"""
"""447A6C67E9AF962E""","""docked_bike""","""2020-12-02 16:59:58.000000""","""2020-12-02 17:08:28.000000""","""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""","""41.836208""","""-87.613533""","""41.838842""","""-87.621857""","""member""","""2020-12-02""","""16:59:58""","""2020-12-02""","""17:08:28""","""510""","""Fair"""
"""F558C17E95751C62""","""electric_bike""","""2020-12-20 18:06:52.000000""","""2020-12-20 18:09:53.000000""","""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""","""41.836734166666666""","""-87.61339383333333""","""41.8387495""","""-87.621862""","""member""","""2020-12-20""","""18:06:52""","""2020-12-20""","""18:09:53""","""181""","""Fair"""


In [58]:
df_cleaned = df_cleaned.with_columns([
    df_cleaned["started_at"].str.slice(0, 19).alias("started_at"),  # Keep only YYYY-MM-DD HH:MM:SS
    df_cleaned["ended_at"].str.slice(0, 19).alias("ended_at")
])


In [59]:
df_cleaned

ride_id,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,start_date,start_time,end_date,end_time,duration,ride_validity
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""A847FADBBC638E45""","""docked_bike""","""2020-04-26 17:45:14""","""2020-04-26 18:12:03""","""Eckhart Park""","""Lincoln Ave & Diversey Pkwy""","""41.8964""","""-87.661""","""41.9322""","""-87.6586""","""member""","""2020-04-26""","""17:45:14""","""2020-04-26""","""18:12:03""","""1609""","""Fair"""
"""5405B80E996FF60D""","""docked_bike""","""2020-04-17 17:08:54""","""2020-04-17 17:17:03""","""Drake Ave & Fullerton Ave""","""Kosciuszko Park""","""41.9244""","""-87.7154""","""41.9306""","""-87.7238""","""member""","""2020-04-17""","""17:08:54""","""2020-04-17""","""17:17:03""","""489""","""Fair"""
"""5DD24A79A4E006F4""","""docked_bike""","""2020-04-01 17:54:13""","""2020-04-01 18:08:36""","""McClurg Ct & Erie St""","""Indiana Ave & Roosevelt Rd""","""41.8945""","""-87.6179""","""41.8679""","""-87.623""","""member""","""2020-04-01""","""17:54:13""","""2020-04-01""","""18:08:36""","""863""","""Fair"""
"""2A59BBDF5CDBA725""","""docked_bike""","""2020-04-07 12:50:19""","""2020-04-07 13:02:31""","""California Ave & Division St""","""Wood St & Augusta Blvd""","""41.903""","""-87.6975""","""41.8992""","""-87.6722""","""member""","""2020-04-07""","""12:50:19""","""2020-04-07""","""13:02:31""","""732""","""Fair"""
"""27AD306C119C6158""","""docked_bike""","""2020-04-18 10:22:59""","""2020-04-18 11:15:54""","""Rush St & Hubbard St""","""Sheridan Rd & Lawrence Ave""","""41.8902""","""-87.6262""","""41.9695""","""-87.6547""","""casual""","""2020-04-18""","""10:22:59""","""2020-04-18""","""11:15:54""","""3175""","""Fair"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""4631EE956BCEA738""","""classic_bike""","""2020-12-19 13:59:33""","""2020-12-19 14:03:21""","""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""","""41.836208""","""-87.613533""","""41.838842""","""-87.621857""","""casual""","""2020-12-19""","""13:59:33""","""2020-12-19""","""14:03:21""","""228""","""Fair"""
"""D17CBEDEA8DBDFF6""","""electric_bike""","""2020-12-20 15:48:41""","""2020-12-20 15:52:14""","""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""","""41.836723166666665""","""-87.61336483333334""","""41.838722833333335""","""-87.6218535""","""member""","""2020-12-20""","""15:48:41""","""2020-12-20""","""15:52:14""","""213""","""Fair"""
"""447A6C67E9AF962E""","""docked_bike""","""2020-12-02 16:59:58""","""2020-12-02 17:08:28""","""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""","""41.836208""","""-87.613533""","""41.838842""","""-87.621857""","""member""","""2020-12-02""","""16:59:58""","""2020-12-02""","""17:08:28""","""510""","""Fair"""
"""F558C17E95751C62""","""electric_bike""","""2020-12-20 18:06:52""","""2020-12-20 18:09:53""","""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""","""41.836734166666666""","""-87.61339383333333""","""41.8387495""","""-87.621862""","""member""","""2020-12-20""","""18:06:52""","""2020-12-20""","""18:09:53""","""181""","""Fair"""


In [65]:
df_cleaned.write_csv("cleaned_cyclistic_data_2020.csv")
print("File saved as cleaned_cyclistic_data_2020.csv")


File saved as cleaned_cyclistic_data_2020.csv


In [66]:
# Count the number of null values in each column
df_cleaned.select([pl.col(col).is_null().sum().alias(f"{col}_null_count") for col in df_cleaned.columns])


ride_id_null_count,rideable_type_null_count,started_at_null_count,ended_at_null_count,start_station_name_null_count,end_station_name_null_count,start_lat_null_count,start_lng_null_count,end_lat_null_count,end_lng_null_count,member_casual_null_count,start_date_null_count,start_time_null_count,end_date_null_count,end_time_null_count,duration_null_count,ride_validity_null_count
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [1]:
import polars as pl

In [5]:
_2020 = pl.read_csv(r"C:\Users\Sowjanya\Documents\BIIKE-SHARE-DATA\all_files_Cyclist\final_2020.csv")

In [7]:
_2020

Unnamed: 0_level_0,ride_id,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,start_date,start_time,end_date,end_time,duration,ride_validity,day_period,month_name,season,weekday,week_type,duration_seconds,duration_minutes,total_hours
i64,str,str,str,str,str,str,f64,f64,f64,f64,str,str,str,str,str,i64,str,str,str,str,str,str,f64,f64,f64
0,"""A847FADBBC638E45""","""docked_bike""","""2020-04-26 17:45:14""","""2020-04-26 18:12:03""","""Eckhart Park""","""Lincoln Ave & Diversey Pkwy""",41.8964,-87.661,41.9322,-87.6586,"""member""","""2020-04-26""","""17:45:14""","""2020-04-26""","""18:12:03""",1609,"""Fair""","""Evening""","""April""","""Spring""","""Sunday""","""Weekend""",1609.0,26.49,0.0
1,"""5405B80E996FF60D""","""docked_bike""","""2020-04-17 17:08:54""","""2020-04-17 17:17:03""","""Drake Ave & Fullerton Ave""","""Kosciuszko Park""",41.9244,-87.7154,41.9306,-87.7238,"""member""","""2020-04-17""","""17:08:54""","""2020-04-17""","""17:17:03""",489,"""Fair""","""Evening""","""April""","""Spring""","""Friday""","""Weekday""",489.0,8.09,0.0
2,"""5DD24A79A4E006F4""","""docked_bike""","""2020-04-01 17:54:13""","""2020-04-01 18:08:36""","""McClurg Ct & Erie St""","""Indiana Ave & Roosevelt Rd""",41.8945,-87.6179,41.8679,-87.623,"""member""","""2020-04-01""","""17:54:13""","""2020-04-01""","""18:08:36""",863,"""Fair""","""Evening""","""April""","""Spring""","""Wednesday""","""Weekday""",863.0,14.23,0.0
3,"""2A59BBDF5CDBA725""","""docked_bike""","""2020-04-07 12:50:19""","""2020-04-07 13:02:31""","""California Ave & Division St""","""Wood St & Augusta Blvd""",41.903,-87.6975,41.8992,-87.6722,"""member""","""2020-04-07""","""12:50:19""","""2020-04-07""","""13:02:31""",732,"""Fair""","""Afternoon""","""April""","""Spring""","""Tuesday""","""Weekday""",732.0,12.12,0.0
4,"""27AD306C119C6158""","""docked_bike""","""2020-04-18 10:22:59""","""2020-04-18 11:15:54""","""Rush St & Hubbard St""","""Sheridan Rd & Lawrence Ave""",41.8902,-87.6262,41.9695,-87.6547,"""casual""","""2020-04-18""","""10:22:59""","""2020-04-18""","""11:15:54""",3175,"""Fair""","""Morning""","""April""","""Spring""","""Saturday""","""Weekend""",3175.0,52.55,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
3110537,"""4631EE956BCEA738""","""classic_bike""","""2020-12-19 13:59:33""","""2020-12-19 14:03:21""","""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""",41.836208,-87.613533,41.838842,-87.621857,"""casual""","""2020-12-19""","""13:59:33""","""2020-12-19""","""14:03:21""",228,"""Fair""","""Afternoon""","""December""","""Winter""","""Saturday""","""Weekend""",228.0,3.48,0.0
3110538,"""D17CBEDEA8DBDFF6""","""electric_bike""","""2020-12-20 15:48:41""","""2020-12-20 15:52:14""","""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""",41.836723,-87.613365,41.838723,-87.621854,"""member""","""2020-12-20""","""15:48:41""","""2020-12-20""","""15:52:14""",213,"""Fair""","""Afternoon""","""December""","""Winter""","""Sunday""","""Weekend""",213.0,3.33,0.0
3110539,"""447A6C67E9AF962E""","""docked_bike""","""2020-12-02 16:59:58""","""2020-12-02 17:08:28""","""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""",41.836208,-87.613533,41.838842,-87.621857,"""member""","""2020-12-02""","""16:59:58""","""2020-12-02""","""17:08:28""",510,"""Fair""","""Afternoon""","""December""","""Winter""","""Wednesday""","""Weekday""",510.0,8.3,0.0
3110540,"""F558C17E95751C62""","""electric_bike""","""2020-12-20 18:06:52""","""2020-12-20 18:09:53""","""Rhodes Ave & 32nd St""","""Indiana Ave & 31st St""",41.836734,-87.613394,41.838749,-87.621862,"""member""","""2020-12-20""","""18:06:52""","""2020-12-20""","""18:09:53""",181,"""Fair""","""Evening""","""December""","""Winter""","""Sunday""","""Weekend""",181.0,3.01,0.0


pip install sqlalchemy

In [41]:
from sqlalchemy import create_engine

In [42]:
pip install sqlalchemy mysql-connector-python pymysql


Note: you may need to restart the kernel to use updated packages.


import mysql.connector

# Connection without specifying the database
connection = mysql.connector.connect(
    host="localhost",
    port=3305,
    user="root",
    password="root"
)

cursor = connection.cursor()

# Create database if not exists
cursor.execute("CREATE DATABASE IF NOT EXISTS bike_share")

# Now connect to the database
connection.database = "bike_share"

print("Database connected successfully!")

In [54]:
import mysql.connector

# Connect to MySQL database
connection = mysql.connector.connect(
    host="localhost",
    port=3305,
    user="root",
    password="root",
    database="bike_share"
)
cursor = connection.cursor()

# Create table query
create_table_query = """
CREATE TABLE IF NOT EXISTS bike_rides_2020 (
    ride_id TEXT,
    rideable_type TEXT,
    started_at TEXT,
    ended_at TEXT,
    start_station_name TEXT,
    end_station_name TEXT,
    start_lat DOUBLE,
    start_lng DOUBLE,
    end_lat DOUBLE,
    end_lng DOUBLE,
    member_casual TEXT,
    start_date TEXT,
    start_time TEXT,
    end_date TEXT,
    end_time TEXT,
    duration INT,
    ride_validity TEXT
);
"""

# Execute the query
cursor.execute(create_table_query)
connection.commit()

print("✅ Table 'bike_rides_2020' created successfully!")

# Close connection
cursor.close()
connection.close()


✅ Table 'bike_rides_2020' created successfully!


In [58]:
import mysql.connector

# Assuming df_cleaned is your cleaned Polars DataFrame
df_cleaned = df_cleaned.unique()  # Ensure unique records

# Convert Polars DataFrame to a list of tuples
data = [tuple(row) for row in df_cleaned.to_numpy()]

# Connect to MySQL without specifying the database first
connection = mysql.connector.connect(
    host="localhost",
    port=3305,
    user="root",
    password="root"
)
cursor = connection.cursor()

# Create database if not exists
cursor.execute("CREATE DATABASE IF NOT EXISTS bike_share")

# Connect to the database
connection.database = "bike_share"

print("✅ Database connected successfully!")

# Ensure the `bike_rides_2020` table exists (Modify the schema if needed)
cursor.execute("""
CREATE TABLE IF NOT EXISTS bike_rides_2020 (
    ride_id TEXT PRIMARY KEY,
    rideable_type TEXT,
    started_at TEXT,
    ended_at TEXT,
    start_station_name TEXT,
    end_station_name TEXT,
    start_lat DOUBLE,
    start_lng DOUBLE,
    end_lat DOUBLE,
    end_lng DOUBLE,
    member_casual TEXT,
    start_date TEXT,
    start_time TEXT,
    end_date TEXT,
    end_time TEXT,
    duration INT,
    ride_validity TEXT
)
""")

print("✅ Table checked/created successfully!")

# SQL Insert Query (Ensure column names match your table)
insert_stmt = """
INSERT IGNORE INTO bike_rides_2020 (
    ride_id, rideable_type, started_at, ended_at, start_station_name, 
    end_station_name, start_lat, start_lng, end_lat, end_lng, 
    member_casual, start_date, start_time, end_date, end_time, 
    duration, ride_validity
) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""

# Insert data in batches for efficiency
batch_size = 1000
for i in range(0, len(data), batch_size):
    batch = data[i:i + batch_size]
    cursor.executemany(insert_stmt, batch)
    connection.commit()

print(f"✅ Inserted {len(data)} unique records successfully into `bike_rides_2020`!")

# Close connection
cursor.close()
connection.close()


✅ Database connected successfully!
✅ Table checked/created successfully!
✅ Inserted 3110542 unique records successfully into `bike_rides_2020`!
