In [1]:
import os
import polars as pl

# List of folder paths
folder_paths = [
    r"C:\Users\Sowjanya\OneDrive\Desktop\Capstone\DATA_2024"
]

# Create an empty list to store DataFrames
dataframes = []

# Loop through each folder
for folder_path in folder_paths:
    # Get a list of all CSV files in the folder
    file_list = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
    
    # Read each file and append to the list
    for file in file_list:
        file_path = os.path.join(folder_path, file)
        df = pl.read_csv(file_path, infer_schema_length=10000)  # Increase schema inference
        dataframes.append(df.with_columns([
            pl.col(col).cast(pl.Utf8) for col in df.columns  # Convert all columns to string
        ]))

# Concatenate all DataFrames into one
df_final = pl.concat(dataframes, how="vertical_relaxed")  # Allow mismatched schemas

# Display the first few rows
print(df_final.head())

shape: (5, 13)
┌──────────────┬──────────────┬──────────────┬─────────────┬───┬─────────────┬─────────────┬─────────────┬─────────────┐
│ ride_id      ┆ rideable_typ ┆ started_at   ┆ ended_at    ┆ … ┆ start_lng   ┆ end_lat     ┆ end_lng     ┆ member_casu │
│ ---          ┆ e            ┆ ---          ┆ ---         ┆   ┆ ---         ┆ ---         ┆ ---         ┆ al          │
│ str          ┆ ---          ┆ str          ┆ str         ┆   ┆ str         ┆ str         ┆ str         ┆ ---         │
│              ┆ str          ┆              ┆             ┆   ┆             ┆             ┆             ┆ str         │
╞══════════════╪══════════════╪══════════════╪═════════════╪═══╪═════════════╪═════════════╪═════════════╪═════════════╡
│ C1D650626C8C ┆ electric_bik ┆ 2024-01-12   ┆ 2024-01-12  ┆ … ┆ -87.6347367 ┆ 41.88917683 ┆ -87.6385057 ┆ member      │
│ 899A         ┆ e            ┆ 15:30:27     ┆ 15:37:59    ┆   ┆ 76          ┆ 258         ┆ 718         ┆             │
│ EECD38BDB25B ┆ 

In [3]:
df = df_final.clone()

In [5]:
df

ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
str,str,str,str,str,str,str,str,str,str,str,str,str
"""C1D650626C8C899A""","""electric_bike""","""2024-01-12 15:30:27""","""2024-01-12 15:37:59""","""Wells St & Elm St""","""KA1504000135""","""Kingsbury St & Kinzie St""","""KA1503000043""","""41.903267384""","""-87.634736776""","""41.88917683258""","""-87.6385057718""","""member"""
"""EECD38BDB25BFCB0""","""electric_bike""","""2024-01-08 15:45:46""","""2024-01-08 15:52:59""","""Wells St & Elm St""","""KA1504000135""","""Kingsbury St & Kinzie St""","""KA1503000043""","""41.9029365""","""-87.63444016666666""","""41.88917683258""","""-87.6385057718""","""member"""
"""F4A9CE78061F17F7""","""electric_bike""","""2024-01-27 12:27:19""","""2024-01-27 12:35:19""","""Wells St & Elm St""","""KA1504000135""","""Kingsbury St & Kinzie St""","""KA1503000043""","""41.902951333333334""","""-87.63447033333334""","""41.88917683258""","""-87.6385057718""","""member"""
"""0A0D9E15EE50B171""","""classic_bike""","""2024-01-29 16:26:17""","""2024-01-29 16:56:06""","""Wells St & Randolph St""","""TA1305000030""","""Larrabee St & Webster Ave""","""13193""","""41.884295""","""-87.633963""","""41.921822""","""-87.64414""","""member"""
"""33FFC9805E3EFF9A""","""classic_bike""","""2024-01-31 05:43:23""","""2024-01-31 06:09:35""","""Lincoln Ave & Waveland Ave""","""13253""","""Kingsbury St & Kinzie St""","""KA1503000043""","""41.948797""","""-87.675278""","""41.88917683258""","""-87.6385057718""","""member"""
…,…,…,…,…,…,…,…,…,…,…,…,…
"""BD56BA20F42E4794""","""electric_bike""","""2024-12-11 08:23:46.564""","""2024-12-11 08:37:34.532""","""Clybourn Ave & Division St""","""TA1307000115""",,,"""41.904633522""","""-87.64051795""","""41.88""","""-87.63""","""member"""
"""3074643A6B60B300""","""electric_bike""","""2024-12-09 12:26:15.677""","""2024-12-09 12:37:32.712""","""Canal St & Jackson Blvd""","""13138""",,,"""41.878125""","""-87.639968""","""41.9""","""-87.62""","""member"""
"""15602635C5DF484E""","""electric_bike""","""2024-12-31 17:10:03.113""","""2024-12-31 17:17:21.838""","""Albany Ave & Bloomingdale Ave""","""15655""","""California Ave & Milwaukee Ave""","""13084""","""41.91402671273""","""-87.705126462""","""41.922695""","""-87.697153""","""member"""
"""F15ABBA961560B75""","""electric_bike""","""2024-12-01 14:39:47.216""","""2024-12-01 14:45:21.268""","""Albany Ave & Bloomingdale Ave""","""15655""","""California Ave & Milwaukee Ave""","""13084""","""41.914002657""","""-87.705099225""","""41.922695""","""-87.697153""","""member"""


In [7]:
# Step 1: Count missing values in each column
missing_values = df.null_count()
print(missing_values)

shape: (1, 13)
┌─────────┬───────────────┬────────────┬──────────┬───┬───────────┬─────────┬─────────┬───────────────┐
│ ride_id ┆ rideable_type ┆ started_at ┆ ended_at ┆ … ┆ start_lng ┆ end_lat ┆ end_lng ┆ member_casual │
│ ---     ┆ ---           ┆ ---        ┆ ---      ┆   ┆ ---       ┆ ---     ┆ ---     ┆ ---           │
│ u32     ┆ u32           ┆ u32        ┆ u32      ┆   ┆ u32       ┆ u32     ┆ u32     ┆ u32           │
╞═════════╪═══════════════╪════════════╪══════════╪═══╪═══════════╪═════════╪═════════╪═══════════════╡
│ 0       ┆ 0             ┆ 0          ┆ 0        ┆ … ┆ 0         ┆ 7232    ┆ 7232    ┆ 0             │
└─────────┴───────────────┴────────────┴──────────┴───┴───────────┴─────────┴─────────┴───────────────┘


In [9]:
# Step 2: Identify and store records where both 'end_lat' and 'end_lng' are null
missing_end_lat_lng = df.filter(pl.col("end_lat").is_null() & pl.col("end_lng").is_null()).clone()

# Step 3: Remove these records from the original DataFrame
df_cleaned = df.drop_nulls(subset=["end_lat", "end_lng"])

# Display results
print(f"Number of removed records: {missing_end_lat_lng.shape[0]}")
print("\nRecords removed (stored separately):")
print(missing_end_lat_lng)

print("\nUpdated DataFrame after removal:")
print(df_cleaned)

Number of removed records: 7232

Records removed (stored separately):
shape: (7_232, 13)
┌───────────────┬───────────────┬───────────────┬───────────────┬───┬───────────────┬─────────┬─────────┬──────────────┐
│ ride_id       ┆ rideable_type ┆ started_at    ┆ ended_at      ┆ … ┆ start_lng     ┆ end_lat ┆ end_lng ┆ member_casua │
│ ---           ┆ ---           ┆ ---           ┆ ---           ┆   ┆ ---           ┆ ---     ┆ ---     ┆ l            │
│ str           ┆ str           ┆ str           ┆ str           ┆   ┆ str           ┆ str     ┆ str     ┆ ---          │
│               ┆               ┆               ┆               ┆   ┆               ┆         ┆         ┆ str          │
╞═══════════════╪═══════════════╪═══════════════╪═══════════════╪═══╪═══════════════╪═════════╪═════════╪══════════════╡
│ 16D623E15D50B ┆ classic_bike  ┆ 2024-01-17    ┆ 2024-01-18    ┆ … ┆ -87.634324    ┆ null    ┆ null    ┆ member       │
│ DC4           ┆               ┆ 07:27:37      ┆ 08:27:28      

In [11]:
df_cleaned = df_cleaned.with_columns(
    pl.col("started_at").str.split(" ").alias("split_col")
).with_columns(
    pl.col("split_col").list.get(0).alias("start_date"),
    pl.col("split_col").list.get(1).alias("start_time")
).drop("split_col")  # Optional: Remove the temporary column

In [13]:
# Split 'started_at' into 'start_date' and 'start_time'
df_cleaned = df_cleaned.with_columns(
    pl.col("started_at").str.split(" ").alias("split_start")
).with_columns(
    pl.col("split_start").list.get(0).alias("start_date"),
    pl.col("split_start").list.get(1).alias("start_time")
).drop("split_start")  # Optional cleanup

# Remove milliseconds from 'start_time'
df_cleaned = df_cleaned.with_columns(
    pl.col("start_time").str.split(".").list.get(0)
)

# Convert 'start_date' and 'start_time' to datetime
df_cleaned = df_cleaned.with_columns(
    (pl.col("start_date") + " " + pl.col("start_time")).str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S").alias("started_at")
)

# Split 'ended_at' into 'end_date' and 'end_time'
df_cleaned = df_cleaned.with_columns(
    pl.col("ended_at").str.split(" ").alias("split_end")
).with_columns(
    pl.col("split_end").list.get(0).alias("end_date"),
    pl.col("split_end").list.get(1).alias("end_time")
).drop("split_end")  # Optional cleanup

# Remove milliseconds from 'end_time'
df_cleaned = df_cleaned.with_columns(
    pl.col("end_time").str.split(".").list.get(0)
)


In [15]:
# Convert 'start_date' and 'start_time' to datetime
df_cleaned = df_cleaned.with_columns(
    (pl.col("end_date") + " " + pl.col("end_time")).str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S").alias("ended_at")
)

In [17]:
df_cleaned

ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,start_date,start_time,end_date,end_time
str,str,datetime[μs],datetime[μs],str,str,str,str,str,str,str,str,str,str,str,str,str
"""C1D650626C8C899A""","""electric_bike""",2024-01-12 15:30:27,2024-01-12 15:37:59,"""Wells St & Elm St""","""KA1504000135""","""Kingsbury St & Kinzie St""","""KA1503000043""","""41.903267384""","""-87.634736776""","""41.88917683258""","""-87.6385057718""","""member""","""2024-01-12""","""15:30:27""","""2024-01-12""","""15:37:59"""
"""EECD38BDB25BFCB0""","""electric_bike""",2024-01-08 15:45:46,2024-01-08 15:52:59,"""Wells St & Elm St""","""KA1504000135""","""Kingsbury St & Kinzie St""","""KA1503000043""","""41.9029365""","""-87.63444016666666""","""41.88917683258""","""-87.6385057718""","""member""","""2024-01-08""","""15:45:46""","""2024-01-08""","""15:52:59"""
"""F4A9CE78061F17F7""","""electric_bike""",2024-01-27 12:27:19,2024-01-27 12:35:19,"""Wells St & Elm St""","""KA1504000135""","""Kingsbury St & Kinzie St""","""KA1503000043""","""41.902951333333334""","""-87.63447033333334""","""41.88917683258""","""-87.6385057718""","""member""","""2024-01-27""","""12:27:19""","""2024-01-27""","""12:35:19"""
"""0A0D9E15EE50B171""","""classic_bike""",2024-01-29 16:26:17,2024-01-29 16:56:06,"""Wells St & Randolph St""","""TA1305000030""","""Larrabee St & Webster Ave""","""13193""","""41.884295""","""-87.633963""","""41.921822""","""-87.64414""","""member""","""2024-01-29""","""16:26:17""","""2024-01-29""","""16:56:06"""
"""33FFC9805E3EFF9A""","""classic_bike""",2024-01-31 05:43:23,2024-01-31 06:09:35,"""Lincoln Ave & Waveland Ave""","""13253""","""Kingsbury St & Kinzie St""","""KA1503000043""","""41.948797""","""-87.675278""","""41.88917683258""","""-87.6385057718""","""member""","""2024-01-31""","""05:43:23""","""2024-01-31""","""06:09:35"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""BD56BA20F42E4794""","""electric_bike""",2024-12-11 08:23:46,2024-12-11 08:37:34,"""Clybourn Ave & Division St""","""TA1307000115""",,,"""41.904633522""","""-87.64051795""","""41.88""","""-87.63""","""member""","""2024-12-11""","""08:23:46""","""2024-12-11""","""08:37:34"""
"""3074643A6B60B300""","""electric_bike""",2024-12-09 12:26:15,2024-12-09 12:37:32,"""Canal St & Jackson Blvd""","""13138""",,,"""41.878125""","""-87.639968""","""41.9""","""-87.62""","""member""","""2024-12-09""","""12:26:15""","""2024-12-09""","""12:37:32"""
"""15602635C5DF484E""","""electric_bike""",2024-12-31 17:10:03,2024-12-31 17:17:21,"""Albany Ave & Bloomingdale Ave""","""15655""","""California Ave & Milwaukee Ave""","""13084""","""41.91402671273""","""-87.705126462""","""41.922695""","""-87.697153""","""member""","""2024-12-31""","""17:10:03""","""2024-12-31""","""17:17:21"""
"""F15ABBA961560B75""","""electric_bike""",2024-12-01 14:39:47,2024-12-01 14:45:21,"""Albany Ave & Bloomingdale Ave""","""15655""","""California Ave & Milwaukee Ave""","""13084""","""41.914002657""","""-87.705099225""","""41.922695""","""-87.697153""","""member""","""2024-12-01""","""14:39:47""","""2024-12-01""","""14:45:21"""


In [19]:
df_cleaned = df_cleaned.drop(["start_station_id", "end_station_id"])


In [21]:
df_cleaned = df_cleaned.with_columns(
    (pl.col("ended_at") - pl.col("started_at")).alias("duration")
)


In [23]:
df_filtered = df_cleaned.filter(pl.col("duration") < pl.duration(seconds=120))

print(df_filtered)


shape: (241_671, 16)
┌───────────────┬───────────────┬───────────────┬──────────────┬───┬────────────┬────────────┬──────────┬──────────────┐
│ ride_id       ┆ rideable_type ┆ started_at    ┆ ended_at     ┆ … ┆ start_time ┆ end_date   ┆ end_time ┆ duration     │
│ ---           ┆ ---           ┆ ---           ┆ ---          ┆   ┆ ---        ┆ ---        ┆ ---      ┆ ---          │
│ str           ┆ str           ┆ datetime[μs]  ┆ datetime[μs] ┆   ┆ str        ┆ str        ┆ str      ┆ duration[μs] │
╞═══════════════╪═══════════════╪═══════════════╪══════════════╪═══╪════════════╪════════════╪══════════╪══════════════╡
│ 0C94B9855F6B7 ┆ classic_bike  ┆ 2024-01-18    ┆ 2024-01-18   ┆ … ┆ 16:41:31   ┆ 2024-01-18 ┆ 16:43:05 ┆ 1m 34s       │
│ 0CA           ┆               ┆ 16:41:31      ┆ 16:43:05     ┆   ┆            ┆            ┆          ┆              │
│ BD70C53ED25F6 ┆ classic_bike  ┆ 2024-01-11    ┆ 2024-01-11   ┆ … ┆ 14:38:22   ┆ 2024-01-11 ┆ 14:39:29 ┆ 1m 7s        │
│ F5A      

In [25]:
df_filtered = df_cleaned.with_columns(
    pl.when(pl.col("duration") < pl.duration(minutes=2))
    .then(pl.lit("Suspicious"))
    .otherwise(pl.lit("Fair"))
    .alias("ride_validity")
)

print(df_filtered)


shape: (5_853_336, 17)
┌───────────────┬──────────────┬──────────────┬──────────────┬───┬────────────┬──────────┬──────────────┬──────────────┐
│ ride_id       ┆ rideable_typ ┆ started_at   ┆ ended_at     ┆ … ┆ end_date   ┆ end_time ┆ duration     ┆ ride_validit │
│ ---           ┆ e            ┆ ---          ┆ ---          ┆   ┆ ---        ┆ ---      ┆ ---          ┆ y            │
│ str           ┆ ---          ┆ datetime[μs] ┆ datetime[μs] ┆   ┆ str        ┆ str      ┆ duration[μs] ┆ ---          │
│               ┆ str          ┆              ┆              ┆   ┆            ┆          ┆              ┆ str          │
╞═══════════════╪══════════════╪══════════════╪══════════════╪═══╪════════════╪══════════╪══════════════╪══════════════╡
│ C1D650626C8C8 ┆ electric_bik ┆ 2024-01-12   ┆ 2024-01-12   ┆ … ┆ 2024-01-12 ┆ 15:37:59 ┆ 7m 32s       ┆ Fair         │
│ 99A           ┆ e            ┆ 15:30:27     ┆ 15:37:59     ┆   ┆            ┆          ┆              ┆              │
│ EECD38B

In [27]:
df_cleaned = df_cleaned.with_columns([
    df_cleaned["started_at"].dt.truncate("1s").alias("started_at"),
    df_cleaned["ended_at"].dt.truncate("1s").alias("ended_at")
])


In [29]:
df = df_filtered.clone()

In [31]:
df

ride_id,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,start_date,start_time,end_date,end_time,duration,ride_validity
str,str,datetime[μs],datetime[μs],str,str,str,str,str,str,str,str,str,str,str,duration[μs],str
"""C1D650626C8C899A""","""electric_bike""",2024-01-12 15:30:27,2024-01-12 15:37:59,"""Wells St & Elm St""","""Kingsbury St & Kinzie St""","""41.903267384""","""-87.634736776""","""41.88917683258""","""-87.6385057718""","""member""","""2024-01-12""","""15:30:27""","""2024-01-12""","""15:37:59""",7m 32s,"""Fair"""
"""EECD38BDB25BFCB0""","""electric_bike""",2024-01-08 15:45:46,2024-01-08 15:52:59,"""Wells St & Elm St""","""Kingsbury St & Kinzie St""","""41.9029365""","""-87.63444016666666""","""41.88917683258""","""-87.6385057718""","""member""","""2024-01-08""","""15:45:46""","""2024-01-08""","""15:52:59""",7m 13s,"""Fair"""
"""F4A9CE78061F17F7""","""electric_bike""",2024-01-27 12:27:19,2024-01-27 12:35:19,"""Wells St & Elm St""","""Kingsbury St & Kinzie St""","""41.902951333333334""","""-87.63447033333334""","""41.88917683258""","""-87.6385057718""","""member""","""2024-01-27""","""12:27:19""","""2024-01-27""","""12:35:19""",8m,"""Fair"""
"""0A0D9E15EE50B171""","""classic_bike""",2024-01-29 16:26:17,2024-01-29 16:56:06,"""Wells St & Randolph St""","""Larrabee St & Webster Ave""","""41.884295""","""-87.633963""","""41.921822""","""-87.64414""","""member""","""2024-01-29""","""16:26:17""","""2024-01-29""","""16:56:06""",29m 49s,"""Fair"""
"""33FFC9805E3EFF9A""","""classic_bike""",2024-01-31 05:43:23,2024-01-31 06:09:35,"""Lincoln Ave & Waveland Ave""","""Kingsbury St & Kinzie St""","""41.948797""","""-87.675278""","""41.88917683258""","""-87.6385057718""","""member""","""2024-01-31""","""05:43:23""","""2024-01-31""","""06:09:35""",26m 12s,"""Fair"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""BD56BA20F42E4794""","""electric_bike""",2024-12-11 08:23:46,2024-12-11 08:37:34,"""Clybourn Ave & Division St""",,"""41.904633522""","""-87.64051795""","""41.88""","""-87.63""","""member""","""2024-12-11""","""08:23:46""","""2024-12-11""","""08:37:34""",13m 48s,"""Fair"""
"""3074643A6B60B300""","""electric_bike""",2024-12-09 12:26:15,2024-12-09 12:37:32,"""Canal St & Jackson Blvd""",,"""41.878125""","""-87.639968""","""41.9""","""-87.62""","""member""","""2024-12-09""","""12:26:15""","""2024-12-09""","""12:37:32""",11m 17s,"""Fair"""
"""15602635C5DF484E""","""electric_bike""",2024-12-31 17:10:03,2024-12-31 17:17:21,"""Albany Ave & Bloomingdale Ave""","""California Ave & Milwaukee Ave""","""41.91402671273""","""-87.705126462""","""41.922695""","""-87.697153""","""member""","""2024-12-31""","""17:10:03""","""2024-12-31""","""17:17:21""",7m 18s,"""Fair"""
"""F15ABBA961560B75""","""electric_bike""",2024-12-01 14:39:47,2024-12-01 14:45:21,"""Albany Ave & Bloomingdale Ave""","""California Ave & Milwaukee Ave""","""41.914002657""","""-87.705099225""","""41.922695""","""-87.697153""","""member""","""2024-12-01""","""14:39:47""","""2024-12-01""","""14:45:21""",5m 34s,"""Fair"""


In [33]:
data_1 = df.clone()

In [35]:
data_1

ride_id,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,start_date,start_time,end_date,end_time,duration,ride_validity
str,str,datetime[μs],datetime[μs],str,str,str,str,str,str,str,str,str,str,str,duration[μs],str
"""C1D650626C8C899A""","""electric_bike""",2024-01-12 15:30:27,2024-01-12 15:37:59,"""Wells St & Elm St""","""Kingsbury St & Kinzie St""","""41.903267384""","""-87.634736776""","""41.88917683258""","""-87.6385057718""","""member""","""2024-01-12""","""15:30:27""","""2024-01-12""","""15:37:59""",7m 32s,"""Fair"""
"""EECD38BDB25BFCB0""","""electric_bike""",2024-01-08 15:45:46,2024-01-08 15:52:59,"""Wells St & Elm St""","""Kingsbury St & Kinzie St""","""41.9029365""","""-87.63444016666666""","""41.88917683258""","""-87.6385057718""","""member""","""2024-01-08""","""15:45:46""","""2024-01-08""","""15:52:59""",7m 13s,"""Fair"""
"""F4A9CE78061F17F7""","""electric_bike""",2024-01-27 12:27:19,2024-01-27 12:35:19,"""Wells St & Elm St""","""Kingsbury St & Kinzie St""","""41.902951333333334""","""-87.63447033333334""","""41.88917683258""","""-87.6385057718""","""member""","""2024-01-27""","""12:27:19""","""2024-01-27""","""12:35:19""",8m,"""Fair"""
"""0A0D9E15EE50B171""","""classic_bike""",2024-01-29 16:26:17,2024-01-29 16:56:06,"""Wells St & Randolph St""","""Larrabee St & Webster Ave""","""41.884295""","""-87.633963""","""41.921822""","""-87.64414""","""member""","""2024-01-29""","""16:26:17""","""2024-01-29""","""16:56:06""",29m 49s,"""Fair"""
"""33FFC9805E3EFF9A""","""classic_bike""",2024-01-31 05:43:23,2024-01-31 06:09:35,"""Lincoln Ave & Waveland Ave""","""Kingsbury St & Kinzie St""","""41.948797""","""-87.675278""","""41.88917683258""","""-87.6385057718""","""member""","""2024-01-31""","""05:43:23""","""2024-01-31""","""06:09:35""",26m 12s,"""Fair"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""BD56BA20F42E4794""","""electric_bike""",2024-12-11 08:23:46,2024-12-11 08:37:34,"""Clybourn Ave & Division St""",,"""41.904633522""","""-87.64051795""","""41.88""","""-87.63""","""member""","""2024-12-11""","""08:23:46""","""2024-12-11""","""08:37:34""",13m 48s,"""Fair"""
"""3074643A6B60B300""","""electric_bike""",2024-12-09 12:26:15,2024-12-09 12:37:32,"""Canal St & Jackson Blvd""",,"""41.878125""","""-87.639968""","""41.9""","""-87.62""","""member""","""2024-12-09""","""12:26:15""","""2024-12-09""","""12:37:32""",11m 17s,"""Fair"""
"""15602635C5DF484E""","""electric_bike""",2024-12-31 17:10:03,2024-12-31 17:17:21,"""Albany Ave & Bloomingdale Ave""","""California Ave & Milwaukee Ave""","""41.91402671273""","""-87.705126462""","""41.922695""","""-87.697153""","""member""","""2024-12-31""","""17:10:03""","""2024-12-31""","""17:17:21""",7m 18s,"""Fair"""
"""F15ABBA961560B75""","""electric_bike""",2024-12-01 14:39:47,2024-12-01 14:45:21,"""Albany Ave & Bloomingdale Ave""","""California Ave & Milwaukee Ave""","""41.914002657""","""-87.705099225""","""41.922695""","""-87.697153""","""member""","""2024-12-01""","""14:39:47""","""2024-12-01""","""14:45:21""",5m 34s,"""Fair"""


##  Polars is optimized for performance, we can avoid slow row-wise operations by using a vectorized approach and efficient nearest-neighbor searches.

## Instead of iterating row by row, we can use Scipy's KDTree for fast nearest-neighbor lookups. Here’s how you can implement it in Polars:

In [38]:
import polars as pl
import numpy as np
from scipy.spatial import cKDTree  # Fast nearest-neighbor search

def find_nearest_place_polars(df: pl.DataFrame) -> pl.DataFrame:
    """
    Assigns the nearest place name to rows where 'start_station_name' is missing.
    Uses KDTree for fast nearest-neighbor lookup.
    """

    # Convert to Pandas for KDTree operations (only for lat/lng processing)
    df_pandas = df.to_pandas()

    # Extract valid and missing place data
    valid_places = df_pandas[df_pandas["start_station_name"].notna()]
    missing_places = df_pandas[df_pandas["start_station_name"].isna()]

    # Build KDTree for fast nearest neighbor search
    if not valid_places.empty:
        valid_coords = np.array(list(zip(valid_places["start_lat"], valid_places["start_lng"])))
        tree = cKDTree(valid_coords)

        # Find nearest valid station for each missing station
        if not missing_places.empty:
            missing_coords = np.array(list(zip(missing_places["start_lat"], missing_places["start_lng"])))
            _, nearest_indices = tree.query(missing_coords)

            # Assign nearest station names
            missing_places["start_station_name"] = valid_places.iloc[nearest_indices]["start_station_name"].values

            # Merge back the updated missing places
            df_pandas.loc[df_pandas["start_station_name"].isna(), "start_station_name"] = missing_places["start_station_name"]

    # Convert back to Polars
    return pl.from_pandas(df_pandas)

# Example usage
try:
    df_cleaned = pl.DataFrame(data_1)  # Ensure data_1 is defined
    df_cleaned = find_nearest_place_polars(df_cleaned)
    print(df_cleaned)
except NameError:
    print("Error: 'data_1' is not defined. Make sure your dataset exists.")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_places["start_station_name"] = valid_places.iloc[nearest_indices]["start_station_name"].values


shape: (5_853_336, 17)
┌───────────────┬──────────────┬──────────────┬──────────────┬───┬────────────┬──────────┬──────────────┬──────────────┐
│ ride_id       ┆ rideable_typ ┆ started_at   ┆ ended_at     ┆ … ┆ end_date   ┆ end_time ┆ duration     ┆ ride_validit │
│ ---           ┆ e            ┆ ---          ┆ ---          ┆   ┆ ---        ┆ ---      ┆ ---          ┆ y            │
│ str           ┆ ---          ┆ datetime[μs] ┆ datetime[μs] ┆   ┆ str        ┆ str      ┆ duration[μs] ┆ ---          │
│               ┆ str          ┆              ┆              ┆   ┆            ┆          ┆              ┆ str          │
╞═══════════════╪══════════════╪══════════════╪══════════════╪═══╪════════════╪══════════╪══════════════╪══════════════╡
│ C1D650626C8C8 ┆ electric_bik ┆ 2024-01-12   ┆ 2024-01-12   ┆ … ┆ 2024-01-12 ┆ 15:37:59 ┆ 7m 32s       ┆ Fair         │
│ 99A           ┆ e            ┆ 15:30:27     ┆ 15:37:59     ┆   ┆            ┆          ┆              ┆              │
│ EECD38B

In [40]:
# Count missing values in start_station_name
missing_count = df_cleaned["start_station_name"].is_null().sum()
print(f"Missing values in start_station_name: {missing_count}")


Missing values in start_station_name: 0


In [42]:
import polars as pl
import numpy as np
from scipy.spatial import cKDTree  # Fast nearest-neighbor search

def find_nearest_place_polars(df: pl.DataFrame) -> pl.DataFrame:
    """
    Assigns the nearest place name to rows where 'start_station_name' and 'end_station_name' are missing.
    Uses KDTree for fast nearest-neighbor lookup.
    """

    # Convert Polars to Pandas for KDTree operations
    df_pandas = df.to_pandas()

    def fill_missing_station(df, station_col, lat_col, lng_col):
        """
        Helper function to fill missing station names based on nearest valid locations.
        """
        valid_places = df[df[station_col].notna()]
        missing_places = df[df[station_col].isna()]

        # Build KDTree for fast nearest neighbor search
        if not valid_places.empty:
            valid_coords = np.array(list(zip(valid_places[lat_col], valid_places[lng_col])))
            tree = cKDTree(valid_coords)

            # Find nearest valid station for each missing station
            if not missing_places.empty:
                missing_coords = np.array(list(zip(missing_places[lat_col], missing_places[lng_col])))
                _, nearest_indices = tree.query(missing_coords)

                # Assign nearest station names
                missing_places[station_col] = valid_places.iloc[nearest_indices][station_col].values

                # Merge back the updated missing places
                df.loc[df[station_col].isna(), station_col] = missing_places[station_col]

    # Fill missing values for start and end stations
    fill_missing_station(df_pandas, "start_station_name", "start_lat", "start_lng")
    fill_missing_station(df_pandas, "end_station_name", "end_lat", "end_lng")

    # Convert back to Polars
    return pl.from_pandas(df_pandas)

# Example usage
try:
    df_cleaned = pl.DataFrame(data_1)  # Ensure data_1 is defined
    df_cleaned = find_nearest_place_polars(df_cleaned)
    print(df_cleaned)
except NameError:
    print("Error: 'data_1' is not defined. Make sure your dataset exists.")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_places[station_col] = valid_places.iloc[nearest_indices][station_col].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_places[station_col] = valid_places.iloc[nearest_indices][station_col].values


shape: (5_853_336, 17)
┌───────────────┬──────────────┬──────────────┬──────────────┬───┬────────────┬──────────┬──────────────┬──────────────┐
│ ride_id       ┆ rideable_typ ┆ started_at   ┆ ended_at     ┆ … ┆ end_date   ┆ end_time ┆ duration     ┆ ride_validit │
│ ---           ┆ e            ┆ ---          ┆ ---          ┆   ┆ ---        ┆ ---      ┆ ---          ┆ y            │
│ str           ┆ ---          ┆ datetime[μs] ┆ datetime[μs] ┆   ┆ str        ┆ str      ┆ duration[μs] ┆ ---          │
│               ┆ str          ┆              ┆              ┆   ┆            ┆          ┆              ┆ str          │
╞═══════════════╪══════════════╪══════════════╪══════════════╪═══╪════════════╪══════════╪══════════════╪══════════════╡
│ C1D650626C8C8 ┆ electric_bik ┆ 2024-01-12   ┆ 2024-01-12   ┆ … ┆ 2024-01-12 ┆ 15:37:59 ┆ 7m 32s       ┆ Fair         │
│ 99A           ┆ e            ┆ 15:30:27     ┆ 15:37:59     ┆   ┆            ┆          ┆              ┆              │
│ EECD38B

In [44]:
# Count missing values in end_station_name
missing_count = df_cleaned["end_station_name"].is_null().sum()
print(f"Missing values in end_station_name: {missing_count}")

Missing values in end_station_name: 0


In [46]:
df_cleaned = df_cleaned.with_columns(
    df_cleaned["duration"].dt.total_seconds().cast(pl.Float64).alias("duration_seconds")
)


In [48]:
df_cleaned.schema

Schema([('ride_id', String),
        ('rideable_type', String),
        ('started_at', Datetime(time_unit='us', time_zone=None)),
        ('ended_at', Datetime(time_unit='us', time_zone=None)),
        ('start_station_name', String),
        ('end_station_name', String),
        ('start_lat', String),
        ('start_lng', String),
        ('end_lat', String),
        ('end_lng', String),
        ('member_casual', String),
        ('start_date', String),
        ('start_time', String),
        ('end_date', String),
        ('end_time', String),
        ('duration', Duration(time_unit='us')),
        ('ride_validity', String),
        ('duration_seconds', Float64)])

In [50]:
df_cleaned

ride_id,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,start_date,start_time,end_date,end_time,duration,ride_validity,duration_seconds
str,str,datetime[μs],datetime[μs],str,str,str,str,str,str,str,str,str,str,str,duration[μs],str,f64
"""C1D650626C8C899A""","""electric_bike""",2024-01-12 15:30:27,2024-01-12 15:37:59,"""Wells St & Elm St""","""Kingsbury St & Kinzie St""","""41.903267384""","""-87.634736776""","""41.88917683258""","""-87.6385057718""","""member""","""2024-01-12""","""15:30:27""","""2024-01-12""","""15:37:59""",7m 32s,"""Fair""",452.0
"""EECD38BDB25BFCB0""","""electric_bike""",2024-01-08 15:45:46,2024-01-08 15:52:59,"""Wells St & Elm St""","""Kingsbury St & Kinzie St""","""41.9029365""","""-87.63444016666666""","""41.88917683258""","""-87.6385057718""","""member""","""2024-01-08""","""15:45:46""","""2024-01-08""","""15:52:59""",7m 13s,"""Fair""",433.0
"""F4A9CE78061F17F7""","""electric_bike""",2024-01-27 12:27:19,2024-01-27 12:35:19,"""Wells St & Elm St""","""Kingsbury St & Kinzie St""","""41.902951333333334""","""-87.63447033333334""","""41.88917683258""","""-87.6385057718""","""member""","""2024-01-27""","""12:27:19""","""2024-01-27""","""12:35:19""",8m,"""Fair""",480.0
"""0A0D9E15EE50B171""","""classic_bike""",2024-01-29 16:26:17,2024-01-29 16:56:06,"""Wells St & Randolph St""","""Larrabee St & Webster Ave""","""41.884295""","""-87.633963""","""41.921822""","""-87.64414""","""member""","""2024-01-29""","""16:26:17""","""2024-01-29""","""16:56:06""",29m 49s,"""Fair""",1789.0
"""33FFC9805E3EFF9A""","""classic_bike""",2024-01-31 05:43:23,2024-01-31 06:09:35,"""Lincoln Ave & Waveland Ave""","""Kingsbury St & Kinzie St""","""41.948797""","""-87.675278""","""41.88917683258""","""-87.6385057718""","""member""","""2024-01-31""","""05:43:23""","""2024-01-31""","""06:09:35""",26m 12s,"""Fair""",1572.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""BD56BA20F42E4794""","""electric_bike""",2024-12-11 08:23:46,2024-12-11 08:37:34,"""Clybourn Ave & Division St""","""Dearborn St & Monroe St""","""41.904633522""","""-87.64051795""","""41.88""","""-87.63""","""member""","""2024-12-11""","""08:23:46""","""2024-12-11""","""08:37:34""",13m 48s,"""Fair""",828.0
"""3074643A6B60B300""","""electric_bike""",2024-12-09 12:26:15,2024-12-09 12:37:32,"""Canal St & Jackson Blvd""","""Mies van der Rohe Way & Chestn…","""41.878125""","""-87.639968""","""41.9""","""-87.62""","""member""","""2024-12-09""","""12:26:15""","""2024-12-09""","""12:37:32""",11m 17s,"""Fair""",677.0
"""15602635C5DF484E""","""electric_bike""",2024-12-31 17:10:03,2024-12-31 17:17:21,"""Albany Ave & Bloomingdale Ave""","""California Ave & Milwaukee Ave""","""41.91402671273""","""-87.705126462""","""41.922695""","""-87.697153""","""member""","""2024-12-31""","""17:10:03""","""2024-12-31""","""17:17:21""",7m 18s,"""Fair""",438.0
"""F15ABBA961560B75""","""electric_bike""",2024-12-01 14:39:47,2024-12-01 14:45:21,"""Albany Ave & Bloomingdale Ave""","""California Ave & Milwaukee Ave""","""41.914002657""","""-87.705099225""","""41.922695""","""-87.697153""","""member""","""2024-12-01""","""14:39:47""","""2024-12-01""","""14:45:21""",5m 34s,"""Fair""",334.0


In [52]:
df_cleaned = df_cleaned.drop("duration_seconds")


In [54]:
df_cleaned = df_cleaned.with_columns([
    df_cleaned["started_at"].cast(pl.Utf8).alias("started_at"),
    df_cleaned["ended_at"].cast(pl.Utf8).alias("ended_at"),
    df_cleaned["duration"].dt.total_seconds().cast(pl.Utf8).alias("duration")  # Convert to seconds, then string
])

In [55]:
df_cleaned

ride_id,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,start_date,start_time,end_date,end_time,duration,ride_validity
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""C1D650626C8C899A""","""electric_bike""","""2024-01-12 15:30:27.000000""","""2024-01-12 15:37:59.000000""","""Wells St & Elm St""","""Kingsbury St & Kinzie St""","""41.903267384""","""-87.634736776""","""41.88917683258""","""-87.6385057718""","""member""","""2024-01-12""","""15:30:27""","""2024-01-12""","""15:37:59""","""452""","""Fair"""
"""EECD38BDB25BFCB0""","""electric_bike""","""2024-01-08 15:45:46.000000""","""2024-01-08 15:52:59.000000""","""Wells St & Elm St""","""Kingsbury St & Kinzie St""","""41.9029365""","""-87.63444016666666""","""41.88917683258""","""-87.6385057718""","""member""","""2024-01-08""","""15:45:46""","""2024-01-08""","""15:52:59""","""433""","""Fair"""
"""F4A9CE78061F17F7""","""electric_bike""","""2024-01-27 12:27:19.000000""","""2024-01-27 12:35:19.000000""","""Wells St & Elm St""","""Kingsbury St & Kinzie St""","""41.902951333333334""","""-87.63447033333334""","""41.88917683258""","""-87.6385057718""","""member""","""2024-01-27""","""12:27:19""","""2024-01-27""","""12:35:19""","""480""","""Fair"""
"""0A0D9E15EE50B171""","""classic_bike""","""2024-01-29 16:26:17.000000""","""2024-01-29 16:56:06.000000""","""Wells St & Randolph St""","""Larrabee St & Webster Ave""","""41.884295""","""-87.633963""","""41.921822""","""-87.64414""","""member""","""2024-01-29""","""16:26:17""","""2024-01-29""","""16:56:06""","""1789""","""Fair"""
"""33FFC9805E3EFF9A""","""classic_bike""","""2024-01-31 05:43:23.000000""","""2024-01-31 06:09:35.000000""","""Lincoln Ave & Waveland Ave""","""Kingsbury St & Kinzie St""","""41.948797""","""-87.675278""","""41.88917683258""","""-87.6385057718""","""member""","""2024-01-31""","""05:43:23""","""2024-01-31""","""06:09:35""","""1572""","""Fair"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""BD56BA20F42E4794""","""electric_bike""","""2024-12-11 08:23:46.000000""","""2024-12-11 08:37:34.000000""","""Clybourn Ave & Division St""","""Dearborn St & Monroe St""","""41.904633522""","""-87.64051795""","""41.88""","""-87.63""","""member""","""2024-12-11""","""08:23:46""","""2024-12-11""","""08:37:34""","""828""","""Fair"""
"""3074643A6B60B300""","""electric_bike""","""2024-12-09 12:26:15.000000""","""2024-12-09 12:37:32.000000""","""Canal St & Jackson Blvd""","""Mies van der Rohe Way & Chestn…","""41.878125""","""-87.639968""","""41.9""","""-87.62""","""member""","""2024-12-09""","""12:26:15""","""2024-12-09""","""12:37:32""","""677""","""Fair"""
"""15602635C5DF484E""","""electric_bike""","""2024-12-31 17:10:03.000000""","""2024-12-31 17:17:21.000000""","""Albany Ave & Bloomingdale Ave""","""California Ave & Milwaukee Ave""","""41.91402671273""","""-87.705126462""","""41.922695""","""-87.697153""","""member""","""2024-12-31""","""17:10:03""","""2024-12-31""","""17:17:21""","""438""","""Fair"""
"""F15ABBA961560B75""","""electric_bike""","""2024-12-01 14:39:47.000000""","""2024-12-01 14:45:21.000000""","""Albany Ave & Bloomingdale Ave""","""California Ave & Milwaukee Ave""","""41.914002657""","""-87.705099225""","""41.922695""","""-87.697153""","""member""","""2024-12-01""","""14:39:47""","""2024-12-01""","""14:45:21""","""334""","""Fair"""


In [58]:
df_cleaned = df_cleaned.with_columns([
    df_cleaned["started_at"].str.slice(0, 19).alias("started_at"),  # Keep only YYYY-MM-DD HH:MM:SS
    df_cleaned["ended_at"].str.slice(0, 19).alias("ended_at")
])


In [60]:
df_cleaned.write_csv("cleaned_cyclistic_data_2024.csv")
print("File saved as cleaned_cyclistic_data_2024.csv")

File saved as cleaned_cyclistic_data_2024.csv


In [82]:
pl.read_csv(r"C:\Users\Sowjanya\Documents\BIIKE-SHARE-DATA\cleaned_cyclistic_data_2024.csv")

ride_id,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,start_date,start_time,end_date,end_time,duration,ride_validity
str,str,str,str,str,str,f64,f64,f64,f64,str,str,str,str,str,i64,str
"""C1D650626C8C899A""","""electric_bike""","""2024-01-12 15:30:27""","""2024-01-12 15:37:59""","""Wells St & Elm St""","""Kingsbury St & Kinzie St""",41.903267,-87.634737,41.889177,-87.638506,"""member""","""2024-01-12""","""15:30:27""","""2024-01-12""","""15:37:59""",452,"""Fair"""
"""EECD38BDB25BFCB0""","""electric_bike""","""2024-01-08 15:45:46""","""2024-01-08 15:52:59""","""Wells St & Elm St""","""Kingsbury St & Kinzie St""",41.902937,-87.63444,41.889177,-87.638506,"""member""","""2024-01-08""","""15:45:46""","""2024-01-08""","""15:52:59""",433,"""Fair"""
"""F4A9CE78061F17F7""","""electric_bike""","""2024-01-27 12:27:19""","""2024-01-27 12:35:19""","""Wells St & Elm St""","""Kingsbury St & Kinzie St""",41.902951,-87.63447,41.889177,-87.638506,"""member""","""2024-01-27""","""12:27:19""","""2024-01-27""","""12:35:19""",480,"""Fair"""
"""0A0D9E15EE50B171""","""classic_bike""","""2024-01-29 16:26:17""","""2024-01-29 16:56:06""","""Wells St & Randolph St""","""Larrabee St & Webster Ave""",41.884295,-87.633963,41.921822,-87.64414,"""member""","""2024-01-29""","""16:26:17""","""2024-01-29""","""16:56:06""",1789,"""Fair"""
"""33FFC9805E3EFF9A""","""classic_bike""","""2024-01-31 05:43:23""","""2024-01-31 06:09:35""","""Lincoln Ave & Waveland Ave""","""Kingsbury St & Kinzie St""",41.948797,-87.675278,41.889177,-87.638506,"""member""","""2024-01-31""","""05:43:23""","""2024-01-31""","""06:09:35""",1572,"""Fair"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""BD56BA20F42E4794""","""electric_bike""","""2024-12-11 08:23:46""","""2024-12-11 08:37:34""","""Clybourn Ave & Division St""","""Dearborn St & Monroe St""",41.904634,-87.640518,41.88,-87.63,"""member""","""2024-12-11""","""08:23:46""","""2024-12-11""","""08:37:34""",828,"""Fair"""
"""3074643A6B60B300""","""electric_bike""","""2024-12-09 12:26:15""","""2024-12-09 12:37:32""","""Canal St & Jackson Blvd""","""Mies van der Rohe Way & Chestn…",41.878125,-87.639968,41.9,-87.62,"""member""","""2024-12-09""","""12:26:15""","""2024-12-09""","""12:37:32""",677,"""Fair"""
"""15602635C5DF484E""","""electric_bike""","""2024-12-31 17:10:03""","""2024-12-31 17:17:21""","""Albany Ave & Bloomingdale Ave""","""California Ave & Milwaukee Ave""",41.914027,-87.705126,41.922695,-87.697153,"""member""","""2024-12-31""","""17:10:03""","""2024-12-31""","""17:17:21""",438,"""Fair"""
"""F15ABBA961560B75""","""electric_bike""","""2024-12-01 14:39:47""","""2024-12-01 14:45:21""","""Albany Ave & Bloomingdale Ave""","""California Ave & Milwaukee Ave""",41.914003,-87.705099,41.922695,-87.697153,"""member""","""2024-12-01""","""14:39:47""","""2024-12-01""","""14:45:21""",334,"""Fair"""


In [None]:
pip install sqlalchemy

In [None]:
from sqlalchemy import create_engine

In [None]:
pip install sqlalchemy mysql-connector-python pymysql


In [None]:
print(len(df_cleaned.columns))  # Should match the number of %s placeholders


In [84]:
df_cleaned

ride_id,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,start_date,start_time,end_date,end_time,duration,ride_validity
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""C1D650626C8C899A""","""electric_bike""","""2024-01-12 15:30:27""","""2024-01-12 15:37:59""","""Wells St & Elm St""","""Kingsbury St & Kinzie St""","""41.903267384""","""-87.634736776""","""41.88917683258""","""-87.6385057718""","""member""","""2024-01-12""","""15:30:27""","""2024-01-12""","""15:37:59""","""452""","""Fair"""
"""EECD38BDB25BFCB0""","""electric_bike""","""2024-01-08 15:45:46""","""2024-01-08 15:52:59""","""Wells St & Elm St""","""Kingsbury St & Kinzie St""","""41.9029365""","""-87.63444016666666""","""41.88917683258""","""-87.6385057718""","""member""","""2024-01-08""","""15:45:46""","""2024-01-08""","""15:52:59""","""433""","""Fair"""
"""F4A9CE78061F17F7""","""electric_bike""","""2024-01-27 12:27:19""","""2024-01-27 12:35:19""","""Wells St & Elm St""","""Kingsbury St & Kinzie St""","""41.902951333333334""","""-87.63447033333334""","""41.88917683258""","""-87.6385057718""","""member""","""2024-01-27""","""12:27:19""","""2024-01-27""","""12:35:19""","""480""","""Fair"""
"""0A0D9E15EE50B171""","""classic_bike""","""2024-01-29 16:26:17""","""2024-01-29 16:56:06""","""Wells St & Randolph St""","""Larrabee St & Webster Ave""","""41.884295""","""-87.633963""","""41.921822""","""-87.64414""","""member""","""2024-01-29""","""16:26:17""","""2024-01-29""","""16:56:06""","""1789""","""Fair"""
"""33FFC9805E3EFF9A""","""classic_bike""","""2024-01-31 05:43:23""","""2024-01-31 06:09:35""","""Lincoln Ave & Waveland Ave""","""Kingsbury St & Kinzie St""","""41.948797""","""-87.675278""","""41.88917683258""","""-87.6385057718""","""member""","""2024-01-31""","""05:43:23""","""2024-01-31""","""06:09:35""","""1572""","""Fair"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""BD56BA20F42E4794""","""electric_bike""","""2024-12-11 08:23:46""","""2024-12-11 08:37:34""","""Clybourn Ave & Division St""","""Dearborn St & Monroe St""","""41.904633522""","""-87.64051795""","""41.88""","""-87.63""","""member""","""2024-12-11""","""08:23:46""","""2024-12-11""","""08:37:34""","""828""","""Fair"""
"""3074643A6B60B300""","""electric_bike""","""2024-12-09 12:26:15""","""2024-12-09 12:37:32""","""Canal St & Jackson Blvd""","""Mies van der Rohe Way & Chestn…","""41.878125""","""-87.639968""","""41.9""","""-87.62""","""member""","""2024-12-09""","""12:26:15""","""2024-12-09""","""12:37:32""","""677""","""Fair"""
"""15602635C5DF484E""","""electric_bike""","""2024-12-31 17:10:03""","""2024-12-31 17:17:21""","""Albany Ave & Bloomingdale Ave""","""California Ave & Milwaukee Ave""","""41.91402671273""","""-87.705126462""","""41.922695""","""-87.697153""","""member""","""2024-12-31""","""17:10:03""","""2024-12-31""","""17:17:21""","""438""","""Fair"""
"""F15ABBA961560B75""","""electric_bike""","""2024-12-01 14:39:47""","""2024-12-01 14:45:21""","""Albany Ave & Bloomingdale Ave""","""California Ave & Milwaukee Ave""","""41.914002657""","""-87.705099225""","""41.922695""","""-87.697153""","""member""","""2024-12-01""","""14:39:47""","""2024-12-01""","""14:45:21""","""334""","""Fair"""


In [86]:
df_cleaned = df_cleaned.unique(subset=["ride_id"])


In [88]:
df_cleaned

ride_id,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,start_date,start_time,end_date,end_time,duration,ride_validity
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""9FC44CCF78FB54C7""","""electric_bike""","""2024-05-31 16:39:07""","""2024-05-31 17:10:47""","""Damen Ave & Walnut (Lake) St""","""Clarendon Ave & Junior Ter""","""41.885957003""","""-87.676992774""","""41.961004""","""-87.649603""","""member""","""2024-05-31""","""16:39:07""","""2024-05-31""","""17:10:47""","""1900""","""Fair"""
"""F88A0488A85E001C""","""classic_bike""","""2024-10-05 15:12:20""","""2024-10-05 15:22:16""","""Ashland Ave & Lake St""","""Sangamon St & Lake St""","""41.88592""","""-87.66717""","""41.88577925240433""","""-87.65102460980414""","""casual""","""2024-10-05""","""15:12:20""","""2024-10-05""","""15:22:16""","""596""","""Fair"""
"""F2202F477D16B66B""","""electric_bike""","""2024-04-13 17:00:11""","""2024-04-13 17:21:29""","""Western Ave & Winnebago Ave""","""State St & Randolph St""","""41.915694952""","""-87.687241673""","""41.88462107257936""","""-87.62783423066139""","""member""","""2024-04-13""","""17:00:11""","""2024-04-13""","""17:21:29""","""1278""","""Fair"""
"""164489DD07880813""","""classic_bike""","""2024-05-12 10:13:03""","""2024-05-12 10:30:23""","""Blackstone Ave & Hyde Park Blv…","""Ellis Ave & 60th St""","""41.802562""","""-87.590368""","""41.78509714636""","""-87.6010727606""","""casual""","""2024-05-12""","""10:13:03""","""2024-05-12""","""10:30:23""","""1040""","""Fair"""
"""571F1056CDA2F7F9""","""electric_bike""","""2024-05-31 23:13:18""","""2024-05-31 23:19:38""","""Ashland Ave & Augusta Blvd""","""Western Ave & Walton St""","""41.9""","""-87.67""","""41.89841768945""","""-87.6865960164""","""casual""","""2024-05-31""","""23:13:18""","""2024-05-31""","""23:19:38""","""380""","""Fair"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""3DB400ABF1DD8FD7""","""electric_bike""","""2024-06-10 18:13:23""","""2024-06-10 18:18:25""","""Clinton St & Lake St""","""N Damen Ave & W Chicago Ave""","""41.9""","""-87.68""","""41.895809""","""-87.678519""","""member""","""2024-06-10""","""18:13:23""","""2024-06-10""","""18:18:25""","""302""","""Fair"""
"""34FBB2D15D11F3C7""","""electric_bike""","""2024-08-22 13:51:32""","""2024-08-22 13:57:12""","""Oakley Ave & Irving Park Rd""","""Ashland Ave & Belle Plaine Ave""","""41.95434085219""","""-87.6860796243""","""41.956057""","""-87.668835""","""casual""","""2024-08-22""","""13:51:32""","""2024-08-22""","""13:57:12""","""340""","""Fair"""
"""690EEC3E38CB0399""","""classic_bike""","""2024-05-25 13:35:15""","""2024-05-25 14:13:41""","""Fort Dearborn Dr & 31st St*""","""Streeter Dr & Grand Ave""","""41.83867632897121""","""-87.60883212089539""","""41.892278""","""-87.612043""","""casual""","""2024-05-25""","""13:35:15""","""2024-05-25""","""14:13:41""","""2306""","""Fair"""
"""91A180A0D541501B""","""classic_bike""","""2024-11-20 16:41:23""","""2024-11-20 17:16:51""","""Damen Ave & Grand Ave""","""Kedzie Ave & Milwaukee Ave""","""41.892394452962705""","""-87.67688512802124""","""41.928999""","""-87.70803""","""member""","""2024-11-20""","""16:41:23""","""2024-11-20""","""17:16:51""","""2128""","""Fair"""


In [90]:
import mysql.connector

# Connect to MySQL database
connection = mysql.connector.connect(
    host="localhost",
    port=3305,
    user="root",
    password="root",
    database="bike_share"
)
cursor = connection.cursor()

# Create table query
create_table_query = """
CREATE TABLE IF NOT EXISTS bike_rides_2024 (
    ride_id TEXT,
    rideable_type TEXT,
    started_at TEXT,
    ended_at TEXT,
    start_station_name TEXT,
    end_station_name TEXT,
    start_lat DOUBLE,
    start_lng DOUBLE,
    end_lat DOUBLE,
    end_lng DOUBLE,
    member_casual TEXT,
    start_date TEXT,
    start_time TEXT,
    end_date TEXT,
    end_time TEXT,
    duration INT,
    ride_validity TEXT
);
"""

# Execute the query
cursor.execute(create_table_query)
connection.commit()

print("✅ Table 'bike_rides_2024' created successfully!")

# Close connection
cursor.close()
connection.close()


✅ Table 'bike_rides_2024' created successfully!


In [92]:
import polars as pl
import mysql.connector

# Assuming df_cleaned is your cleaned Polars DataFrame
df_cleaned = df_cleaned.unique()  # Ensure unique records

# Convert Polars DataFrame to a list of tuples
data = [tuple(row) for row in df_cleaned.to_numpy()]

# Connect to MySQL without specifying the database first
connection = mysql.connector.connect(
    host="localhost",
    port=3305,
    user="root",
    password="root"
)
cursor = connection.cursor()

# Create database if not exists
cursor.execute("CREATE DATABASE IF NOT EXISTS bike_share")

# Connect to the database
connection.database = "bike_share"

print("✅ Database connected successfully!")

# Ensure the `bike_rides_2021` table exists (Modify the schema if needed)
cursor.execute("""
CREATE TABLE IF NOT EXISTS bike_rides_2024 (
    ride_id TEXT PRIMARY KEY,
    rideable_type TEXT,
    started_at TEXT,
    ended_at TEXT,
    start_station_name TEXT,
    end_station_name TEXT,
    start_lat DOUBLE,
    start_lng DOUBLE,
    end_lat DOUBLE,
    end_lng DOUBLE,
    member_casual TEXT,
    start_date TEXT,
    start_time TEXT,
    end_date TEXT,
    end_time TEXT,
    duration INT,
    ride_validity TEXT
)
""")

print("✅ Table checked/created successfully!")

# SQL Insert Query (Ensure column names match your table)
insert_stmt = """
INSERT IGNORE INTO bike_rides_2024 (
    ride_id, rideable_type, started_at, ended_at, start_station_name, 
    end_station_name, start_lat, start_lng, end_lat, end_lng, 
    member_casual, start_date, start_time, end_date, end_time, 
    duration, ride_validity
) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""

# Insert data in batches for efficiency
batch_size = 1000
for i in range(0, len(data), batch_size):
    batch = data[i:i + batch_size]
    cursor.executemany(insert_stmt, batch)
    connection.commit()

print(f"✅ Inserted {len(data)} unique records successfully into `bike_rides_2022`!")

# Close connection
cursor.close()
connection.close()


✅ Database connected successfully!
✅ Table checked/created successfully!
✅ Inserted 5853165 unique records successfully into `bike_rides_2022`!
