In [20]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import plotly.graph_objects as go
import os

In [3]:
csv_folder = r"C:\Users\adela\OneDrive\Data Science Full Stack\Module 6 - ML Unsupervised\Projet Uber\uber-trip-data"

In [9]:
csv_files = [f for f in glob.glob(os.path.join(csv_folder, "*.csv")) if "uber-raw-data" in os.path.basename(f).lower()]

In [10]:
dfs = []

for file in csv_files:
    df = pd.read_csv(file)
    
    # Trouver et renommer la colonne datetime
    for col in df.columns:
        if 'date' in col.lower():
            df.rename(columns={col: 'pickup_datetime'}, inplace=True)
            break
    
    dfs.append(df)

In [11]:
df_all = pd.concat(dfs, ignore_index=True)

In [12]:
df_all['pickup_datetime'] = pd.to_datetime(df_all['pickup_datetime'], errors='coerce')

In [13]:
print(f"✅ Données fusionnées : {df_all.shape[0]} lignes")
print(df_all.head())

✅ Données fusionnées : 18804806 lignes
      pickup_datetime      Lat      Lon    Base Dispatching_base_num  \
0 2014-04-01 00:11:00  40.7690 -73.9549  B02512                  NaN   
1 2014-04-01 00:17:00  40.7267 -74.0345  B02512                  NaN   
2 2014-04-01 00:21:00  40.7316 -73.9873  B02512                  NaN   
3 2014-04-01 00:28:00  40.7588 -73.9776  B02512                  NaN   
4 2014-04-01 00:33:00  40.7594 -73.9722  B02512                  NaN   

  Affiliated_base_num  locationID  
0                 NaN         NaN  
1                 NaN         NaN  
2                 NaN         NaN  
3                 NaN         NaN  
4                 NaN         NaN  


### Nettoyage et standardisation des colonnes

In [14]:
if 'Date/Time' in df_all.columns:
    df_all.rename(columns={'Date/Time': 'pickup_datetime'}, inplace=True)
elif 'Pickup_date' in df_all.columns:
    df_all.rename(columns={'Pickup_date': 'pickup_datetime'}, inplace=True)

In [15]:
df_all['pickup_datetime'] = pd.to_datetime(df_all['pickup_datetime'], errors='coerce')

In [16]:
col_lat = [col for col in df_all.columns if 'lat' in col.lower()][0]
col_lon = [col for col in df_all.columns if 'lon' in col.lower()][0]
df_all = df_all[['pickup_datetime', col_lat, col_lon]].dropna()
df_all.columns = ['pickup_datetime', 'Lat', 'Lon']

In [17]:
df_all['hour'] = df_all['pickup_datetime'].dt.hour
df_all['weekday'] = df_all['pickup_datetime'].dt.dayofweek  # 0 = lundi

In [18]:
sample = df_all[(df_all['weekday'] == 4) & (df_all['hour'] == 18)]
print(f"Nombre de trajets le vendredi à 18h: {sample.shape[0]}")

Nombre de trajets le vendredi à 18h: 54762


### Clustering avec KMeans

In [21]:
X = sample[['Lat', 'Lon']].values
kmeans = KMeans(n_clusters=10, random_state=42).fit(X)
sample['kmeans_cluster'] = kmeans.labels_
kmeans_centroids = kmeans.cluster_centers_

[WinError 2] Le fichier spécifié est introuvable
  File "c:\Users\adela\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\adela\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\adela\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\adela\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus

### Clustering avec DBSCAN

In [22]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
dbscan = DBSCAN(eps=0.15, min_samples=50).fit(X_scaled)
sample['dbscan_cluster'] = dbscan.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample['dbscan_cluster'] = dbscan.labels_


### Visualisation avec Plotly

In [23]:
fig = px.scatter_mapbox(
    sample,
    lat="Lat",
    lon="Lon",
    color="kmeans_cluster",
    zoom=10,
    mapbox_style="carto-positron",
    title="Hot Zones - Vendredi 18h (KMeans)"
)

In [24]:
for i, (lat, lon) in enumerate(kmeans_centroids):
    fig.add_scattermapbox(
        lat=[lat],
        lon=[lon],
        mode="markers+text",
        marker=dict(size=14, color="red"),
        text=[f"Zone {i}"],
        textposition="top right"
    )

fig.show()

### DBScan pour comparaison

In [25]:
fig2 = px.scatter_mapbox(
    sample,
    lat="Lat",
    lon="Lon",
    color="dbscan_cluster",
    zoom=10,
    mapbox_style="carto-positron",
    title="Hot Zones - Vendredi 18h (DBSCAN)"
)
fig2.show()