In [1]:
import pandas as pd
from func import distance

def clean_data(input_file_path, output_file_path):
    # Read the input CSV file
    df = pd.read_csv(input_file_path)

    # Drop unnecessary columns
    df = df.drop(['key', 'Unnamed: 0'], axis=1)

    # Remove rows with missing values
    df.dropna(axis=0, inplace=True)

    # Convert pickup_datetime to datetime object and adjust time zone to Eastern Standard Time (EST)
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], format='%Y-%m-%d %H:%M:%S %Z')
    df['pickup_datetime'] = df['pickup_datetime'].dt.tz_convert('America/New_York').dt.tz_localize(None)

    # Extract month, weekday, day, hour, and minute from pickup_datetime
    df['month'] = df['pickup_datetime'].dt.month
    df['weekday'] = df['pickup_datetime'].dt.day_name()
    df['day'] = df['pickup_datetime'].dt.day
    df['hour'] = df['pickup_datetime'].dt.hour
    df['minute'] = df['pickup_datetime'].dt.minute
    
    #Order the dataframe
    df.sort_values(by='pickup_datetime', ascending=True, inplace=True, ignore_index=True)

    # Compute distance using haversine formula
    df['distance'] =  distance(df['pickup_longitude'].to_numpy(),df['pickup_latitude'].to_numpy(),
                            df['dropoff_longitude'].to_numpy(), df['dropoff_latitude'].to_numpy())

    # Remove rows with invalid values
    df.drop(df[df['passenger_count'] > 6].index, axis=0, inplace=True)
    df.drop(df[df['fare_amount'] < 2.5].index, axis=0, inplace=True)
    df.drop(df[df['distance'] > 400].index, axis=0, inplace=True)
    df.drop(df[df['distance'] == 0].index, axis=0, inplace=True)
    df.dropna(axis=0, inplace=True)

    # Save the cleaned data to a new CSV file
    df.to_csv(output_file_path, index=False)
    
    return df

In [None]:
import pandas as pd
import numpy as np
import folium
from folium.plugins import HeatMap
from sklearn.cluster import DBSCAN
import time


def get_hot_spots(max_distance, min_pickups, ride_data):
    ## get coordinates from ride data
    coords = ride_data[['pickup_latitude', 'pickup_longitude']]

    ## calculate epsilon parameter using
    ## the user defined distance
    kms_per_radian = 6371.0088
    epsilon = max_distance / kms_per_radian

    ## perform clustering
    db = DBSCAN(eps=epsilon, min_samples=min_pickups,
                algorithm='ball_tree', metric='haversine').fit(np.radians(coords))

    ## group the clusters
    cluster_labels = db.labels_
    num_clusters = len(set(cluster_labels))
    clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])

    ## report
    print('Number of clusters: {}'.format(num_clusters))

    ## initialize lists for hot spots
    lat = []
    lon = []
    num_members = []

    ## loop through clusters and get centroids, number of members
    for i in range(len(set(cluster_labels))):
        if not clusters[i].empty:
            ## get centroid and magnitude of cluster
            lat.append(clusters[i]['pickup_latitude'].mean())
            lon.append(clusters[i]['pickup_longitude'].mean())
            num_members.append(len(clusters[i]))

    hot_spots = [lon, lat, num_members]
    return hot_spots


def create_heat_map(month, day, hour, pickups):
    df = pd.read_csv('Data/uber_limpios.csv')

    # get ride data
    ride_data = df.loc[((df['month']== month) & (df['day']== day) & (df['hour'] > hour))]

    # maximum distance between two cluster members in kilometers
    max_distance = 0.005

    # minimum number of cluster members
    min_pickups = pickups

    # call the get_hot_spots function
    hot_spots = get_hot_spots(max_distance, min_pickups, ride_data)
    
    #time.sleep(30)
    
    df_hot = pd.DataFrame({ 'Lat': hot_spots[0][1],'Lon': hot_spots[0][0], 'Numero': hot_spots[0][2]})
    

    m = folium.Map(location=[40.750584, -73.873010], zoom_start=10)

    # Crea un mapa de calor utilizando las coordenadas de hot_spots
    heat_map = HeatMap(df_hot)

    # Agrega el mapa de calor al mapa
    heat_map.add_to(m)

    # Muestra el mapa
    return m


In [9]:
clean_data('Data/uber.csv','Data/uber_limpios.csv')

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,month,weekday,day,hour,minute,distance
0,8.5,2008-12-31 20:15:22,-73.981918,40.779456,-73.957685,40.771043,2,12,Wednesday,31,20,15,2.244765
1,13.0,2008-12-31 20:59:17,-73.983759,40.721389,-73.994833,40.687179,2,12,Wednesday,31,20,59,3.916842
2,10.6,2008-12-31 21:05:03,-73.956635,40.771254,-73.991528,40.749778,2,12,Wednesday,31,21,5,3.786736
3,12.2,2008-12-31 21:09:13,-73.984605,40.728020,-73.955746,40.776830,1,12,Wednesday,31,21,9,5.946957
4,11.0,2008-12-31 21:13:41,-73.980127,40.737425,-74.009544,40.726025,4,12,Wednesday,31,21,13,2.784022
...,...,...,...,...,...,...,...,...,...,...,...,...,...
199994,18.5,2015-06-30 18:57:53,-73.971703,40.782207,-73.943680,40.827991,2,6,Tuesday,30,18,57,5.610774
199995,25.5,2015-06-30 19:16:42,-74.001099,40.730961,-73.957123,40.806908,2,6,Tuesday,30,19,16,9.221234
199996,20.0,2015-06-30 19:31:06,-73.999962,40.733135,-73.962448,40.773041,4,6,Tuesday,30,19,31,5.447442
199997,8.5,2015-06-30 19:33:33,-73.980988,40.762020,-73.960083,40.770531,1,6,Tuesday,30,19,33,1.998738


In [19]:
create_heat_map(3,21,15,25)

Number of clusters: 1


IndexError: list index out of range