In [343]:
bike_trips_data = pd.read_excel('bike_trip_details.xlsx')

In [326]:
def get_hex_id(lat,long,res=8):
    return h3.geo_to_h3(lat,long, res)


In [327]:
def add_extra_parameters(bike_trips_data):
    bike_trips_data['pick_hex_id'] = bike_trips_data.apply(lambda x: get_hex_id(x.pick_lat, x.pick_lng), axis=1)
    bike_trips_data['drop_hex_id'] = bike_trips_data.apply(lambda x: get_hex_id(x.drop_lat, x.drop_lng), axis=1)
    bike_trips_data['date_time']=bike_trips_data['timestamp'].apply(lambda x : datetime.fromtimestamp(x/1000))
    bike_trips_data['speed'] = (bike_trips_data['travel_distance']/bike_trips_data['travel_time'])*60
    return bike_trips_data

In [328]:
bike_trips_data = add_extra_parameters(bike_trips_data)

In [272]:
def top_five_hex_cluster(bike_trips_data):
    pairs_hex = bike_trips_data.groupby(['pick_hex_id','drop_hex_id'])['trip_id'].nunique().reset_index().sort_values(by='trip_id',ascending=False)
    pairs_hex.rename(columns={"trip_id":"Total trips"},inplace=True)
    pairs_hex['Hex pair(source_hex_id,destination_hex_id)'] = pairs_hex['pick_hex_id'].astype(str) + ' to ' \
                                                                  + pairs_hex['drop_hex_id'].astype(str) 
    top_pairs_hex = pairs_hex.head(5).reset_index(drop=True)
    top_pairs_hex['Rank'] = top_pairs_hex.index + 1
    return top_pairs_hex[['Rank','Hex pair(source_hex_id,destination_hex_id)','Total trips']]

In [336]:
def top_five_hex_cluster_with_no_trips_in_same_clusters(bike_trips_data):
    pairs_hex = bike_trips_data.groupby(['pick_hex_id','drop_hex_id'])['trip_id'].nunique().reset_index().sort_values(by='trip_id',ascending=False)
    pairs_hex.rename(columns={"trip_id":"Total trips"},inplace=True)
    pairs_hex = pairs_hex[pairs_hex['pick_hex_id'] != pairs_hex['drop_hex_id']].reset_index(drop=True)
    pairs_hex['Hex pair(source_hex_id,destination_hex_id)'] = pairs_hex['pick_hex_id'].astype(str) + ' to ' \
                                                                  + pairs_hex['drop_hex_id'].astype(str) 
    top_pairs_hex = pairs_hex.head(5).reset_index(drop=True)
    top_pairs_hex['Rank'] = top_pairs_hex.index + 1
    return top_pairs_hex[['Rank','Hex pair(source_hex_id,destination_hex_id)','Total trips']]

In [337]:
top_five_hex_cluster_with_no_trips_in_same_clusters(bike_trips_data)

Unnamed: 0,Rank,"Hex pair(source_hex_id,destination_hex_id)",Total trips
0,1,88586da335fffff to 88586da149fffff,762
1,2,88586da149fffff to 88586da335fffff,731
2,3,88586da331fffff to 88586da335fffff,697
3,4,88586da335fffff to 88586da331fffff,681
4,5,88586da149fffff to 88586da14dfffff,485


In [273]:
def get_mahalanobis_distance(x=None, data=None, cov=None):
    """Compute the Mahalanobis Distance between each row of x and the data  
    x    : vector or matrix of data with, say, p columns.
    data : ndarray of the distribution from which Mahalanobis distance of each observation of x is to be computed.
    cov  : covariance matrix (p x p) of the distribution. If None, will be computed from data.
    """
    x_minus_mu = x - np.mean(data)
    if not cov:
        cov = np.cov(data.values.T)
    inv_covmat = sp.linalg.inv(cov)
    left_term = np.dot(x_minus_mu, inv_covmat)
    mahal = np.dot(left_term, x_minus_mu.T)
    return mahal.diagonal()


In [16]:
top_five_hex_cluster(bike_trips_data)

Unnamed: 0,Rank,"Hex pair(source_hex_id,destination_hex_id)",Total trips
0,1,88586da335fffff to 88586da335fffff,1064
1,2,88586da335fffff to 88586da149fffff,762
2,3,88586da149fffff to 88586da335fffff,731
3,4,88586da331fffff to 88586da335fffff,697
4,5,88586da149fffff to 88586da149fffff,692


In [274]:
def avg_duration_between_first_and_second_trip(bike_trips_data):
    bike_trips_of_cus_more_than_one_trxn = bike_trips_data.copy()
    bike_trips_of_cus_more_than_one_trxn['lagged_date']  = bike_trips_data.sort_values(by=['date_time'], ascending=True).groupby(['customer_id'])['date_time'].shift(1)
    # gives you the data of the second ride of each cutomer which has previous ride time i.e first ride
    bike_trips_with_first_and_second_trxn_time = bike_trips_of_cus_more_than_one_trxn.sort_values(by= ['customer_id','date_time']).groupby('customer_id').nth(1).reset_index()
    bike_trips_with_first_and_second_trxn_time['time_diff'] = bike_trips_with_first_and_second_trxn_time['date_time'] - bike_trips_with_first_and_second_trxn_time['lagged_date']
    return bike_trips_with_first_and_second_trxn_time.time_diff.mean()


In [275]:
avg_duration_between_first_and_second_trip(bike_trips_data)

Timedelta('1 days 00:47:09.691580')

In [276]:
def remove_anomlies(df_in, col_name):
    """
    given the data frame removes the outlier as per quantile range 
    returns : the data frame with anamolies
    """
    q1 = df_in[col_name].quantile(0.25)
    q3 = df_in[col_name].quantile(0.75)
    iqr = q3-q1 #Interquartile range
    fence_low  = q1-1.5*iqr
    fence_high = q3+1.5*iqr
    
    df_out = df_in.loc[(df_in[col_name] >= fence_low) & (df_in[col_name] <= fence_high)]
    return df_out.reset_index(drop=True)

In [277]:
def remove_outlier_and_missing_data(bike_trips_data):
    """
    Will remove all the outliers as seen in the EDA
    """
    # missing values
    bike_trips_data = bike_trips_data[(bike_trips_data.travel_distance >0) | (~bike_trips_data.travel_distance.isnull())]
    bike_trips_data['mahalanobis_distance'] = get_mahalanobis_distance(x=bike_trips_data[['travel_distance', 'trip_fare']], \
                                        data=bike_trips_data[['travel_distance', 'trip_fare']])
    bike_trips_data = bike_trips_data[bike_trips_data['mahalanobis_distance']  <= 100].reset_index(drop=True)
    bike_trips_data = remove_anomlies(bike_trips_data, 'speed')
    return bike_trips_data
    

In [281]:
bike_trips_data = remove_outlier_and_missing_data(bike_trips_data)

In [283]:

print(pearsonr(bike_trips_data['travel_distance'], bike_trips_data['travel_time']))


(0.8556113050483931, 0.0)


In [346]:
bike_trips_data['interaction_term'] = bike_trips_data['travel_distance'] * bike_trips_data['travel_time']

In [347]:
def model_to_predict_trip_fare(bike_trips_data):
    df_features = bike_trips_data[['travel_distance','travel_time','interaction_term']]
    df_target = bike_trips_data[['trip_fare']]
    # Linear Regression Model
    lm = linear_model.LinearRegression()
    # split of data into 75% train data and 25% test data
    X_train, X_test, y_train, y_test = train_test_split(df_features, df_target, test_size=0.25, random_state=42)
    model = lm.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    rmse = sqrt(metrics.mean_squared_error(y_pred,y_test))
    print('-------------------------------------')
    print("Root mean square error(RMSE): {}".format(rmse))
    print("coefficient of determination R^2 of the prediction: {}".format(model.score(X_train,y_train)))
    print("Intercept: {}".format(model.intercept_))
    print("coefficients for travel_distance,travel_time,interaction_term  are: {}".format(model.coef_))
              
    return model

In [348]:
model_to_predict_trip_fare(bike_trips_data)

-------------------------------------
Root mean square error(RMSE): 5.4213712369187315
coefficient of determination R^2 of the prediction: 0.7684709870440589
Intercept: [16.94899108]
coefficients for travel_distance,travel_time,interaction_term  are: [[ 7.09280208 -0.38554088  0.06311175]]


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [317]:




print("The Root Mean Squared Error (RMSE) : {'rmse'}".format(rmse = sqrt(metrics.mean_squared_error(y_pred,y_test)))
print()

print(model.intercept_)

4.651946528080086
[19.95100727]


In [313]:
model.predict([[3.5,15,52.5]])

array([[38.31832616]])

5.379411955871469

In [255]:

# The pearsonr() SciPy function can be used to calculate the 
# Pearson’s correlation coefficient between two data samples with the same length.

model.predict([[3.5,15,52.5]])

array([[38.36611519]])

array([[ 6.75169355, -0.63388728,  0.08813868]])

array([19.61621605])