<a href="https://colab.research.google.com/github/adityasarve/Ai-project/blob/main/Feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Feature Engineering of pickup and delivery dataset.

In [2]:
import pandas as pd
import numpy as np
import geopy.distance
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant




In [3]:
delivery_data = pd.read_csv('/content/cleaned_delivery_data .csv')
pickup_data = pd.read_csv('/content/cleaned_pickup_data.csv')


print("Delivery Data Preview:")
print(delivery_data.head())

print("\nPickup Data Preview:")
print(pickup_data.head())


print("\nDelivery Data Types:")
print(delivery_data.dtypes)

print("\nPickup Data Types:")
print(pickup_data.dtypes)

Delivery Data Preview:
   order_id  region_id  courier_id        lng       lat  aoi_id  aoi_type  \
0   2031782         10          73  108.71571  30.90228      50        14   
1   4481765         10        3605  108.71605  30.90410      50        14   
2   3098203         10        1635  108.71797  30.94364     296        14   
3    356619         10        1635  108.71979  30.94130     296        14   
4   1484207         10        1635  108.72106  30.94164     296        14   

           accept_time      accept_gps_time  accept_gps_lng  accept_gps_lat  \
0  2024-10-22 10:26:00  2024-10-22 10:26:00       108.71826        30.95587   
1  2024-09-30 10:00:00  2024-09-30 10:00:00       108.71824        30.95583   
2  2024-07-10 08:33:00  2024-07-10 08:33:00       108.71801        30.95637   
3  2024-09-09 09:04:00  2024-09-09 09:04:00       108.71803        30.95629   
4  2024-10-19 08:29:00  2024-10-19 08:29:00       108.71820        30.95598   

         delivery_time    delivery_gps_

###  Feature Selection

In [3]:
# Define irrelevant columns for both datasets
irrelevant_columns = ['order_id', 'courier_id']

# Remove irrelevant columns from both datasets
delivery_data.drop(columns=irrelevant_columns, inplace=True)
pickup_data.drop(columns=irrelevant_columns, inplace=True)

# Check the modified datasets
print("\nModified Delivery Data:")
print(delivery_data.head())

print("\nModified Pickup Data:")
print(pickup_data.head())


Modified Delivery Data:
   region_id        lng       lat  aoi_id  aoi_type          accept_time  \
0         10  108.71571  30.90228      50        14  2024-10-22 10:26:00   
1         10  108.71605  30.90410      50        14  2024-09-30 10:00:00   
2         10  108.71797  30.94364     296        14  2024-07-10 08:33:00   
3         10  108.71979  30.94130     296        14  2024-09-09 09:04:00   
4         10  108.72106  30.94164     296        14  2024-10-19 08:29:00   

       accept_gps_time  accept_gps_lng  accept_gps_lat        delivery_time  \
0  2024-10-22 10:26:00       108.71826        30.95587  2024-10-22 17:04:00   
1  2024-09-30 10:00:00       108.71824        30.95583  2024-09-30 16:38:00   
2  2024-07-10 08:33:00       108.71801        30.95637  2024-07-10 13:24:00   
3  2024-09-09 09:04:00       108.71803        30.95629  2024-09-09 10:49:00   
4  2024-10-19 08:29:00       108.71820        30.95598  2024-10-19 10:11:00   

     delivery_gps_time  delivery_gps_lng  d

In [4]:

# Define the VIF calculation function
def calculate_vif(dataframe):
    # Replace infinite values with NaN and drop rows with NaN
    dataframe = dataframe.replace([np.inf, -np.inf], np.nan)
    dataframe = dataframe.dropna()

    X = add_constant(dataframe)
    vif = pd.DataFrame()
    vif['feature'] = X.columns
    vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif


In [5]:
# Handle Multicollinearity

numerical_columns_delivery = delivery_data.select_dtypes(include=[np.number]).columns
numerical_columns_pickup = pickup_data.select_dtypes(include=[np.number]).columns


vif_delivery = calculate_vif(delivery_data[numerical_columns_delivery])
print("VIF for Delivery Data:")
print(vif_delivery)

# Define the VIF calculation function
def calculate_vif(dataframe):
    # Replace infinite values with NaN
    dataframe = dataframe.replace([np.inf, -np.inf], np.nan)

    # Drop columns with all NaN values instead of rows
    dataframe = dataframe.dropna(axis=1, how='all')

    # If the dataframe is empty after dropping columns, return an empty DataFrame
    if dataframe.empty:
        return pd.DataFrame(columns=['feature', 'VIF'])

    X = add_constant(dataframe)
    vif = pd.DataFrame()
    vif['feature'] = X.columns
    vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif

vif_pickup = calculate_vif(pickup_data[numerical_columns_pickup])
print("VIF for Pickup Data:")
print(vif_pickup)

VIF for Delivery Data:
             feature           VIF
0              const   1361.996367
1          region_id      1.580925
2                lng  17499.902183
3                lat   3603.483845
4             aoi_id      1.000729
5           aoi_type      1.019643
6     accept_gps_lng   4976.165787
7     accept_gps_lat   2359.159804
8   delivery_gps_lng   6612.197891
9   delivery_gps_lat   3000.599086
10                ds      1.017373
11          distance     78.459075
12              city      2.747946
VIF for Pickup Data:
           feature           VIF
0            const    464.486956
1        region_id      1.600291
2              lng  13989.626976
3              lat  10370.463524
4           aoi_id      1.000652
5         aoi_type      1.012318
6   pickup_gps_lng  16580.683304
7   pickup_gps_lat  14078.007563
8   accept_gps_lng  12340.834776
9   accept_gps_lat  12960.479735
10              ds      1.012086
11        distance      1.026111


In [15]:





delivery_data.dtypes

Unnamed: 0,0
order_id,int64
region_id,int64
courier_id,int64
lng,float64
lat,float64
aoi_id,int64
aoi_type,int64
accept_time,object
accept_gps_time,object
accept_gps_lng,float64


In [None]:
# Feature Importance
X_delivery = delivery_data.drop(columns=['delivery_duration'])

# Convert to datetime objects first
for col in X_delivery.select_dtypes(include=['object']).columns:
    try:
        X_delivery[col] = pd.to_datetime(X_delivery[col])
    except ValueError:
        pass

datetime_cols = X_delivery.select_dtypes(include=['datetime64']).columns
for col in datetime_cols:
    X_delivery[col + '_unix'] = X_delivery[col].astype(np.int64) // 10**9

# Drop the original datetime columns
X_delivery = X_delivery.drop(columns=datetime_cols)

# Convert any remaining object columns to numerical using one-hot encoding
X_delivery = pd.get_dummies(X_delivery, columns=X_delivery.select_dtypes(include=['object']).columns)


# Convert 'delivery_duration' to numerical (seconds)
y_delivery = pd.to_timedelta(delivery_data['delivery_duration']).dt.total_seconds()

model_delivery = RandomForestRegressor()
model_delivery.fit(X_delivery, y_delivery)

# Feature importance for delivery data
importances_delivery = model_delivery.feature_importances_
feature_names_delivery = X_delivery.columns

# For Pickup Data
X_pickup = pickup_data.drop(columns=['pickup_duration', 'accept_time', 'pickup_time'])  # Drop date/time columns if present
y_pickup = pickup_data['pickup_duration']

### Create New Features

In [8]:
import geopy.distance

# Function to calculate distance for delivery data
def calculate_distance_delivery(row):
    coord1 = (row['accept_gps_lat'], row['accept_gps_lng'])
    coord2 = (row['delivery_gps_lat'], row['delivery_gps_lng'])
    return geopy.distance.distance(coord1, coord2).km

# Function to calculate distance for pickup data
def calculate_distance_pickup(row):
    coord1 = (row['accept_gps_lat'], row['accept_gps_lng'])
    coord2 = (row['pickup_gps_lat'], row['pickup_gps_lng'])  # Use pickup coordinates
    return geopy.distance.distance(coord1, coord2).km

# Create new distance feature for delivery data
delivery_data['distance_to_delivery'] = delivery_data.apply(calculate_distance_delivery, axis=1)

# Create new distance feature for pickup data
pickup_data['distance_to_pickup'] = pickup_data.apply(calculate_distance_pickup, axis=1)

# Check new features
print("\nModified Delivery Data with New Distance Feature:")
print(delivery_data[['distance_to_delivery']].head())

print("\nModified Pickup Data with New Distance Feature:")
print(pickup_data[['distance_to_pickup']].head())


Modified Delivery Data with New Distance Feature:
   distance_to_delivery
0              5.365747
1              5.740989
2              1.526680
3              1.553725
4              1.617232

Modified Pickup Data with New Distance Feature:
   distance_to_pickup
0            0.520530
1            4.596794
2            4.623857
3            0.688644
4            0.554804


### Feature Transformation

In [9]:
# Normalization and Standardization
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Min-Max Scaling
scaler = MinMaxScaler()

delivery_data[['lat', 'lng', 'distance_to_delivery']] = scaler.fit_transform(delivery_data[['lat', 'lng', 'distance_to_delivery']])
pickup_data[['lat', 'lng', 'distance_to_pickup']] = scaler.fit_transform(pickup_data[['lat', 'lng', 'distance_to_pickup']])

# Standard Scaling
scaler = StandardScaler()

delivery_data[['lat', 'lng', 'distance_to_delivery']] = scaler.fit_transform(delivery_data[['lat', 'lng', 'distance_to_delivery']])
pickup_data[['lat', 'lng', 'distance_to_pickup']] = scaler.fit_transform(pickup_data[['lat', 'lng', 'distance_to_pickup']])

print("\nScaled Delivery Data:")
print(delivery_data[['lat', 'lng', 'distance_to_delivery']].head())

print("\nScaled Pickup Data:")
print(pickup_data[['lat', 'lng', 'distance_to_pickup']].head())


Scaled Delivery Data:
        lat       lng  distance_to_delivery
0  0.040747 -1.645537              0.022943
1  0.041729 -1.645477              0.027129
2  0.063061 -1.645139             -0.019886
3  0.061799 -1.644818             -0.019585
4  0.061982 -1.644595             -0.018876

Scaled Pickup Data:
        lat       lng  distance_to_pickup
0 -0.825953 -2.130078           -0.110476
1 -0.825910 -2.130068            0.499312
2 -0.825923 -2.130089            0.503360
3 -0.821809 -2.130092           -0.085327
4 -0.821862 -2.130249           -0.105348


### Feature Scaling

In [10]:
# Robust Scaling (Outlier Handling)
from sklearn.preprocessing import RobustScaler

robust_scaler = RobustScaler()

delivery_data[['lat', 'lng', 'distance_to_delivery']] = robust_scaler.fit_transform(delivery_data[['lat', 'lng', 'distance_to_delivery']])
pickup_data[['lat', 'lng', 'distance_to_pickup']] = robust_scaler.fit_transform(pickup_data[['lat', 'lng', 'distance_to_pickup']])


print("\nRobust Scaled Delivery Data:")
print(delivery_data[['lat', 'lng', 'distance_to_delivery']].head())

print("\nRobust Scaled Pickup Data:")
print(pickup_data[['lat', 'lng', 'distance_to_pickup']].head())


Robust Scaled Delivery Data:
        lat       lng  distance_to_delivery
0  0.568303 -8.292497              1.761548
1  0.570050 -8.292252              1.952203
2  0.608006 -8.290871             -0.189027
3  0.605760 -8.289563             -0.175286
4  0.606086 -8.288649             -0.143019

Robust Scaled Pickup Data:
        lat       lng  distance_to_pickup
0 -0.223078 -9.346295           -0.068090
1 -0.223054 -9.346254            3.863163
2 -0.223061 -9.346335            3.889263
3 -0.220726 -9.346348            0.094043
4 -0.220757 -9.346949           -0.035035


### Feature Reduction

In [12]:
#Principal Component Analysis (PCA)

from sklearn.decomposition import PCA

pca = PCA(n_components=3)


principal_components_delivery = pca.fit_transform(delivery_data[['lat', 'lng', 'distance_to_delivery']])

# Add PCA components as new features
pca_df_delivery = pd.DataFrame(principal_components_delivery, columns=[f'PC{i+1}' for i in range(principal_components_delivery.shape[1])])
delivery_data = pd.concat([delivery_data, pca_df_delivery], axis=1)


principal_components_pickup = pca.fit_transform(pickup_data[['lat', 'lng', 'distance_to_pickup']])

# Add PCA components as new features
pca_df_pickup = pd.DataFrame(principal_components_pickup, columns=[f'PC{i+1}' for i in range(principal_components_pickup.shape[1])])
# Fix: Concatenate with pickup_data instead of c_pickup_data
pickup_data = pd.concat([pickup_data, pca_df_pickup], axis=1)

print("\nDelivery Data with PCA Features:")
print(delivery_data[['PC1', 'PC2', 'PC3']].head())

print("\nPickup Data with PCA Features:")
print(pickup_data[['PC1', 'PC2', 'PC3']].head())


Delivery Data with PCA Features:
        PC1       PC1       PC2       PC2       PC3       PC3
0  1.044479  1.044479 -6.568270 -6.568270  1.432373  1.432373
1  1.235135  1.235135 -6.567714 -6.567714  1.433917  1.433917
2 -0.906071 -0.906071 -6.558251 -6.558251  1.472124  1.472124
3 -0.892331 -0.892331 -6.557428 -6.557428  1.469651  1.469651
4 -0.860063 -0.860063 -6.556473 -6.556473  1.469765  1.469765

Pickup Data with PCA Features:
        PC1       PC2       PC3
0 -0.581444 -8.196125  0.157461
1  3.349307 -8.133310  0.157804
2  3.375406 -8.132975  0.157805
3 -0.419334 -8.193410  0.159823
4 -0.548386 -8.196072  0.159829
