In [2]:
# Imports
import pandas as pd # type: ignore
import numpy as np # type: ignore

# viz
import matplotlib.pyplot as plt
import seaborn as sns # type: ignore
import networkx as nx # type: ignore

# utils
import sys
sys.path.append('../')

from src.data.make_dataset import load_data

# Show every column when a wide frame is rendered (None removes the column cap).
pd.options.display.max_columns = None
# Apply matplotlib's 'fivethirtyeight' style sheet to all figures in the notebook.
plt.style.use('fivethirtyeight')

In [3]:
# Loading Data
# load_data() returns a dict of tables keyed by name; each table is bound to a
# notebook-level variable of the same name so later cells can use it directly.
data_dict = load_data()

# Order, customer, product, seller and geolocation tables.
(
    olist_customers_df,
    olist_geolocation_df,
    olist_order_items_df,
    olist_order_payments_df,
    olist_order_reviews_df,
    olist_orders_df,
    olist_products_df,
    olist_sellers_df,
    product_category_name_translation_df,
) = (
    data_dict[table_name]
    for table_name in (
        'olist_customers_df',
        'olist_geolocation_df',
        'olist_order_items_df',
        'olist_order_payments_df',
        'olist_order_reviews_df',
        'olist_orders_df',
        'olist_products_df',
        'olist_sellers_df',
        'product_category_name_translation_df',
    )
)

# Closed-deals and marketing-qualified-leads tables.
olist_closed_deals_df, olist_marketing_qualified_leads_df = (
    data_dict[table_name]
    for table_name in (
        'olist_closed_deals_df',
        'olist_marketing_qualified_leads_df',
    )
)

In [4]:
# Change purchase timestamp to datetime format
# Parse once, store the parsed column back on the orders table, then derive the
# calendar date, month number and year from the same parsed Series.
purchase_ts = pd.to_datetime(olist_orders_df['order_purchase_timestamp'])

olist_orders_df['order_purchase_timestamp'] = purchase_ts
olist_orders_df['order_purchase_date'] = purchase_ts.dt.date
olist_orders_df['order_purchase_month'] = purchase_ts.dt.month
olist_orders_df['order_purchase_year'] = purchase_ts.dt.year

In [5]:
# Merge into one dataset
# ToDo: turn into function

# Order-level joins, all keyed on order_id.
# NOTE(review): presumably items, payments and reviews can each hold several
# rows per order, in which case a row of `df` is one combination of them rather
# than one order -- confirm before aggregating monetary columns.
df = olist_orders_df.merge(olist_order_items_df, on='order_id', how='left')
# validate='m:m' is accepted by pandas but performs no check; it only marks this
# join as knowingly many-to-many.
df = df.merge(olist_order_payments_df, on='order_id', how='outer', validate='m:m')
df = df.merge(olist_order_reviews_df, on='order_id', how='outer')

# Dimension tables. The outer joins also keep products, customers and sellers
# that match no row on the left side.
df = df.merge(olist_products_df, on='product_id', how='outer')
df = df.merge(olist_customers_df, on='customer_id', how='outer')
df = df.merge(olist_sellers_df, on='seller_id', how='outer')

# Joining marketing data
df = df.merge(olist_closed_deals_df, on='seller_id', how='left')
df = df.merge(olist_marketing_qualified_leads_df, on='mql_id', how='left')

# Add the latitude and longitude for each customer's zip codes.
# The lookup is first collapsed to one mean coordinate per zip prefix: a left
# merge emits one output row per matching lookup row, so any prefix that appears
# more than once in the geolocation table would multiply the rows of `df`.
# If every prefix is already unique, the aggregation leaves the values unchanged.
customer_geo_df = (
    olist_geolocation_df
    .groupby('geolocation_zip_code_prefix', as_index=False)[['geolocation_lat', 'geolocation_lng']]
    .mean()
)

# `suffixes` only renames columns whose names collide between the two sides, so
# the geolocation_* columns are added here without a suffix; they receive
# '_customer' when the seller coordinates are merged in the next cell.
# validate='m:1' makes pandas raise if the lookup is not unique per prefix.
df = df.merge(customer_geo_df,
              left_on='customer_zip_code_prefix',
              right_on='geolocation_zip_code_prefix',
              how='left', suffixes=('', '_customer'), validate='m:1')


In [7]:
# Add the latitude and longitude for each seller's zip codes.
# Collapse the lookup to one mean coordinate per zip prefix before merging: a
# left merge emits one output row per matching lookup row, so repeated prefixes
# in the geolocation table would multiply the rows of `df`.
seller_geo_df = (
    olist_geolocation_df
    .groupby('geolocation_zip_code_prefix', as_index=False)[['geolocation_lat', 'geolocation_lng']]
    .mean()
)

# The customer merge left its geolocation_* columns unsuffixed, so they collide
# with the lookup's columns here: the existing ones become '*_customer' and the
# new ones '*_seller'. validate='m:1' raises if the lookup is not unique per prefix.
df = df.merge(seller_geo_df,
              left_on='seller_zip_code_prefix',
              right_on='geolocation_zip_code_prefix',
              how='left', suffixes=('_customer', '_seller'), validate='m:1')

# The suffixed names exist only because of that name collision; fail loudly here
# rather than later if the customer coordinates were not merged first.
required_coord_cols = {
    'geolocation_lat_customer', 'geolocation_lng_customer',
    'geolocation_lat_seller', 'geolocation_lng_seller',
}
missing_coord_cols = required_coord_cols - set(df.columns)
assert not missing_coord_cols, f"missing coordinate columns: {sorted(missing_coord_cols)}"

: 

In [1]:
# Scratch check: repeat the seller geolocation merge on the first 1000 rows of df.
# NOTE(review): this cell needs `df` from the merge cells above to exist in the
# kernel; the NameError recorded under it comes from running it without `df`.
seller_geo_cols = ['geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng']

df_small = df.head(1000).merge(
    olist_geolocation_df[seller_geo_cols],
    left_on='seller_zip_code_prefix',
    right_on='geolocation_zip_code_prefix',
    how='left',
    suffixes=('_customer', '_seller'),
)

NameError: name 'df' is not defined

### Variables

In [10]:
# Seasonality
# Encoded as the month number of the purchase timestamp.
purchase_timestamps = df['order_purchase_timestamp']
df['seasonality'] = purchase_timestamps.dt.month

In [11]:
# Product Dimensions
# Length x height x width of the product; with all three inputs in cm the
# result is a volume in cubic centimetres. A missing side yields NaN.
df['product_dimensions'] = (
    df['product_length_cm']
    .mul(df['product_height_cm'])
    .mul(df['product_width_cm'])
)

In [13]:
# Product Weight (in kg)
# Converted from the gram column by dividing by 1000.
weight_in_grams = df['product_weight_g']
df['product_weight_kg'] = weight_in_grams.div(1000)

In [12]:
# Has photos
# 1 when the product lists at least one photo, otherwise 0. A missing photo
# count compares as False and therefore maps to 0, exactly as the previous
# row-wise lambda did; the comparison is vectorized instead of calling a Python
# function once per row.
df['has_photos'] = df['product_photos_qty'].gt(0).astype('int64')

In [None]:
# Previous Review Score for the Same Product
# Order the rows chronologically within each product so that "previous" means
# the row immediately before in purchase time.
df = df.sort_values(by=['product_id', 'order_purchase_timestamp'])

# Shift review scores down by one row inside each product group; the first row
# of every product has no predecessor and receives NaN.
# NOTE(review): if `df` holds several rows for one order, the preceding row can
# belong to that same order and carry its own review score -- confirm.
scores_by_product = df.groupby('product_id')['review_score']
df['previous_review_score'] = scores_by_product.shift(1)

In [16]:
pip install geopy

Collecting geopy
  Downloading geopy-2.4.1-py3-none-any.whl.metadata (6.8 kB)
Collecting geographiclib<3,>=1.52 (from geopy)
  Downloading geographiclib-2.0-py3-none-any.whl.metadata (1.4 kB)
Downloading geopy-2.4.1-py3-none-any.whl (125 kB)
Downloading geographiclib-2.0-py3-none-any.whl (40 kB)
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-2.0 geopy-2.4.1
Note: you may need to restart the kernel to use updated packages.


In [17]:
# Distance (in km) 
from geopy.distance import geodesic

# Function to calculate distance between two sets of coordinates
def calculate_distance(row):
    """Return the geodesic distance in kilometres between a row's customer and seller.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'geolocation_lat_customer', 'geolocation_lng_customer',
        'geolocation_lat_seller' and 'geolocation_lng_seller'.

    Returns
    -------
    float
        Distance in kilometres, or NaN when any of the four coordinates is
        missing, so a column built from this function stays numeric.
    """
    customer_coords = (row['geolocation_lat_customer'], row['geolocation_lng_customer'])
    seller_coords = (row['geolocation_lat_seller'], row['geolocation_lng_seller'])

    # Coordinates left unmatched by the left merges are NaN, which a
    # `None in coords` test never matches; pd.isna recognises None, NaN and
    # pd.NA, so missing values are no longer passed on to geodesic.
    if any(pd.isna(value) for value in customer_coords + seller_coords):
        return float('nan')

    return geodesic(customer_coords, seller_coords).kilometers

: 

In [None]:
# Apply the distance calculation
# Only the four coordinate columns are handed to the row-wise apply:
# apply(axis=1) builds a Series for every row of the frame it is called on, and
# calculate_distance reads nothing but these columns, so narrowing the frame
# avoids materialising every wide, mixed-dtype row of `df`.
coord_cols = [
    'geolocation_lat_customer', 'geolocation_lng_customer',
    'geolocation_lat_seller', 'geolocation_lng_seller',
]
df['buyer_seller_distance_km'] = df[coord_cols].apply(calculate_distance, axis=1)