In [2]:
import pandas as pd
import numpy as np
from zipfile import ZipFile, Path
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(rc={'axes.facecolor':'white'}) #para dark mode no vscode

In [3]:
import seaborn as sns

In [4]:
LIST_OF_FILES = ['olist_customers_dataset.csv',
'olist_geolocation_dataset.csv',
'olist_order_items_dataset.csv',
'olist_order_payments_dataset.csv',
'olist_order_reviews_dataset.csv',
'olist_orders_dataset.csv',
'olist_products_dataset.csv',
'olist_sellers_dataset.csv',
'product_category_name_translation.csv']

In [5]:
# Each dataset is stored into a position, accessible by its name as key
dict_dataset = {}

# Read all files
with ZipFile("archive.zip") as myzip:
    for file_name in LIST_OF_FILES:
        with myzip.open(file_name) as file:
            dict_dataset[file_name.split('.')[0]] = pd.read_csv(file, header=0, delimiter=",")

In [6]:
pd.Series(dict_dataset.keys())

0              olist_customers_dataset
1            olist_geolocation_dataset
2            olist_order_items_dataset
3         olist_order_payments_dataset
4          olist_order_reviews_dataset
5                 olist_orders_dataset
6               olist_products_dataset
7                olist_sellers_dataset
8    product_category_name_translation
dtype: object

# FULL TABLE

In [7]:
df = dict_dataset['olist_geolocation_dataset']

In [8]:
%%time
df['LAT_LONG'] = df.apply(lambda row :  str(row['geolocation_lat']) + str(row['geolocation_lng']), axis=1)

Wall time: 45.1 s


In [9]:
df_lat_long_in_CEPs = df.groupby('geolocation_zip_code_prefix').apply(lambda group : group['LAT_LONG'].nunique())

In [10]:
df_lat_long_in_CEPs =  df_lat_long_in_CEPs.rename('diff_lat_long').reset_index()

In [11]:
df_lat_long_in_CEPs.sort_values('diff_lat_long', ascending=False).head(10)

Unnamed: 0,geolocation_zip_code_prefix,diff_lat_long
9536,38400,746
4501,11680,727
8728,35500,726
4528,11740,666
9015,36400,627
9719,39400,620
8579,35162,611
9542,38408,600
9239,37200,595
8867,35900,589


In [12]:
df[df['geolocation_zip_code_prefix'] == 38400].head(5)

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state,LAT_LONG
645450,38400,-18.91596,-48.278974,uberlandia,MG,-18.91595976602804-48.278974174995646
645456,38400,-18.900442,-48.260759,uberlandia,MG,-18.900442445467096-48.26075855813959
645460,38400,-18.913015,-48.262132,uberlandia,MG,-18.913015152374623-48.26213189982101
645461,38400,-18.922381,-48.282111,uberlandia,MG,-18.92238123980033-48.28211103081535
645471,38400,-18.92216,-48.271125,uberlandia,MG,-18.922160210658436-48.27112484984707


In [13]:
LIST = []
for i in dict_dataset:
    LIST.append((i,dict_dataset[i].shape[0]))

In [14]:
pd.DataFrame(LIST, columns=['table', 'size']).sort_values('size', ascending=False)

Unnamed: 0,table,size
1,olist_geolocation_dataset,1000163
2,olist_order_items_dataset,112650
3,olist_order_payments_dataset,103886
0,olist_customers_dataset,99441
5,olist_orders_dataset,99441
4,olist_order_reviews_dataset,99224
6,olist_products_dataset,32951
7,olist_sellers_dataset,3095
8,product_category_name_translation,71


In [15]:
# olist_order_items_dataset

In [16]:
df_geolocation_dataset = dict_dataset['olist_geolocation_dataset']
df_orders_dataset = dict_dataset['olist_orders_dataset']
df_order_items_dataset = dict_dataset['olist_order_items_dataset']
df_order_payments_dataset = dict_dataset['olist_order_payments_dataset']
df_customers_dataset = dict_dataset['olist_customers_dataset']
df_order_reviews_dataset = dict_dataset['olist_order_reviews_dataset']
df_products_dataset = dict_dataset['olist_products_dataset']
df_sellers_dataset = dict_dataset['olist_sellers_dataset']
df_product_category_name_translation = dict_dataset['product_category_name_translation']

In [17]:
# <!-- ![backupPreview.png](attachment:backupPreview.png) -->

## A: order_items_dataset + products_dataset

In [18]:
A = df_order_items_dataset.merge(df_products_dataset, on='product_id', how='left')

In [19]:
A.head(2)

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29,cool_stuff,58.0,598.0,4.0,650.0,28.0,9.0,14.0
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93,pet_shop,56.0,239.0,2.0,30000.0,50.0,30.0,40.0


## B: sellers_dataset + geolocation_dataset

In [20]:
B = df_sellers_dataset.merge(df_geolocation_dataset.drop_duplicates('geolocation_zip_code_prefix'), \
                             how='left', \
                             left_on=['seller_zip_code_prefix'], \
                             right_on=['geolocation_zip_code_prefix'])

In [21]:
# List the columns in which we want to add a sufix
columns_to_add_sufix = ['geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng', 'geolocation_city', 'geolocation_state', 'LAT_LONG']

# Create a function to do that to the given columns
func_modifier = lambda col : str(col + "_seller")

# Apply the function to the desired columns
new_columns_names = list(map(lambda x : func_modifier(x) if x in columns_to_add_sufix else x, B))

B.columns = new_columns_names

## C: A + B

In [22]:
C = A.merge(B, how='left', on='seller_id')

In [23]:
C.head(2)

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,product_category_name,product_name_lenght,product_description_lenght,...,product_width_cm,seller_zip_code_prefix,seller_city,seller_state,geolocation_zip_code_prefix_seller,geolocation_lat_seller,geolocation_lng_seller,geolocation_city_seller,geolocation_state_seller,LAT_LONG_seller
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29,cool_stuff,58.0,598.0,...,14.0,27277,volta redonda,SP,27277.0,-22.498183,-44.123614,volta redonda,RJ,-22.498182754371285-44.12361371579433
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93,pet_shop,56.0,239.0,...,40.0,3471,sao paulo,SP,3471.0,-23.566258,-46.518417,sao paulo,SP,-23.56625832001269-46.51841724632121


## D: customer_dataset + geolocation_dataset

In [24]:
D = df_customers_dataset.merge(df_geolocation_dataset.drop_duplicates('geolocation_zip_code_prefix'), how='left', left_on=['customer_zip_code_prefix'], right_on=['geolocation_zip_code_prefix'])

In [25]:
# List the columns in which we want to add a sufix
columns_to_add_sufix = ['geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng', 'geolocation_city', 'geolocation_state']

# Create a function to do that to the given columns
func_modifier = lambda col : str(col + "_customer")

# Apply the function to the desired columns
new_columns_names = list(map(lambda x : func_modifier(x) if x in columns_to_add_sufix else x, D))

D.columns = new_columns_names

## E: All left tables

In [26]:
E = df_orders_dataset.merge(df_order_payments_dataset, how='left', on='order_id') \
                 .merge(df_order_reviews_dataset, how='left', on='order_id') \
                 .merge(D, how='left', on='customer_id')

## FULL TABLE

In [27]:
df_full_table = E.merge(C, how='left', on='order_id')

### Coverting to timestamp

In [28]:
orders_date_columns = ['order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date', 'order_delivered_customer_date', 'order_estimated_delivery_date']

for date_col in orders_date_columns:
    df_full_table[date_col] = pd.to_datetime(df_full_table[date_col])

In [29]:
# Python 3 program to calculate Distance Between Two Points on Earth
from math import radians, cos, sin, asin, sqrt
def distance(lat1, lat2, lon1, lon2):
     
    # The math module contains a function named
    # radians which converts from degrees to radians.
    lon1 = radians(lon1)
    lon2 = radians(lon2)
    lat1 = radians(lat1)
    lat2 = radians(lat2)
      
    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
 
    c = 2 * asin(sqrt(a))
    
    # Radius of earth in kilometers. Use 3956 for miles
    r = 6371
      
    # calculate the result
    return(c * r)
     
     
# driver code
lat1 = 53.32055555555556
lat2 = 53.31861111111111
lon1 = -1.7297222222222221
lon2 =  -1.6997222222222223
print(distance(lat1, lat2, lon1, lon2), "K.M")

2.0043678382716137 K.M


In [30]:
coords_cols = [col for col in df_full_table.columns if (('geolocation_lat' in col) or ('geolocation_lng' in col))]
df_full_table[['seller_city', 'customer_city', 'seller_id', 'customer_id'] + coords_cols]

Unnamed: 0,seller_city,customer_city,seller_id,customer_id,geolocation_lat_customer,geolocation_lng_customer,geolocation_lat_seller,geolocation_lng_seller
0,maua,sao paulo,3504c0cb71d7fa48d967e0e4c94d59d9,9ef432eb6251297304e76186b10a928d,-23.574809,-46.587471,-23.680114,-46.452454
1,maua,sao paulo,3504c0cb71d7fa48d967e0e4c94d59d9,9ef432eb6251297304e76186b10a928d,-23.574809,-46.587471,-23.680114,-46.452454
2,maua,sao paulo,3504c0cb71d7fa48d967e0e4c94d59d9,9ef432eb6251297304e76186b10a928d,-23.574809,-46.587471,-23.680114,-46.452454
3,belo horizonte,barreiras,289cdb325fb7e7f891c38608bf9e0962,b0830fb4747a6c6d20dea0b8c802d7ef,-12.169860,-44.988369,-19.810119,-43.984727
4,guariba,vianopolis,4869f7a5dfa277a7dca6462dcf3b52b2,41ce2a54c0b03bf3443c3d931a367089,-16.746337,-48.514624,-21.362358,-48.232976
...,...,...,...,...,...,...,...,...
119138,tupa,praia grande,1f9ab4708f3056ede07124aad39a2554,1fca14ff2861355f6e5f14306ff977a7,-24.001467,-46.446355,-21.935321,-50.497562
119139,sao paulo,nova vicosa,d50d79cb34e38265a8649c383dcffd48,1aa71eb042121263aafbe80c1b562c9c,-17.891522,-39.370942,-23.551013,-46.448489
119140,ilicinea,japuiba,a1043bafd471dff536d0c462352beb48,b331b74b18dc79bcdf6532d51e1637c1,-22.555985,-42.690761,-20.944706,-45.827098
119141,ilicinea,japuiba,a1043bafd471dff536d0c462352beb48,b331b74b18dc79bcdf6532d51e1637c1,-22.555985,-42.690761,-20.944706,-45.827098


In [31]:
# df_full_table.to_csv('df_full.csv', sep=';', decimal='.')

In [32]:
def calculate_distance_between_seller_and_customer(seller_id, customer_id):
    seller_lat = df_full_table[df_full_table['seller_id'] == seller_id]['geolocation_lat_seller'].head(1).values[0]
    seller_lng = df_full_table[df_full_table['seller_id'] == seller_id]['geolocation_lng_seller'].head(1).values[0]

    customer_lat = df_full_table[df_full_table['customer_id'] == customer_id]['geolocation_lat_customer'].head(1).values[0]
    customer_lng = df_full_table[df_full_table['customer_id'] == customer_id]['geolocation_lng_customer'].head(1).values[0]
    
    return distance(lat1=seller_lat, lat2=customer_lat, lon1=seller_lng , lon2=customer_lng)

In [33]:
orders_date_columns = ['order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date', 'order_delivered_customer_date', 'order_estimated_delivery_date']

In [34]:
df_customer_seller_distance_analysis = df_full_table.drop_duplicates('order_id')[['customer_id', 'seller_id'] + coords_cols + orders_date_columns]

In [35]:
df_customer_seller_distance_analysis = df_customer_seller_distance_analysis.dropna(subset=['customer_id', 'seller_id'])

In [None]:
%%time
df_customer_seller_distance_analysis['distance'] = \
                            df_customer_seller_distance_analysis.apply(lambda row : distance(lat1=row['geolocation_lat_seller'],
                                     lat2=row['geolocation_lat_customer'],
                                     lon1=row['geolocation_lng_seller'] ,
                                     lon2=row['geolocation_lng_customer']), axis=1)

# CORRELATION Hours to deliever X  Distance

In [None]:
df_customer_seller_distance_analysis

# Creating Video with Sellers/Customer evolution on Map

In [None]:
df_video = df_customer_seller_distance_analysis[['order_purchase_timestamp','geolocation_lat_customer','geolocation_lng_customer','geolocation_lat_seller','geolocation_lng_seller']]

In [None]:
df_video['purchase_date'] = df_video['order_purchase_timestamp'].dt.date

In [None]:
dates = df_video.sort_values('purchase_date', ascending=True).reset_index(drop=True)['purchase_date']

In [None]:
unique_dates = dates.unique()

In [None]:
# import pandas as pd
# from datetime import datetime

# MIN = df_full_table['order_purchase_timestamp'].min()
# MAX = df_full_table['order_purchase_timestamp'].max()

# dates = pd.date_range(start=MIN, end=MAX, freq='W')

In [None]:
df_aux = df_video[df_video['purchase_date'] <= date]
# Para tirar a densidade do gráfico, exibe uma amostra de 10% do total

fig, ax = plt.subplots(figsize = (18,21))

sns.scatterplot(data=df_aux,
                x='geolocation_lng_customer',
                y='geolocation_lat_customer',
#                 hue='Cluster',
                alpha=0.8,
                size=200,
#                 sizes=(100, 300),
                linewidth=0.4,
                palette='dark')


sns.scatterplot(data=df_aux,
                x='geolocation_lng_seller',
                y='geolocation_lat_seller',
#                 hue='Cluster',
                alpha=0.8,
                size=800,
                color='red',
#                 sizes=(100, 300),
                linewidth=0.0,
                palette='dark')


plt.style.use('dark_background')
ax.set_xlim(-70,-32);
ax.set_ylim(-36,5);

In [None]:
%%time

for date in unique_dates:
    df_aux = df_video[df_video['purchase_date'] <= date]
    # Para tirar a densidade do gráfico, exibe uma amostra de 10% do total

    fig, ax = plt.subplots(figsize = (18,21))

    sns.scatterplot(data=df_aux,
                    x='geolocation_lng_customer',
                    y='geolocation_lat_customer',
                    alpha=0.8,
                    size=200,
                    linewidth=0.4,
                    palette='dark')


    sns.scatterplot(data=df_aux,
                    x='geolocation_lng_seller',
                    y='geolocation_lat_seller',
                    alpha=0.8,
                    size=800,
                    color='red',
                    linewidth=0.0,
                    palette='dark')


    plt.style.use('dark_background')
    ax.set_xlim(-70,-32);
    ax.set_ylim(-36,5);

    plt.savefig(f'./evolution_map_daily_and_seller/customers_date_{date}.png')

In [None]:
import cv2
import numpy as np
import glob

img_array = []
for filename in glob.glob('./evolution_map_daily_and_seller/*.png'):
    img = cv2.imread(filename)
    height, width, layers = img.shape
    size = (width,height)
    img_array.append(img)


out = cv2.VideoWriter('video_customers_evolution_map_daily_and_seller.avi',cv2.VideoWriter_fourcc(*'DIVX'), 15, size)
 
for i in range(len(img_array)):
    out.write(img_array[i])
out.release()

In [None]:
# ! pip install opencv-python

In [None]:
import matplotlib.pyplot as plt

x=[-1 ,0.5 ,1,-0.5]
y=[ 0.5,  1, -0.5, -1]

plt.plot(x,y, 'ro')

def connectpoints(x,y,p1,p2):
    x1, x2 = x[p1], x[p2]
    y1, y2 = y[p1], y[p2]
    plt.plot([x1,x2],[y1,y2],'k-')

connectpoints(x,y,0,1)
connectpoints(x,y,2,3)

plt.axis('equal')
plt.show()

In [None]:
import seaborn as sns
import pandas as pd
from sklearn.cluster import KMeans


N_CLUSTER = 8
INIT = 'k-means++'
N_INIT = 10
MAX_ITER = 300


kmeans = KMeans(n_clusters = N_CLUSTER, #numero de clusters
                init = INIT,
                n_init = N_INIT, #algoritmo que define a posição dos clusters de maneira mais assertiva
                max_iter = MAX_ITER)

COLUMNS_TO_CLUSTER = [ # tirei 'number_of_orders'
                   'avg_distance_scaled',
                   'avg_ticket_scaled',
                   'lat_scaled', 'lng_scaled',
                   'avg_hours_to_deliver_scaled',
                   'avg_hours_delay_scaled',
                   'avg_freight_value_scaled']


kmeans.fit(df_concated[COLUMNS_TO_CLUSTER])

df_concated['Cluster'] = kmeans.predict(df_concated[COLUMNS_TO_CLUSTER])