# CRO Project data mining / exploration of the historical bias 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import statsmodels.api as sm
from scipy.stats import chisquare
from datetime import datetime

# Hide deprecated warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
sellers_data = pd.read_csv('/olist_sellers_dataset.csv')
items_data = pd.read_csv('/olist_order_items_dataset.csv')

In [3]:
sellers_data.head()

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP
1,d1b65fc7debc3361ea86b5f14c68d2e2,13844,mogi guacu,SP
2,ce3ad9de960102d0677a81f5d0bb7b2d,20031,rio de janeiro,RJ
3,c0f3eea2e14555b6faeea3dd58c1b1c3,4195,sao paulo,SP
4,51a04a8a6bdcb23deccc82b0b80742cf,12914,braganca paulista,SP


In [4]:
sellers_data.shape

(3095, 4)

In [5]:
items_data.shape

(112650, 7)

In [6]:
sellers_items_df = pd.merge(sellers_data, items_data, on='seller_id', how='left')

In [7]:
sellers_items_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112650 entries, 0 to 112649
Data columns (total 10 columns):
seller_id                 112650 non-null object
seller_zip_code_prefix    112650 non-null int64
seller_city               112650 non-null object
seller_state              112650 non-null object
order_id                  112650 non-null object
order_item_id             112650 non-null int64
product_id                112650 non-null object
shipping_limit_date       112650 non-null object
price                     112650 non-null float64
freight_value             112650 non-null float64
dtypes: float64(2), int64(2), object(6)
memory usage: 9.5+ MB


### Preperation of datetime data

In [8]:
#Transform column shipping_limit_date to pd.to_datetime
sellers_items_df['shipping_limit_date'] = pd.to_datetime(sellers_items_df['shipping_limit_date'],format='%Y-%m-%d',utc=False)


In [9]:
#Create column with year values
year_values = sellers_items_df['shipping_limit_date'].dt.year
sellers_items_df['year'] = year_values.values

In [10]:
#Create column with month values 
month_values = sellers_items_df['shipping_limit_date'].dt.month
sellers_items_df['month'] = month_values.values

In [11]:
#Create column short_date with values month/year as strings
short_date = sellers_items_df['month'].astype(str) + '/' + sellers_items_df['year'].astype(str)

In [12]:
#Add column short_date to the principal dataframe
sellers_items_df['short_date'] =short_date.values

In [13]:
# #Create a dictionary with season categories

mapping={1:'summer', 2: 'summer', 3: 'summer', 4:'autumn', 5:'autumn', 6:'autumn',7:'winter',8:'autumn',
         9:'winter', 10:'spring', 11:'spring', 12:'spring'}

#Do the mapping
sellers_items_df['month'].map(mapping)

 #Add the mapping to a new column season and concatenate the year
sellers_items_df['season'] = sellers_items_df['month'].map(mapping) #+ '/' + sellers_items_df['year'].astype(str)

sellers_items_df['season_year'] = sellers_items_df['month'].map(mapping) + '/' + sellers_items_df['year'].astype(str)

In [14]:
sellers_items_df.head(2)

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state,order_id,order_item_id,product_id,shipping_limit_date,price,freight_value,year,month,short_date,season,season_year
0,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP,4a90af3e85dd563884e2afeab1091394,1,ffb64e34a37740dafb6c88f1abd1fa61,2017-08-25 20:50:19,106.2,9.56,2017,8,8/2017,autumn,autumn/2017
1,3442f8959a84dea7ee197c632cb2df15,13023,campinas,SP,6d953888a914b67350d5bc4d48f2acab,1,f4621f8ad6f54a2e3c408884068be46d,2017-05-11 16:25:11,101.7,15.92,2017,5,5/2017,autumn,autumn/2017


## Exploring data historial bias

#### Check the experience of each seller: months per seller

In [15]:
#How many months each seller have been selling
month_per_seller = sellers_items_df.groupby('seller_id').short_date.nunique()

In [16]:
month_per_seller

seller_id
0015a82c2db000af6aaaf3ae2ecb0532     1
001cca7ae9ae17fb1caed9dfb1094831    18
001e6ad469a905060d959994f1b41e4f     1
002100f778ceb8431b7a1020ff7ab48f     8
003554e2dce176b5555353e4f3555ac8     1
004c9cd9d87a3c30c522c48c4fc07416    16
00720abe85ba0859807595bbf045a33b     6
00ab3eff1b5192e5f1a63bcecfee11c8     1
00d8b143d12632bad99c0ad66ad52825     1
00ee68308b45bc5e2660cd833c3f81cc    11
00fc707aaaad2d31347cf883cd2dfe10     5
010543a62bd80aa422851e79a3bc7540     2
010da0602d7774602cd1b3f5fb7b709e     1
011b0eaba87386a2ae96a7d32bb531d1     1
01266d4c46afa519678d16a8b683d325     1
013900e863eace745d3ec7614cab5b1a     7
014c0679dd340a0e338872e7ec85666a    10
014d9a685fd57276679edd00e07089e5     5
0176f73cc1195f367f7b32db1e5b3aa8    10
01bcc9d254a0143f0ce9791b960b2a47     5
01c97ebb5cdac52891c0ed1c37ba0012     7
01cf7e3d21494c41fb86034f2e714fa1    10
01ed254b9ff8407dfb9d99ba1e17d923     3
01fd077212124329bac32490e8ef80d9     5
01fdefa7697d26ad920e9e0346d4bd1b    16
0241d4d5d36f10f

In [21]:
#transform series month_per_seller to pd dataframe
month_per_seller_df = month_per_seller.to_frame().reset_index()
month_per_seller_df = month_per_seller_df.rename(columns={"short_date": "months_per_seller"})

In [22]:
month_per_seller_df.head()

Unnamed: 0,seller_id,months_per_seller
0,0015a82c2db000af6aaaf3ae2ecb0532,1
1,001cca7ae9ae17fb1caed9dfb1094831,18
2,001e6ad469a905060d959994f1b41e4f,1
3,002100f778ceb8431b7a1020ff7ab48f,8
4,003554e2dce176b5555353e4f3555ac8,1


#### Engage seller 1: Orders / season / seller

In [23]:
#check the lenght of total orders ids to check duplicates
len(sellers_items_df['order_id'])

112650

In [24]:
#Check there are duplicates, so we will apply drop_duplicates on the groupby's.
len(sellers_items_df[['order_id', 'seller_id','season_year']].drop_duplicates())

100010

In [25]:
#order per season per seller count
orders_per_season_seller = sellers_items_df[['order_id','seller_id',
                'season']].drop_duplicates().groupby(['seller_id',
                                        'season']).count().reset_index().sort_values(by='seller_id')
orders_per_season_seller = orders_per_season_seller.rename(columns={"order_id": "orders_season"})

#orders per season mean
orders_per_season_mean = orders_per_season_seller.groupby('seller_id').mean()
#rename column mean
orders_per_season_mean = orders_per_season_mean.rename(columns={"orders_season": "orders_season_mean"})

#orders per season max
orders_per_season_max = orders_per_season_seller.groupby('seller_id').max()
#rename column max
orders_per_season_max = orders_per_season_max.rename(columns={"orders_season": "orders_season_max"})

#orders per season min
orders_per_season_min = orders_per_season_seller.groupby('seller_id').min()
#rename column min
orders_per_season_min = orders_per_season_min.rename(columns={"orders_season": "orders_season_min"})

In [26]:
orders_per_season_seller.head()

Unnamed: 0,seller_id,season,orders_season
0,0015a82c2db000af6aaaf3ae2ecb0532,spring,3
1,001cca7ae9ae17fb1caed9dfb1094831,autumn,80
2,001cca7ae9ae17fb1caed9dfb1094831,spring,54
3,001cca7ae9ae17fb1caed9dfb1094831,summer,31
4,001cca7ae9ae17fb1caed9dfb1094831,winter,35


In [27]:
orders_season_mean = pd.pivot_table(orders_per_season_seller, values='orders_season', index=['seller_id'],
                    columns=['season'], aggfunc=np.mean).fillna(0)

In [28]:
#merge df with: sellerid + mean + max + min
m1 = pd.merge(month_per_seller_df, orders_season_mean, on='seller_id', how='left')

In [35]:
#Drop unnecessary columns
seasons_clean = seasons.drop(['season_year_x','season_year_y'], axis = 1) 

In [30]:
#merge 2
a_2 = pd.merge(month_per_seller_df, orders_per_season_seller, on='seller_id', how='left')

In [31]:
#orders_per_season_max
orders_per_season_max = sellers_items_df[['order_id','seller_id','season_year']].drop_duplicates().groupby(['seller_id',
                                                         'season_year']).max().reset_index().sort_values(by='seller_id')
orders_per_season_max = orders_per_season_max.rename(columns={"order_id": "orders_season_max"})
b_2 = pd.merge(a_2, orders_per_season_max, on='seller_id', how='left')

#### Engage seller 2: orders/year/seller

In [32]:
orders_per_year_seller = sellers_items_df[['order_id','seller_id','year',]].drop_duplicates().groupby(['seller_id',
                                                    'year']).count().reset_index().sort_values(by='seller_id')
orders_per_year_seller = orders_per_year_seller.rename(columns={"order_id": "orders_x_year_seller"})

c_2 = pd.merge(a_2, orders_per_year_seller, on='seller_id', how='left')


In [33]:
orders_year = pd.pivot_table(orders_per_year_seller, values='orders_x_year_seller', index=['seller_id'],
                    columns=['year'], aggfunc=np.sum).fillna(0)

In [34]:
orders_year.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3095 entries, 0015a82c2db000af6aaaf3ae2ecb0532 to ffff564a4f9085cd26170f4732393726
Data columns (total 4 columns):
2016    3095 non-null float64
2017    3095 non-null float64
2018    3095 non-null float64
2020    3095 non-null float64
dtypes: float64(4)
memory usage: 120.9+ KB


In [35]:
orders_year = orders_year.drop([2020], axis = 1) 

In [36]:
m2 = pd.merge(m1, orders_year, on='seller_id', how='left')

In [37]:
orders_per_year_seller_max = sellers_items_df[['order_id','seller_id','year',]].drop_duplicates().groupby(['seller_id','year']).max().reset_index().sort_values(by='seller_id')
orders_per_year_seller_max = orders_per_year_seller_max.rename(columns={"order_id": "orders_x_year_seller_max"})

d_2 = pd.merge(c_2, orders_per_year_seller_max, on='seller_id', how='left')

#### Engage seller 3: orders/month/seller

In [38]:
orders_month_seller = sellers_items_df[['order_id','seller_id','short_date']].drop_duplicates().groupby(['seller_id',
                                                                    'short_date']).count().reset_index().sort_values(by='seller_id')

orders_month_seller = orders_month_seller.rename(columns={"order_id": "orders_month_seller"})

#orders per month mean
orders_month_seller_mean = orders_month_seller.groupby('seller_id').mean()
#rename column mean
orders_month_seller_mean = orders_month_seller_mean.rename(columns={"orders_month_seller": "orders_month_mean"})

#orders per month max
orders_month_seller_max = orders_month_seller.groupby('seller_id').max()
#rename column max
orders_month_seller_max = orders_month_seller_max.rename(columns={"orders_month_seller": "orders_month_max"})

#orders per month min
orders_month_seller_min = orders_month_seller.groupby('seller_id').min()
#rename column min
orders_month_seller_min = orders_month_seller_min.rename(columns={"orders_month_seller": "orders_month_min"})

In [39]:
#merge 1
month_1 = pd.merge(orders_month_seller_mean, orders_month_seller_max, on='seller_id', how='left')

#merge 2
month_df = pd.merge(month_1, orders_month_seller_min, on='seller_id', how='left')

In [40]:
month_df.head()

Unnamed: 0_level_0,orders_month_mean,short_date_x,orders_month_max,short_date_y,orders_month_min
seller_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0015a82c2db000af6aaaf3ae2ecb0532,3.0,10/2017,3,10/2017,3
001cca7ae9ae17fb1caed9dfb1094831,11.111111,9/2017,24,1/2018,1
001e6ad469a905060d959994f1b41e4f,1.0,8/2018,1,8/2018,1
002100f778ceb8431b7a1020ff7ab48f,6.375,9/2017,15,1/2018,2
003554e2dce176b5555353e4f3555ac8,1.0,12/2017,1,12/2017,1


In [41]:
#Drop unnecessary columns
month_clean = month_df.drop(['short_date_y'], axis = 1) 

In [42]:
month_clean.head()

Unnamed: 0_level_0,orders_month_mean,short_date_x,orders_month_max,orders_month_min
seller_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0015a82c2db000af6aaaf3ae2ecb0532,3.0,10/2017,3,3
001cca7ae9ae17fb1caed9dfb1094831,11.111111,9/2017,24,1
001e6ad469a905060d959994f1b41e4f,1.0,8/2018,1,1
002100f778ceb8431b7a1020ff7ab48f,6.375,9/2017,15,2
003554e2dce176b5555353e4f3555ac8,1.0,12/2017,1,1
