<h1 id="tocheading">Table of Contents</h1>
<div id="toc"></div>

In [2]:
%%javascript
// Fetch and execute an external script; presumably it fills the "toc" div declared
// above with a table of contents — TODO confirm against the script itself.
// NOTE(review): this needs a global jQuery `$` in the notebook front end and network access.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

# Data Cleaning

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
from datetime import datetime, date
from sklearn.cluster import KMeans

## Order Dataset

In [4]:
# Read the orders CSV from the working directory and print its schema and null counts.
orders_csv = 'olist_orders_dataset.csv'
order_df = pd.read_csv(orders_csv)
order_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
order_id                         99441 non-null object
customer_id                      99441 non-null object
order_status                     99441 non-null object
order_purchase_timestamp         99441 non-null object
order_approved_at                99281 non-null object
order_delivered_carrier_date     97658 non-null object
order_delivered_customer_date    96476 non-null object
order_estimated_delivery_date    99441 non-null object
dtypes: object(8)
memory usage: 6.1+ MB


In [5]:
# Display rows that repeat an earlier row in every column (empty result = no duplicates).
order_df.loc[order_df.duplicated()]

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date


In [6]:
# Remove the two columns this analysis does not use, then parse the remaining
# timestamp columns (read as text) into datetime64, echoing each column name.
order_df.drop(columns=['order_status', 'order_delivered_carrier_date'], inplace=True)
timestamp_columns = [
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
]
for column in timestamp_columns:
    print(column)
    order_df[column] = pd.to_datetime(order_df[column])
order_df.info()

order_purchase_timestamp
order_approved_at
order_delivered_customer_date
order_estimated_delivery_date
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 6 columns):
order_id                         99441 non-null object
customer_id                      99441 non-null object
order_purchase_timestamp         99441 non-null datetime64[ns]
order_approved_at                99281 non-null datetime64[ns]
order_delivered_customer_date    96476 non-null datetime64[ns]
order_estimated_delivery_date    99441 non-null datetime64[ns]
dtypes: datetime64[ns](4), object(2)
memory usage: 4.6+ MB


In [7]:
# Preview the first five cleaned order rows.
order_df.head(5)

Unnamed: 0,order_id,customer_id,order_purchase_timestamp,order_approved_at,order_delivered_customer_date,order_estimated_delivery_date
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-10 21:25:13,2017-10-18
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-08-07 15:27:45,2018-08-13
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-17 18:06:29,2018-09-04
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-12-02 00:28:42,2017-12-15
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-16 18:17:02,2018-02-26


## Payment Dataset

In [8]:
# Read the order payments CSV and print its schema and null counts.
payments_csv = 'olist_order_payments_dataset.csv'
payment_df = pd.read_csv(payments_csv)
payment_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 5 columns):
order_id                103886 non-null object
payment_sequential      103886 non-null int64
payment_type            103886 non-null object
payment_installments    103886 non-null int64
payment_value           103886 non-null float64
dtypes: float64(1), int64(2), object(2)
memory usage: 4.0+ MB


In [9]:
# Display payment rows that repeat an earlier row in every column.
payment_df.loc[payment_df.duplicated()]

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value


In [10]:
# Preview the first five payment rows.
payment_df.head(5)

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39
2,25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71
3,ba78997921bbcdc1373bb41e913ab953,1,credit_card,8,107.78
4,42fdf880ba16b47b59251dd489d4441a,1,credit_card,2,128.45


In [None]:
#payment_df.drop(['payment_sequential', 'payment_type','payment_installments'], axis='columns', inplace=True)
#payment_df.head()

In [None]:
# Number of distinct values in each payment column.
payment_df.nunique(axis=0)

## Customer Dataset

In [None]:
customer_df = pd.read_csv('olist_customers_dataset.csv')

In [None]:
customer_df.info()

In [None]:
customer_df.drop(['customer_zip_code_prefix', 'customer_city'], axis='columns', inplace=True)
customer_df.columns

In [None]:
customer_df[customer_df.duplicated()]

In [None]:
customer_df.nunique()

In [None]:
customer_df.head()

## Review Dataset

In [None]:
review_df = pd.read_csv('olist_order_reviews_dataset.csv')
review_df.info()

In [None]:
review_df.head()

In [None]:
# Turn the two free-text review columns into 0/1 flags: 1 where a value is
# present, 0 where it is missing.
# The flag is computed from the original column in one vectorised step instead
# of writing ints into the text column element-wise: that left an object column
# of mixed ints and can be rejected outright when pandas stores the text in a
# dedicated string dtype. The result is a plain integer column.
for columns in ['review_comment_title', 'review_comment_message']:
    review_df[columns] = review_df[columns].notna().astype(int)
review_df

In [None]:
# Distribution of review scores; the trailing semicolon hides the return value.
plt.hist(review_df['review_score']);

In [None]:
# customer_city is already removed when the customer table is first trimmed, so
# dropping it again here raised a KeyError. customer_state must stay: the
# per-state spend chart and the state features built later read it.
# errors='ignore' makes the cell a safe no-op once the columns are gone, so it
# can be re-run.
customer_df.drop(['customer_zip_code_prefix', 'customer_city'], axis='columns', inplace=True, errors='ignore')
customer_df.head()

In [None]:
# Swap the full purchase timestamp for a calendar-date column (Python date objects).
purchase_ts = order_df.pop('order_purchase_timestamp')
order_df['purchase_date'] = purchase_ts.dt.date
order_df

In [None]:
payment_order_df = pd.merge(payment_df, order_df, how='left', on='order_id')
payment_order_df

In [None]:
pay_order_cust_df = pd.merge(payment_order_df, customer_df, how='left', on='customer_id')
pay_order_cust_df

In [None]:
# Average payment value per payment type.
# payment_value is selected before aggregating, so the datetime/date columns in
# the frame are never summed (that raises on pandas 2.x). The previous
# sum()/count() over three separate group-bys is a single mean().
mean_value_by_type = pay_order_cust_df.groupby('payment_type')['payment_value'].mean()
plt.bar(mean_value_by_type.index, mean_value_by_type.values)

In [None]:
# Average payment value per customer state.
# payment_value is selected before aggregating, so the datetime/date columns in
# the frame are never summed (that raises on pandas 2.x). The previous
# sum()/count() over three separate group-bys is a single mean().
mean_value_by_state = pay_order_cust_df.groupby('customer_state')['payment_value'].mean()
plt.bar(mean_value_by_state.index, mean_value_by_state.values)

# Exploratory Data Analysis

In [None]:
plt.scatter(pay_order_cust_df.payment_installments, pay_order_cust_df.payment_value)

In [None]:
pay_order_cust_df['delay'] = (pay_order_cust_df.order_delivered_customer_date-pay_order_cust_df.order_estimated_delivery_date).dt.days
pay_order_cust_df

In [None]:
plt.plot(pay_order_cust_df.delay)

In [None]:
plt.hist(pay_order_cust_df.delay, bins=50);

In [None]:
# Fill missing numeric values (e.g. `delay` for orders without a delivery date)
# with the column median.
# numeric_only=True is required on pandas >= 2.0, where median() no longer
# silently skips the text/date columns in this frame and raises instead.
numeric_medians = pay_order_cust_df.median(numeric_only=True)
pay_order_cust_df.fillna(numeric_medians, inplace=True)

In [None]:
pocr_df = pd.merge(pay_order_cust_df, review_df, how='left', on='order_id')
pocr_df

In [None]:
poc1_df = pocr_df[(date(2016,10,1) <= pocr_df.purchase_date) &
                              (pocr_df.purchase_date < date(2017,10,1))]
poc2_df = pocr_df[(date(2017,10,1) <= pocr_df.purchase_date) & 
                              (pocr_df.purchase_date < date(2018,10,1))]

In [None]:
df1 = poc1_df.groupby('customer_unique_id').agg({'payment_installments':'mean'})
#df1 = poc1_df.groupby('customer_unique_id').agg({'payment_installments':'mean', 'delay':'max'})
#df1 = poc1_df.groupby('customer_unique_id').agg({'payment_installments':'mean', 'review_score':'mean'})


In [None]:
type_series = poc1_df.groupby('customer_unique_id')['payment_type'].agg(lambda x:x.value_counts().index[0])
type_series

In [None]:
df1 = pd.merge(df1, type_series, left_on='customer_unique_id', right_index=True)
df1

In [None]:
customer_df1 = customer_df.set_index('customer_unique_id')['customer_state']
customer_df1

In [None]:
df1 = pd.merge(df1, customer_df[['customer_unique_id', 'customer_state']], how='left', on='customer_unique_id')
df1

In [None]:
df1 = pd.merge(df1, customer_df.set_index('customer_unique_id')['customer_state'], left_on='customer_unique_id', right_index=True)
df1

In [None]:
df1 = pd.get_dummies(df1)
df1

In [None]:
df2 = poc2_df.groupby('customer_unique_id').agg({'payment_value':'sum'})
df2

In [None]:
common_cust = pd.merge(df1, df2, on='customer_unique_id')
common_cust.info()
common_cust.head()

In [None]:
# common_cust is built from df1, whose active aggregation has no review_score
# column, so `common_cust.review_score` raised AttributeError. The first-window
# mean review score per customer is computed here and paired with the
# second-window spend in df2 (inner join on the shared customer_unique_id index).
review_vs_spend = (
    poc1_df.groupby('customer_unique_id')['review_score'].mean()
    .to_frame()
    .join(df2, how='inner')
)
plt.scatter(review_vs_spend.review_score, review_vs_spend.payment_value)

In [None]:
# First-window mean installments against second-window spend.
plt.scatter(common_cust['payment_installments'], common_cust['payment_value'])

In [None]:
# common_cust is built from df1, whose active aggregation has no delay column,
# so `common_cust.delay` raised AttributeError. The first-window worst (max)
# delay per customer is computed here and paired with the second-window spend
# in df2 (inner join on the shared customer_unique_id index).
delay_vs_spend = (
    poc1_df.groupby('customer_unique_id')['delay'].max()
    .to_frame()
    .join(df2, how='inner')
)
plt.scatter(delay_vs_spend.delay, delay_vs_spend.payment_value)

In [None]:
# Second-window spend of the customers flagged with each one-hot payment-type column.
spend = common_cust['payment_value']
boleto_series = spend[common_cust['payment_type_boleto'] == 1]
credit_series = spend[common_cust['payment_type_credit_card'] == 1]
debit_series = spend[common_cust['payment_type_debit_card'] == 1]
voucher_series = spend[common_cust['payment_type_voucher'] == 1]

# Average spend per payment type (sum over the number of non-null values).
type_dict = {
    'boleto': boleto_series.sum()/boleto_series.count(),
    'credit': credit_series.sum()/credit_series.count(),
    'debit': debit_series.sum()/debit_series.count(),
    'voucher': voucher_series.sum()/voucher_series.count(),
}

type_dict

In [None]:
# Features: every column except the last. Target: the last column, which is the
# second-window payment_value merged in from df2.
X = common_cust.iloc[:,:-1]
y = common_cust.iloc[:,-1]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Ordinary least-squares linear regression (the old comment said "logistic").
# LinearRegression.score() returns the coefficient of determination R^2, not a
# classification accuracy, so the printed labels say R^2.
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
print('Train R^2: ', lr.score(X_train, y_train))
print('Test R^2: ', lr.score(X_test, y_test))

# Mean R^2 over 5 folds of the training split.
from sklearn.model_selection import cross_val_score
print('CV Score:', np.mean(cross_val_score(lr, X_train, y_train, cv = 5)))

In [None]:
# RFM table for the first window, one row per customer.
# Frequency uses nunique rather than count: poc1_df has one row per payment
# (and per matched review), so counting rows inflated the purchase frequency
# for orders paid in several parts.
# The 'recency' column holds the last purchase date at this point; it becomes a
# day count in the following cell.
rfm1_df = poc1_df.groupby('customer_unique_id').agg({'purchase_date':'max', 'order_id':'nunique', 'payment_value':'sum'})
rfm1_df.columns = ['recency', 'frequency', 'monetary']
rfm1_df

In [None]:
# Convert each customer's last purchase date into "days before the last purchase
# date in the first window".
# The reference date is computed once; it used to be recomputed over the whole
# frame inside the lambda for every row.
period_end = poc1_df.purchase_date.max()
rfm1_df['recency'] = rfm1_df['recency'].apply(lambda last_purchase: (period_end - last_purchase).days)
rfm1_df

In [None]:
#rfm1_df = pd.read_csv('rfm3m.csv', index_col=0)

In [None]:
# Standardise the three RFM columns so none dominates the distance metric, then
# cluster into four segments.
# random_state pins the centroid initialisation (same seed as the train/test
# splits): without it the segment ids could change on every run, and they feed
# the scatter colours and the regressions below.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(rfm1_df)
rfm1_km = KMeans(n_clusters=4, random_state=42).fit(X_scaled)

In [None]:
plt.scatter(rfm1_df.recency, rfm1_df.monetary, c=rfm1_km.labels_)

In [None]:
plt.scatter(rfm1_df.frequency, rfm1_df.monetary, c=rfm1_km.labels_)

In [None]:
# RFM table for the second window, one row per customer.
# Frequency uses nunique rather than count: poc2_df has one row per payment
# (and per matched review), so counting rows inflated the purchase frequency
# for orders paid in several parts.
# The 'recency' column holds the raw last purchase date here.
rfm2_df = poc2_df.groupby('customer_unique_id').agg({'purchase_date':'max', 'order_id':'nunique', 'payment_value':'sum'})
rfm2_df.columns = ['recency', 'frequency', 'monetary']
rfm2_df

In [None]:
m2 = rfm2_df[['monetary']]

In [None]:
clust1_df = pd.DataFrame()
clust1_df['customer_unique_id'] = rfm1_df.index
clust1_df['segment'] = rfm1_km.labels_
clust1_df

In [None]:
common_cust = pd.merge(clust1_df, m2, on='customer_unique_id')
common_cust

In [None]:
X = np.array(common_cust.segment).reshape(-1, 1)
y = common_cust.monetary

In [None]:
plt.scatter(X, y)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

from sklearn.linear_model import LinearRegression
lr = LinearRegression().fit(X_train, y_train)
print(lr.score(X_train, y_train))
print(lr.score(X_test, y_test))

In [None]:
lr.coef_

In [None]:
rfm1_m2_df = pd.merge(rfm1_df, m2, left_index=True, right_index=True)
rfm1_m2_df

In [None]:
plt.scatter(rfm1_m2_df.recency, rfm1_m2_df.monetary_y)

In [None]:
plt.scatter(rfm1_m2_df.frequency, rfm1_m2_df.monetary_y)

In [None]:
plt.scatter(rfm1_m2_df.monetary_x, rfm1_m2_df.monetary_y)

In [None]:
# Features: first-window recency, frequency and monetary_x. Target: the last
# column, monetary_y (second-window spend).
X = rfm1_m2_df.iloc[:,:-1]
y = rfm1_m2_df.iloc[:,-1]

# random_state matches the other splits in this notebook so the scores are
# reproducible from run to run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Ordinary least-squares linear regression (the old comment said "logistic").
# LinearRegression.score() returns R^2, not a classification accuracy, so the
# printed labels say R^2.
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
print('Train R^2: ', lr.score(X_train, y_train))
print('Test R^2: ', lr.score(X_test, y_test))

In [None]:
lr.coef_

In [None]:
common_cust[['monetary']]

In [None]:
# Split customers into two groups by second-window spend alone.
# random_state pins the initialisation so the 0/1 labelling is stable across
# runs; the label mean and the classifier fitted on these labels both depend on
# which cluster is called 1.
m6m_km = KMeans(n_clusters=2, random_state=42)
m6m_km.fit(common_cust[['monetary']])
# Strip plot: all points on y = 0 (a 1-D zero array, same length as x), coloured by cluster.
plt.scatter(common_cust.monetary, np.zeros(common_cust.shape[0]), c=m6m_km.labels_)

In [None]:
# Fraction of customers assigned to cluster 0 (labels are 0/1, so the mean is the share of 1s).
1 - np.mean(m6m_km.labels_)

In [None]:
# common_cust has no 'rfm_clust_3m' column; the first-window cluster id was
# stored as 'segment' when clust1_df was built, so the old lookup raised
# AttributeError.
plt.scatter(common_cust.segment, m6m_km.labels_)

In [None]:
# Predict the second-window spend cluster from the first-window segment id.
# The feature column is 'segment' (the name given when clust1_df was built);
# 'rfm_clust_3m' does not exist in common_cust and raised KeyError.
from sklearn.linear_model import LogisticRegression
clust_lr = LogisticRegression().fit(common_cust[['segment']], m6m_km.labels_)

In [None]:
# Mean accuracy of the classifier on the data it was fitted on. The feature
# column is 'segment'; 'rfm_clust_3m' does not exist in common_cust and raised
# KeyError.
clust_lr.score(common_cust[['segment']], m6m_km.labels_)