## Goods

In [1]:
import numpy as np 
import pandas as pd

goods = pd.read_csv('goods.csv', sep=';')

print('Size of the data set: {} rows and {} columns'.format(*goods.shape))
goods.head()

Size of the data set: 34316 rows and 5 columns


Unnamed: 0,id,name,price,oldPrice,visible
0,617,Носки детские,69.0,69.0,0.0
1,618,Рейтузы детские для девочек,349.0,449.0,0.0
2,686,Трусы детские для девочек 3шт,249.0,249.0,0.0
3,687,Трусы детские для девочек 3шт,249.0,249.0,0.0
4,688,Трусы детские для девочек 2шт,199.0,299.0,0.0


In [2]:
for col in goods.columns:
    print('{} missing values in {} column'.format(goods[col].isnull().sum().sum(), col))

0 missing values in id column
1 missing values in name column
1 missing values in price column
1 missing values in oldPrice column
1 missing values in visible column


In [3]:
goods.dropna(inplace=True)
print('Size of the data set: {} rows and {} columns'.format(*goods.shape))

Size of the data set: 34315 rows and 5 columns


## Order goods

In [14]:
order_goods = pd.read_csv('order-goods.csv', sep=';', 
                          encoding='ISO-8859-1', dtype={'price': 'object'})

order_goods.price = order_goods.price.apply(lambda x: float((str(x).replace(' ', ''))))

print('Size of the data set: {} rows and {} columns'.format(*order_goods.shape))
order_goods.head()

Size of the data set: 897933 rows and 7 columns


Unnamed: 0,orderId,goodId,size,qty,price,originalPrice,originalOldPrice
0,451752.0,37101.0,70B,2.0,1.0,499.0,1099.0
1,451752.0,37129.0,S,2.0,499.0,499.0,0.0
2,451752.0,42809.0,one size,1.0,1.0,399.0,0.0
3,451752.0,37101.0,75C,1.0,499.0,499.0,1099.0
4,451752.0,42784.0,S,1.0,1.0,499.0,0.0


In [15]:
for col in order_goods.columns:
    print('{} missing values in {} column'.format(order_goods[col].isnull().sum().sum(), col))

3889 missing values in orderId column
1 missing values in goodId column
1 missing values in size column
1 missing values in qty column
1 missing values in price column
1 missing values in originalPrice column
1 missing values in originalOldPrice column


In [16]:
# order_goods[pd.isnull(order_goods.orderId) == True].head(2)
order_goods.dropna(inplace=True)

print('Size of the data set: {} rows and {} columns'.format(*order_goods.shape))

Size of the data set: 894044 rows and 7 columns


In [87]:
order_goods.groupby(order_goods.columns.tolist(), as_index=False).size().head()

order_id  goodId   size  qty  price   originalPrice  originalOldPrice
173042.0  19546.0  146   1.0  599.0   299.0          599.0               1
          19549.0  140   1.0  599.0   299.0          599.0               1
          23192.0  140   1.0  599.0   599.0          0.0                 1
          23206.0  140   1.0  1499.0  599.0          1499.0              1
          23223.0  146   1.0  699.0   499.0          699.0               1
dtype: int64

## Orders

In [17]:
orders = pd.read_csv('orders.csv', sep=';', dtype={'erp_status': 'object'})

orders.accept_time = pd.to_datetime(orders.accept_time)

print('Size of the data set: {} rows and {} columns'.format(*orders.shape))
orders.head()

Size of the data set: 174063 rows and 9 columns


Unnamed: 0,id,from_site,status,payment_type,payed,accept_time,total_cost,user_id,erp_status
0,451752.0,inflin,6.0,cash,0.0,2017-05-31 23:45:40,2399.0,357892.0,-
1,451751.0,inflin,6.0,cash,0.0,2017-05-31 23:44:14,2200.0,82507.0,-
2,451750.0,concept,6.0,cash,0.0,2017-05-31 23:39:14,6000.0,316491.0,-
3,451749.0,acoola,6.0,cash,0.0,2017-05-31 23:37:19,2101.0,336144.0,-
4,451746.0,concept,6.0,cash,0.0,2017-05-31 23:27:22,6300.0,332755.0,-


In [18]:
for col in orders.columns:
    print('{} missing values in {} column'.format(orders[col].isnull().sum().sum(), col))

1371 missing values in id column
1 missing values in from_site column
1 missing values in status column
74 missing values in payment_type column
1 missing values in payed column
1 missing values in accept_time column
1 missing values in total_cost column
1 missing values in user_id column
1 missing values in erp_status column


In [19]:
orders.dropna(inplace=True)
print('Size of the data set: {} rows and {} columns'.format(*orders.shape))

Size of the data set: 172619 rows and 9 columns


In [20]:
# Статусы заказов: 
# 3 - Оплачен, формируется к отправке
# 5 - Отменен
# 6 - Отправлен 
# 7 - Продажа оформлена

orders.status.value_counts()

6.0    148032
5.0     24540
3.0        43
7.0         4
Name: status, dtype: int64

## Data

In [47]:
order_goods.rename(columns={'orderId': 'order_id'}, inplace=True)
orders.rename(columns={'id': 'order_id'}, inplace=True)

In [94]:
data = pd.merge(orders, order_goods, how='inner', on='order_id')
data.shape

(667879, 15)

In [95]:
# orders[orders.user_id == 82507.0]
# order_goods[order_goods.order_id == 451751.0]
# order_goods.price[order_goods.order_id == 451751.0].sum()

data.head(2)

Unnamed: 0,order_id,from_site,status,payment_type,payed,accept_time,total_cost,user_id,erp_status,goodId,size,qty,price,originalPrice,originalOldPrice
0,451752.0,inflin,6.0,cash,0.0,2017-05-31 23:45:40,2399.0,357892.0,-,37101.0,70B,2.0,1.0,499.0,1099.0
1,451752.0,inflin,6.0,cash,0.0,2017-05-31 23:45:40,2399.0,357892.0,-,37129.0,S,2.0,499.0,499.0,0.0


## RFM table

In [103]:
NOW = dt.datetime(2014,12,31)
NOW

datetime.datetime(2014, 12, 31, 0, 0)

In [106]:
import datetime as dt

now = dt.datetime.now()

In [108]:
rfm_table = data.groupby('user_id').agg({'accept_time': lambda x: (now - x.max()).days, # Recency
                                         'order_id': lambda x: len(x), # Frequency
                                         'price': lambda x: x.sum()}) # Monetary Value

rfm_table.rename(columns={'accept_time': 'recency', 
                          'order_id': 'frequency', 
                          'price': 'monetary_value'}, inplace=True)
rfm_table.head()

Unnamed: 0_level_0,monetary_value,frequency,recency
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
48.0,58468.0,49.0,44
70.0,9052.0,13.0,164
102.0,12148.0,12.0,478
144.0,3912.0,3.0,455
151.0,1699.0,3.0,189


In [109]:
data.price[data.user_id == 48.0].sum()

58468.0

In [116]:
data.order_id[data.user_id == 48.0].count()

49

In [124]:
(now - dt.datetime(2017, 5, 18)).days

44