In [75]:
import pandas as pd

In [83]:
# Directory that holds the CSV exports, relative to this notebook.
path = '../../data/'

# Read the four source tables; names are bound in the same order as the
# file names listed in the tuple below.
orderlines, orders, brands, products = (
    pd.read_csv(path + csv_name)
    for csv_name in ('orderlines.csv', 'orders.csv', 'brands.csv', 'products.csv')
)

# Orders Pipeline

In [99]:
def start_pipeline(orders_data):
    """Return a copy of ``orders_data`` (``DataFrame.copy`` with default arguments).

    Used as the first pipeline step so the following steps operate on the
    copy rather than on the frame that was passed in.
    """
    working_copy = orders_data.copy()
    return working_copy

def set_types_orders(orders_data):
    """Return ``orders_data`` with ``created_date`` converted by ``pd.to_datetime``.

    The input frame is not modified; ``assign`` produces a new frame in which
    the ``created_date`` column is replaced by its parsed version.
    """
    parsed_dates = pd.to_datetime(orders_data['created_date'])
    return orders_data.assign(created_date=parsed_dates)

def remove_NaN_orders(orders_data):
    """Return ``orders_data`` without the rows that contain any missing value."""
    # axis=0 / how='any' are the defaults of ``dropna``, spelled out here.
    complete_rows = orders_data.dropna(axis=0, how='any')
    return complete_rows

In [101]:
# Apply the orders cleaning steps in sequence. The resulting frame is the
# cell's displayed value; it is not bound to a name.
orders.pipe(start_pipeline).pipe(set_types_orders).pipe(remove_NaN_orders)

Unnamed: 0,order_id,created_date,total_paid,state
0,241319,2017-01-02 13:35:40,44.99,Cancelled
1,241423,2017-11-06 13:10:02,136.15,Completed
2,242832,2017-12-31 17:40:03,15.76,Completed
3,243330,2017-02-16 10:59:38,84.98,Completed
4,243784,2017-11-24 13:35:19,157.86,Cancelled
...,...,...,...,...
226904,527397,2018-03-14 13:56:38,42.99,Place Order
226905,527398,2018-03-14 13:57:25,42.99,Shopping Basket
226906,527399,2018-03-14 13:57:34,141.58,Shopping Basket
226907,527400,2018-03-14 13:57:41,19.98,Shopping Basket


# Orderlines Pipeline

In [109]:
# Inspect the column dtypes and non-null counts of `orderlines` as loaded from CSV.
orderlines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 293983 entries, 0 to 293982
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   id                293983 non-null  int64 
 1   id_order          293983 non-null  int64 
 2   product_id        293983 non-null  int64 
 3   product_quantity  293983 non-null  int64 
 4   sku               293983 non-null  object
 5   unit_price        293983 non-null  object
 6   date              293983 non-null  object
dtypes: int64(4), object(3)
memory usage: 15.7+ MB


In [137]:
def start_pipeline(orderlines_data):
    """Return a copy of ``orderlines_data`` (``DataFrame.copy`` with default arguments).

    Used as the first pipeline step so the following steps operate on the
    copy rather than on the frame that was passed in.
    """
    # NOTE(review): an equivalent `start_pipeline` is already defined in the
    # orders section; running this cell rebinds the same name.
    working_copy = orderlines_data.copy()
    return working_copy

def set_types_orderlines(orderlines_data):
    """Return ``orderlines_data`` with ``date`` converted by ``pd.to_datetime``.

    The input frame is not modified; ``assign`` produces a new frame in which
    the ``date`` column is replaced by its parsed version.
    """
    timestamps = pd.to_datetime(orderlines_data['date'])
    return orderlines_data.assign(date=timestamps)

def remove_dots(orderlines_data):
    """Add a ``unit_price_nd`` column: ``unit_price`` with every dot removed.

    Parameters
    ----------
    orderlines_data : pandas.DataFrame
        Frame with a string-valued ``unit_price`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the extra ``unit_price_nd`` column; the input frame
        and its ``unit_price`` column are left unchanged.
    """
    # Literal (non-regex) replacement: removes the same characters as the
    # former regex pattern, but without the invalid escape sequence that a
    # backslash-dot inside a non-raw string literal triggers a warning for.
    digits_only = orderlines_data['unit_price'].str.replace('.', '', regex=False)
    return orderlines_data.assign(unit_price_nd=digits_only)

def temp_prices(orderlines_data):
    """Split ``unit_price_nd`` into two helper string columns.

    ``euro`` receives everything except the last two characters and
    ``eurocent`` receives the last two characters of ``unit_price_nd``.
    Returns a new frame; the input is not modified.
    """
    digits = orderlines_data['unit_price_nd'].str
    return orderlines_data.assign(euro=digits[:-2], eurocent=digits[-2:])

def new_price(orderlines_data):
    """Add a ``price`` string column built as ``euro`` + '.' + ``eurocent``.

    Returns a new frame; the input is not modified.
    """
    whole_part = orderlines_data['euro']
    cent_part = orderlines_data['eurocent']
    return orderlines_data.assign(price=whole_part + '.' + cent_part)

def change_price_to_numeric(orderlines_data):
    """Return ``orderlines_data`` with ``price`` converted by ``pd.to_numeric``.

    Returns a new frame; the input is not modified.
    """
    numeric_price = pd.to_numeric(orderlines_data['price'])
    return orderlines_data.assign(price=numeric_price)

def dropping_columns(orderlines_data):
    """Return ``orderlines_data`` without ``product_id`` and the price helper columns.

    Removes ``product_id``, ``unit_price_nd``, ``euro``, ``eurocent`` and
    ``unit_price``; all of them must be present in the input frame.
    """
    obsolete = ['product_id', 'unit_price_nd', 'euro', 'eurocent', 'unit_price']
    return orderlines_data.drop(columns=obsolete)

In [138]:
# Apply the orderlines cleaning steps in sequence. The resulting frame is only
# displayed as the cell output; it is not assigned, so `orderlines` itself
# keeps its original columns.
(
orderlines
    .pipe(start_pipeline)
    .pipe(set_types_orderlines)
    .pipe(remove_dots)
    .pipe(temp_prices)  # was `temp_price`, a name this notebook never defines
    .pipe(new_price)
    .pipe(change_price_to_numeric)
    .pipe(dropping_columns)
)

Unnamed: 0,id,id_order,product_quantity,sku,date,price
0,1119109,299539,1,OTT0133,2017-01-01 00:07:19,18.99
1,1119110,299540,1,LGE0043,2017-01-01 00:19:45,399.00
2,1119111,299541,1,PAR0071,2017-01-01 00:20:57,474.05
3,1119112,299542,1,WDT0315,2017-01-01 00:51:40,68.39
4,1119113,299543,1,JBL0104,2017-01-01 01:06:38,23.74
...,...,...,...,...,...,...
293978,1650199,527398,1,JBL0122,2018-03-14 13:57:25,42.99
293979,1650200,527399,1,PAC0653,2018-03-14 13:57:34,141.58
293980,1650201,527400,2,APP0698,2018-03-14 13:57:41,9.99
293981,1650202,527388,1,BEZ0204,2018-03-14 13:58:01,19.99


In [141]:
# Re-inspect `orderlines`: the pipeline result above was not assigned to a
# name, so this reports the same columns and dtypes as the earlier call.
orderlines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 293983 entries, 0 to 293982
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   id                293983 non-null  int64 
 1   id_order          293983 non-null  int64 
 2   product_id        293983 non-null  int64 
 3   product_quantity  293983 non-null  int64 
 4   sku               293983 non-null  object
 5   unit_price        293983 non-null  object
 6   date              293983 non-null  object
dtypes: int64(4), object(3)
memory usage: 15.7+ MB
