# Featurologists

> Engineering Labs #2: Feature Store for ML

## Install

`pip install -U git+https://github.com/artemlops/featurologists.git@master`

## Usage

In [None]:
#hide
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and live edits are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

### 01. Load and split dataset

In [None]:
from featurologists.data.load_split import load_csv, split_offline_online

In [None]:
# Load the dataset from ../data/data.csv and preview its first rows.
df = load_csv('../data/data.csv')
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom


In [None]:
import datetime

# Cut-off date handed to split_offline_online to separate the two partitions.
DATE_SPLIT = datetime.date(year=2011, month=10, day=1)
df_offline, df_online = split_offline_online(df, DATE_SPLIT)

# For each partition, show its shape followed by the earliest and latest
# InvoiceDate it contains.
display(
    df_offline.shape,
    df_offline['InvoiceDate'].min(),
    df_offline['InvoiceDate'].max(),
)
display(
    df_online.shape,
    df_online['InvoiceDate'].min(),
    df_online['InvoiceDate'].max(),
)

(370931, 8)

Timestamp('2010-12-01 08:26:00')

Timestamp('2011-09-30 17:22:00')

(170978, 8)

Timestamp('2011-10-02 10:32:00')

Timestamp('2011-12-09 12:50:00')

In [None]:
import os

NB_NAME = '01_data_split_offline_online'
OUT_DATA_PATH = f'../data/{NB_NAME}'
# Create the output directory (and any missing parents) in pure Python; this is
# a no-op when it already exists and does not depend on a POSIX shell `mkdir`.
os.makedirs(OUT_DATA_PATH, exist_ok=True)

# Persist both partitions without the pandas index column.
df_offline.to_csv(f'{OUT_DATA_PATH}/no_live_data.csv', index=False)
df_online.to_csv(f'{OUT_DATA_PATH}/raw_live_data.csv', index=False)
# Drop the notebook's references to the frames now that they are on disk.
del df, df_offline, df_online

### 02. Clean dataset rows

In [None]:
from featurologists.data.load_split import load_csv
from featurologists.data.clean_rows import clean_data_rows

In [None]:
# Read back the offline partition saved under ../data/01_data_split_offline_online
# and pass it through the row-cleaning step.
df = load_csv('../data/01_data_split_offline_online/no_live_data.csv')
df_cleaned = clean_data_rows(df)

KeyboardInterrupt: 

In [None]:
import os

NB_NAME = '02_data_clean_rows'
OUT_DATA_PATH = f'../data/{NB_NAME}'
# Create the output directory (and any missing parents) in pure Python; this is
# a no-op when it already exists and does not depend on a POSIX shell `mkdir`.
os.makedirs(OUT_DATA_PATH, exist_ok=True)

# Persist the cleaned rows without the pandas index column.
df_cleaned.to_csv(f'{OUT_DATA_PATH}/no_live_data__cleaned.csv', index=False)
# Drop the notebook's references to the frames now that the result is on disk.
del df, df_cleaned

### 03. Analyse keywords in product descriptions

In [None]:
from featurologists.data.load_split import load_csv
from featurologists.data.analyse_keywords import build_product_list, build_keywords_matrix

In [None]:
# Load the cleaned rows saved under ../data/02_data_clean_rows.
df_cleaned = load_csv('../data/02_data_clean_rows/no_live_data__cleaned.csv')
# Build the product list from the frame loaded in this cell. The name `df` is
# deleted at the end of the previous section, so it must not be used here.
list_products = build_product_list(df_cleaned)
# Preview the first five entries.
list_products[:5]

In [None]:
# Threshold values forwarded to build_keywords_matrix.
# NOTE(review): their exact meaning is defined in
# featurologists.data.analyse_keywords and is not visible here — confirm there.
THRESHOLD = [0, 1, 2, 3, 5, 10]
X = build_keywords_matrix(df_cleaned, list_products, THRESHOLD)
# Preview the first rows of the result.
X.head()

In [None]:
import os

NB_NAME = '03_data_compute_description_keywords'
OUT_DATA_PATH = f'../data/{NB_NAME}'
# Create the output directory (and any missing parents) in pure Python; this is
# a no-op when it already exists and does not depend on a POSIX shell `mkdir`.
os.makedirs(OUT_DATA_PATH, exist_ok=True)

# Persist the keywords matrix without the pandas index column.
X.to_csv(f'{OUT_DATA_PATH}/no_live_data__cleaned__keywords.csv', index=False)
# Drop the notebook's references to this section's objects.
del df_cleaned, list_products, X