# DataPot Usage Examples

In [27]:
import pandas as pd
from __future__ import print_function
import sys
import bz2
import time
import xgboost as xgb
from sklearn.model_selection import cross_val_score

import datapot as dp
from datapot.utils import csv_to_jsonlines
from datapot.datasets import fetch_imdb, fetch_job_salary

## Dataset with timestamp features extraction.

In [26]:
transactions = pd.read_csv('../data/transactions.csv')
transactions.head()

Unnamed: 0,merchant_id,latitude,longitude,real_transaction_dttm,record_date
0,178,0.0,0.0,9:34:47,9:30:36
1,178,55.055995,82.912991,17:49:50,17:54:24
2,178,0.0,0.0,9:34:47,9:31:22
3,178,55.056034,82.912734,17:49:50,17:43:01
4,178,55.056034,82.912734,17:49:50,17:45:17


Creating the DataPot object.

In [16]:
import datapot as dp
datapot = dp.DataPot()

In [17]:
from datapot.utils import csv_to_jsonlines

csv_to_jsonlines('../data/transactions.csv', '../data/transactions.jsonlines')

In [18]:
data_trns = open('../data/transactions.jsonlines')
data_trns.readline()

'{"merchant_id":178,"latitude":0.0,"longitude":0.0,"real_transaction_dttm":"9:34:47","record_date":"9:30:36"}\n'

Let's call the fit method. It automatically finds appropriate transformers for the fields of jsonlines file. The parameter 'limit' means how many objects will be used to detect the right transformers.

In [19]:
datapot.detect(data_trns, limit=100)

DataPot class instance
 - number of features without transformation: 5
 - number of new features: 13
features to transform: 
	('merchant_id', [SVDOneHotTransformer, NumericTransformer])
	('latitude', [NumericTransformer])
	('longitude', [NumericTransformer])
	('real_transaction_dttm', [TimestampTransformer])
	('record_date', [TimestampTransformer])

In [20]:
t0 = time.time()
datapot.fit(data_trns, verbose=True)
print('fit time:', time.time()-t0)

fit transformers...
fit: ('merchant_id', [SVDOneHotTransformer, NumericTransformer])
fit: ('latitude', [NumericTransformer])
fit: ('longitude', [NumericTransformer])
fit: ('real_transaction_dttm', [TimestampTransformer])
fit: ('record_date', [TimestampTransformer])
fit transformers...OK
num of new features: 23
fit time: 4.280209302902222


In [21]:
datapot

DataPot class instance
 - number of features without transformation: 5
 - number of new features: 23
features to transform: 
	('merchant_id', [SVDOneHotTransformer, NumericTransformer])
	('latitude', [NumericTransformer])
	('longitude', [NumericTransformer])
	('real_transaction_dttm', [TimestampTransformer])
	('record_date', [TimestampTransformer])

Let's **remove** the SVDOneHotTransformer

In [22]:
datapot.remove_transformer('merchant_id', 0)

In [23]:
t0 = time.time()
df_trns = datapot.transform(data_trns)
print('transform time:', time.time()-t0)

transform time: 39.660483837127686


In [24]:
df_trns.head()

Unnamed: 0,merchant_id,latitude,longitude,real_transaction_dttm_timestamp_unixtime,real_transaction_dttm_timestamp_week_day,real_transaction_dttm_timestamp_month_day,real_transaction_dttm_timestamp_hour,real_transaction_dttm_timestamp_minute,record_date_timestamp_unixtime,record_date_timestamp_week_day,record_date_timestamp_month_day,record_date_timestamp_hour,record_date_timestamp_minute
0,178.0,0.0,0.0,1491633000.0,5.0,8.0,9.0,34.0,1491633000.0,5.0,8.0,9.0,30.0
1,178.0,55.055996,82.912991,1491663000.0,5.0,8.0,17.0,49.0,1491663000.0,5.0,8.0,17.0,54.0
2,178.0,0.0,0.0,1491633000.0,5.0,8.0,9.0,34.0,1491633000.0,5.0,8.0,9.0,31.0
3,178.0,55.056034,82.912734,1491663000.0,5.0,8.0,17.0,49.0,1491663000.0,5.0,8.0,17.0,43.0
4,178.0,55.056034,82.912734,1491663000.0,5.0,8.0,17.0,49.0,1491663000.0,5.0,8.0,17.0,45.0


## Bag of Words Meets Bags of Popcorn

### Usage example for unstructured textual bzip2-compressed data

datapot.fit method subsamples the data to detect language and choose corresponding stopwords and stemming.

For each review datapot.transform generates an SVD-compressed 12-dimensional tfidf-vector representation.


In [1]:
import datapot as dp
from datapot.datasets import fetch_imdb

In [4]:
fetch_imdb()
data_imdb = bz2.BZ2File('data/imdb.jsonlines.bz2')
datapot_imdb = dp.DataPot()

In [6]:
t0 = time.time()
datapot_imdb.detect(data_imdb)
print('detect time:', time.time()-t0)
datapot_imdb

detect time: 0.048979997634887695


DataPot class instance
 - number of features without transformation: 3
 - number of new features: Unknown
features to transform: 
	('sentiment', [SVDOneHotTransformer, NumericTransformer])
	('review', [TfidfTransformer])

In [7]:
datapot_imdb.remove_transformer('sentiment', 0)

In [9]:
t0 = time.time()
datapot_imdb.fit(data_imdb, verbose=True)

fit transformers...
fit: ('sentiment', [NumericTransformer])
fit: ('review', [TfidfTransformer])
0.395061731338501
2.1329188346862793
2.3272857666015625
2.378422975540161
fit transformers...OK
num of new features: 13


DataPot class instance
 - number of features without transformation: 3
 - number of new features: 13
features to transform: 
	('sentiment', [NumericTransformer])
	('review', [TfidfTransformer])

In [10]:
print('fit time:', time.time()-t0)

fit time: 4.254590272903442


In [11]:
t0 = time.time()
df_imdb = datapot_imdb.transform(data_imdb)
print('transform time:', time.time()-t0)

transform time: 2.475766181945801


In [12]:
df_imdb.head()

Unnamed: 0,sentiment,review_0,review_1,review_2,review_3,review_4,review_5,review_6,review_7,review_8,review_9,review_10,review_11
0,1.0,0.561568,0.037617,0.066563,0.012515,-0.122175,0.083467,-0.055668,-0.003447,-0.08897,0.009006,0.082585,0.007215
1,1.0,0.353305,-0.110642,0.007999,0.003605,0.00619,0.018251,0.037428,-0.01711,-0.031891,-0.063658,-0.073276,-0.016203
2,0.0,0.420766,-0.104584,-0.157112,-0.064523,-0.04874,-0.026885,-0.018352,0.061155,0.163171,0.07315,-0.048731,0.080086
3,0.0,0.536519,-0.069268,-0.124055,-0.074944,-0.019102,-0.005256,0.055192,-0.026643,-0.023917,-0.044321,-0.076855,-0.008434
4,1.0,0.41337,-0.10436,-0.059966,0.053426,-0.063245,-0.009945,-0.040887,-0.021137,0.097938,0.062758,0.002043,0.054385


In [13]:
X = df_imdb.drop(['sentiment'], axis=1)
y = df_imdb['sentiment']

In [14]:
model = xgb.XGBClassifier()
cv_score = cross_val_score(model, X, y, cv=5)
assert all(i > 0.5 for i in cv_score), 'Low score!'
print('Cross-val score:', cv_score)

model.fit(X, y)
fi = model.feature_importances_

print('Feature importance:')
print(*(list(zip(X.columns, fi))), sep='\n')

Cross-val score: [ 0.73726274  0.74525475  0.704       0.75975976  0.74074074]
Feature importance:
('review_0', 0.017569546)
('review_1', 0.079062961)
('review_2', 0.09956076)
('review_3', 0.092240117)
('review_4', 0.071742311)
('review_5', 0.058565155)
('review_6', 0.038067348)
('review_7', 0.14055637)
('review_8', 0.15812592)
('review_9', 0.035139091)
('review_10', 0.060029283)
('review_11', 0.14934114)


## Job Salary Prediction

### Usage example for unstructured textual bzip2-compressed data


In [2]:
from datapot.datasets import fetch_job_salary

fetch_job_salary()
data_job = bz2.BZ2File('data/job.jsonlines.bz2')
datapot_job = dp.DataPot()

In [3]:
t0 = time.time()
datapot_job.detect(data_job)
print('detect time:', time.time()-t0)
datapot_job

detect time: 0.0410611629486084


DataPot class instance
 - number of features without transformation: 9
 - number of new features: Unknown
features to transform: 
	('Id', [NumericTransformer])
	('FullDescription', [TfidfTransformer])
	('ContractType', [SVDOneHotTransformer])
	('ContractTime', [SVDOneHotTransformer])
	('Category', [SVDOneHotTransformer])
	('SalaryNormalized', [NumericTransformer])

In [4]:
t0 = time.time()
datapot_job.fit(data_job, verbose=True)
print('fit time:', time.time()-t0)

fit transformers...
fit: ('Id', [NumericTransformer])
fit: ('FullDescription', [TfidfTransformer])
0.17565703392028809
1.0644011497497559
1.2400639057159424
1.2862319946289062
fit: ('ContractType', [SVDOneHotTransformer])
fit: ('ContractTime', [SVDOneHotTransformer])
fit: ('Category', [SVDOneHotTransformer])
fit: ('SalaryNormalized', [NumericTransformer])
fit transformers...OK
num of new features: 38
fit time: 1.9022538661956787


In [5]:
t0 = time.time()
df_job = datapot_job.transform(data_job)
print('transform time:', time.time()-t0)

transform time: 1.5304019451141357


In [8]:
print(df_job.columns)
print(df_job.shape)
df_job.head()

Index(['Id', 'FullDescription_0', 'FullDescription_1', 'FullDescription_2',
       'FullDescription_3', 'FullDescription_4', 'FullDescription_5',
       'FullDescription_6', 'FullDescription_7', 'FullDescription_8',
       'FullDescription_9', 'FullDescription_10', 'FullDescription_11',
       'ContractType_None', 'ContractType_full_time', 'ContractType_part_time',
       'ContractTime_permanent', 'ContractTime_None', 'ContractTime_contract',
       'Category_Engineering Jobs', 'Category_HR & Recruitment Jobs',
       'Category_Accounting & Finance Jobs',
       'Category_Healthcare & Nursing Jobs', 'Category_Other/General Jobs',
       'Category_Hospitality & Catering Jobs', 'Category_IT Jobs',
       'Category_Customer Services Jobs', 'Category_Travel Jobs',
       'Category_Sales Jobs', 'Category_Manufacturing Jobs',
       'Category_Teaching Jobs', 'Category_Creative & Design Jobs',
       'Category_Trade & Construction Jobs', 'Category_Property Jobs',
       'Category_Admin Jobs',

Unnamed: 0,Id,FullDescription_0,FullDescription_1,FullDescription_2,FullDescription_3,FullDescription_4,FullDescription_5,FullDescription_6,FullDescription_7,FullDescription_8,...,Category_Sales Jobs,Category_Manufacturing Jobs,Category_Teaching Jobs,Category_Creative & Design Jobs,Category_Trade & Construction Jobs,Category_Property Jobs,Category_Admin Jobs,Category_Legal Jobs,Category_Retail Jobs,SalaryNormalized
0,12612628.0,0.061321,-0.001374,0.024432,-0.006674,-0.005117,-0.020105,0.049415,-0.094435,0.006445,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25000.0
1,12612830.0,0.321996,-0.064041,0.128569,-0.027485,0.006713,-0.058885,0.030574,-0.144783,0.015681,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30000.0
2,12612844.0,0.227718,-0.015093,0.082972,-0.050471,-0.018522,-0.055125,0.033922,-0.095247,0.013809,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30000.0
3,12613049.0,0.046773,-0.00659,0.025494,-0.024585,-0.018206,-0.022787,0.024405,-0.102111,-0.011417,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27500.0
4,12613647.0,0.033728,-0.004213,0.018586,-0.011409,-0.014566,-0.013282,0.039354,-0.06989,0.011876,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25000.0


In [7]:
X_job = df_job.drop(['SalaryNormalized', 'Id'], axis=1)
y_job = pd.qcut(df_job['SalaryNormalized'].values, q=2, labels=[0,1]).ravel()

model = xgb.XGBClassifier()
cv_score_job = cross_val_score(model, X_job, y_job, cv=5)
print('Cross-val score:', cv_score_job)
assert all(i > 0.5 for i in cv_score_job), 'Low score!'

model.fit(X_job, y_job)
fi_job = model.feature_importances_

print('Feature importance:')
print(*(list(zip(X_job.columns, fi_job))), sep='\n')


Cross-val score: [ 0.75561097  0.81296758  0.7625      0.71679198  0.74686717]
Feature importance:
('FullDescription_0', 0.096563011)
('FullDescription_1', 0.14075287)
('FullDescription_2', 0.09492635)
('FullDescription_3', 0.068739772)
('FullDescription_4', 0.029459901)
('FullDescription_5', 0.057283141)
('FullDescription_6', 0.062193125)
('FullDescription_7', 0.13420622)
('FullDescription_8', 0.10474632)
('FullDescription_9', 0.032733224)
('FullDescription_10', 0.065466449)
('FullDescription_11', 0.049099836)
('ContractType_None', 0.0032733225)
('ContractType_full_time', 0.021276595)
('ContractType_part_time', 0.0098199677)
('ContractTime_permanent', 0.0032733225)
('ContractTime_None', 0.0)
('ContractTime_contract', 0.0)
('Category_Engineering Jobs', 0.0)
('Category_HR & Recruitment Jobs', 0.0)
('Category_Accounting & Finance Jobs', 0.0)
('Category_Healthcare & Nursing Jobs', 0.01309329)
('Category_Other/General Jobs', 0.0)
('Category_Hospitality & Catering Jobs', 0.0)
('Category_IT 