# DataPot Usage Examples

In [1]:
import datapot as dp
from datapot import datasets

In [2]:
import pandas as pd
from __future__ import print_function
import sys
import bz2
import time
import xgboost as xgb
from sklearn.model_selection import cross_val_score

import datapot as dp
from datapot.utils import csv_to_jsonlines

## Dataset with timestamp features extraction.
Convert CSV file to JSON lines

In [3]:
transactions = pd.read_csv('../data/transactions.csv')
transactions.head()

Unnamed: 0,merchant_id,latitude,longitude,real_transaction_dttm,record_date
0,178,0.0,0.0,9:34:47,9:30:36
1,178,55.055995,82.912991,17:49:50,17:54:24
2,178,0.0,0.0,9:34:47,9:31:22
3,178,55.056034,82.912734,17:49:50,17:43:01
4,178,55.056034,82.912734,17:49:50,17:45:17


Creating the DataPot object.

In [4]:
datapot = dp.DataPot()

In [5]:
from datapot.utils import csv_to_jsonlines

csv_to_jsonlines('../data/transactions.csv', '../data/transactions.jsonlines')

In [6]:
data_trns = open('../data/transactions.jsonlines')
data_trns.readline()

'{"merchant_id":178,"latitude":0.0,"longitude":0.0,"real_transaction_dttm":"9:34:47","record_date":"9:30:36"}\n'

Let's call the fit method. It automatically finds appropriate transformers for the fields of jsonlines file. The parameter 'limit' means how many objects will be used to detect the right transformers.

In [7]:
datapot.detect(data_trns, limit=100)

DataPot class instance
 - number of features without transformation: 5
 - number of new features: 13
features to transform: 
	('merchant_id', [SVDOneHotTransformer, NumericTransformer])
	('latitude', [NumericTransformer])
	('longitude', [NumericTransformer])
	('real_transaction_dttm', [TimestampTransformer])
	('record_date', [TimestampTransformer])

In [8]:
t0 = time.time()
datapot.fit(data_trns, verbose=True)
print('fit time:', time.time()-t0)

fit transformers...
fit: ('merchant_id', [SVDOneHotTransformer, NumericTransformer])
fit: ('latitude', [NumericTransformer])
fit: ('longitude', [NumericTransformer])
fit: ('real_transaction_dttm', [TimestampTransformer])
fit: ('record_date', [TimestampTransformer])
fit transformers...OK
num of new features: 23
fit time: 4.39128303527832


In [9]:
datapot

DataPot class instance
 - number of features without transformation: 5
 - number of new features: 23
features to transform: 
	('merchant_id', [SVDOneHotTransformer, NumericTransformer])
	('latitude', [NumericTransformer])
	('longitude', [NumericTransformer])
	('real_transaction_dttm', [TimestampTransformer])
	('record_date', [TimestampTransformer])

Let's **remove** the SVDOneHotTransformer

In [10]:
datapot.remove_transformer('merchant_id', 0)

DataPot class instance
 - number of features without transformation: 5
 - number of new features: 23
features to transform: 
	('merchant_id', [NumericTransformer])
	('latitude', [NumericTransformer])
	('longitude', [NumericTransformer])
	('real_transaction_dttm', [TimestampTransformer])
	('record_date', [TimestampTransformer])

In [11]:
t0 = time.time()
df_trns = datapot.transform(data_trns)
print('transform time:', time.time()-t0)

transform time: 39.0145058631897


  return pd.DataFrame(data=np.hstack(columns), columns=names).convert_objects(convert_numeric=True)


In [12]:
df_trns.head()

Unnamed: 0,merchant_id,latitude,longitude,real_transaction_dttm_timestamp_unixtime,real_transaction_dttm_timestamp_week_day,real_transaction_dttm_timestamp_month_day,real_transaction_dttm_timestamp_hour,real_transaction_dttm_timestamp_minute,record_date_timestamp_unixtime,record_date_timestamp_week_day,record_date_timestamp_month_day,record_date_timestamp_hour,record_date_timestamp_minute
0,178.0,0.0,0.0,1496299000.0,3.0,1.0,9.0,34.0,1496299000.0,3.0,1.0,9.0,30.0
1,178.0,55.055996,82.912991,1496329000.0,3.0,1.0,17.0,49.0,1496329000.0,3.0,1.0,17.0,54.0
2,178.0,0.0,0.0,1496299000.0,3.0,1.0,9.0,34.0,1496299000.0,3.0,1.0,9.0,31.0
3,178.0,55.056034,82.912734,1496329000.0,3.0,1.0,17.0,49.0,1496328000.0,3.0,1.0,17.0,43.0
4,178.0,55.056034,82.912734,1496329000.0,3.0,1.0,17.0,49.0,1496328000.0,3.0,1.0,17.0,45.0


## Bag of Words Meets Bags of Popcorn

### Usage example for unstructured textual bzip2-compressed data

datapot.fit method subsamples the data to detect language and choose corresponding stopwords and stemming.

For each review datapot.transform generates an SVD-compressed 12-dimensional tfidf-vector representation.


In [13]:
import datapot as dp
from datapot import datasets

Load data from datapot.datasets

In [14]:
data_imdb = datasets.load_imdb() 

Or load directly from file

In [15]:
data_imdb = bz2.BZ2File('data/imdb.jsonlines.bz2')

In [16]:
datapot_imdb = dp.DataPot()

In [17]:
t0 = time.time()
datapot_imdb.detect(data_imdb)
print('detect time:', time.time()-t0)
datapot_imdb

detect time: 0.040107011795043945


DataPot class instance
 - number of features without transformation: 3
 - number of new features: Unknown
features to transform: 
	('id', [NumericTransformer])
	('sentiment', [SVDOneHotTransformer, NumericTransformer])
	('review', [TfidfTransformer])

In [18]:
datapot_imdb.remove_transformer('sentiment', 0)

DataPot class instance
 - number of features without transformation: 3
 - number of new features: Unknown
features to transform: 
	('id', [NumericTransformer])
	('sentiment', [NumericTransformer])
	('review', [TfidfTransformer])

In [19]:
t0 = time.time()
datapot_imdb.fit(data_imdb, verbose=True)

fit transformers...
fit: ('id', [NumericTransformer])
fit: ('sentiment', [NumericTransformer])
fit: ('review', [TfidfTransformer])
fit transformers...OK
num of new features: 14


DataPot class instance
 - number of features without transformation: 3
 - number of new features: 14
features to transform: 
	('id', [NumericTransformer])
	('sentiment', [NumericTransformer])
	('review', [TfidfTransformer])

In [20]:
print('fit time:', time.time()-t0)

fit time: 3.7446699142456055


In [21]:
t0 = time.time()
df_imdb = datapot_imdb.transform(data_imdb)
print('transform time:', time.time()-t0)

transform time: 2.948928117752075


  return pd.DataFrame(data=np.hstack(columns), columns=names).convert_objects(convert_numeric=True)


In [22]:
df_imdb.head()

Unnamed: 0,id,sentiment,review_0,review_1,review_2,review_3,review_4,review_5,review_6,review_7,review_8,review_9,review_10,review_11
0,58148.0,1.0,0.033797,0.066174,0.045831,0.0,0.030886,0.117464,0.039373,0.034706,0.013286,0.045712,0.10478,0.013761
1,23819.0,1.0,0.063158,0.0,0.021557,0.005732,0.019623,0.02175,0.042203,0.076573,0.014431,0.0,0.013012,0.0
2,77593.0,0.0,0.096245,0.018315,0.00315,0.007268,0.0,0.0,0.020369,0.0,0.172927,0.005657,0.0,0.0
3,36304.0,0.0,0.125528,0.035612,0.011671,0.006704,0.0,0.027897,0.082309,0.05404,0.054605,0.00145,0.00777,0.0
4,94958.0,1.0,0.063425,0.000287,0.010751,0.039666,0.000372,0.034857,0.001859,0.01649,0.118161,0.045672,0.021046,0.001137


In [23]:
X = df_imdb.drop(['sentiment'], axis=1)
y = df_imdb['sentiment']

In [24]:
model = xgb.XGBClassifier()
cv_score = cross_val_score(model, X, y, cv=5)
assert all(i > 0.5 for i in cv_score), 'Low score!'
print('Cross-val score:', cv_score)

model.fit(X, y)
fi = model.feature_importances_

print('Feature importance:')
print(*(list(zip(X.columns, fi))), sep='\n')

Cross-val score: [ 0.72527473  0.73126873  0.731       0.73773774  0.71371371]
Feature importance:
('id', 0.17647059)
('review_0', 0.055028465)
('review_1', 0.066413663)
('review_2', 0.055028465)
('review_3', 0.066413663)
('review_4', 0.062618598)
('review_5', 0.05882353)
('review_6', 0.060721062)
('review_7', 0.081593931)
('review_8', 0.13851993)
('review_9', 0.064516127)
('review_10', 0.062618598)
('review_11', 0.051233396)


## Job Salary Prediction

### Usage example for unstructured textual bzip2-compressed data


In [26]:
from datapot import datasets

data_job = datasets.load_job_salary()

# Or load from file%: 
# data_job = bz2.BZ2File('datapot/data/job.jsonlines.bz2')

In [27]:
datapot_job = dp.DataPot()

In [28]:
t0 = time.time()
datapot_job.detect(data_job)
print('detect time:', time.time()-t0)
datapot_job

detect time: 0.03791189193725586


DataPot class instance
 - number of features without transformation: 9
 - number of new features: Unknown
features to transform: 
	('Id', [NumericTransformer])
	('FullDescription', [TfidfTransformer])
	('ContractType', [SVDOneHotTransformer])
	('ContractTime', [SVDOneHotTransformer])
	('Category', [SVDOneHotTransformer])
	('SalaryNormalized', [NumericTransformer])

In [29]:
t0 = time.time()
datapot_job.fit(data_job, verbose=True)
print('fit time:', time.time()-t0)

fit transformers...
fit: ('Id', [NumericTransformer])
fit: ('FullDescription', [TfidfTransformer])
fit: ('ContractType', [SVDOneHotTransformer])
fit: ('ContractTime', [SVDOneHotTransformer])
fit: ('Category', [SVDOneHotTransformer])
fit: ('SalaryNormalized', [NumericTransformer])
fit transformers...OK
num of new features: 38
fit time: 2.2404630184173584


In [30]:
t0 = time.time()
df_job = datapot_job.transform(data_job)
print('transform time:', time.time()-t0)

transform time: 1.7749860286712646


  return pd.DataFrame(data=np.hstack(columns), columns=names).convert_objects(convert_numeric=True)


In [31]:
print(df_job.columns)
print(df_job.shape)
df_job.head()

Index(['Id', 'FullDescription_0', 'FullDescription_1', 'FullDescription_2',
       'FullDescription_3', 'FullDescription_4', 'FullDescription_5',
       'FullDescription_6', 'FullDescription_7', 'FullDescription_8',
       'FullDescription_9', 'FullDescription_10', 'FullDescription_11',
       'ContractType_None', 'ContractType_full_time', 'ContractType_part_time',
       'ContractTime_permanent', 'ContractTime_None', 'ContractTime_contract',
       'Category_Engineering Jobs', 'Category_HR & Recruitment Jobs',
       'Category_Accounting & Finance Jobs',
       'Category_Healthcare & Nursing Jobs', 'Category_Other/General Jobs',
       'Category_Hospitality & Catering Jobs', 'Category_IT Jobs',
       'Category_Customer Services Jobs', 'Category_Travel Jobs',
       'Category_Sales Jobs', 'Category_Manufacturing Jobs',
       'Category_Teaching Jobs', 'Category_Creative & Design Jobs',
       'Category_Trade & Construction Jobs', 'Category_Property Jobs',
       'Category_Admin Jobs',

Unnamed: 0,Id,FullDescription_0,FullDescription_1,FullDescription_2,FullDescription_3,FullDescription_4,FullDescription_5,FullDescription_6,FullDescription_7,FullDescription_8,...,Category_Sales Jobs,Category_Manufacturing Jobs,Category_Teaching Jobs,Category_Creative & Design Jobs,Category_Trade & Construction Jobs,Category_Property Jobs,Category_Admin Jobs,Category_Legal Jobs,Category_Retail Jobs,SalaryNormalized
0,12612628.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.144199,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25000.0
1,12612830.0,0.004768,0.0,0.0,0.009348,0.010136,0.016442,0.0,0.226506,0.017381,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30000.0
2,12612844.0,0.037061,0.000533,0.0,0.0,0.00346,0.000891,0.0,0.103209,0.014262,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30000.0
3,12613049.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.137301,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27500.0
4,12613647.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111573,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25000.0


In [32]:
X_job = df_job.drop(['SalaryNormalized', 'Id'], axis=1)
y_job = pd.qcut(df_job['SalaryNormalized'].values, q=2, labels=[0,1]).ravel()

model = xgb.XGBClassifier()
cv_score_job = cross_val_score(model, X_job, y_job, cv=5)
print('Cross-val score:', cv_score_job)
assert all(i > 0.5 for i in cv_score_job), 'Low score!'

model.fit(X_job, y_job)
fi_job = model.feature_importances_

print('Feature importance:')
print(*(list(zip(X_job.columns, fi_job))), sep='\n')


Cross-val score: [ 0.73566085  0.8478803   0.73        0.73934837  0.72932331]
Feature importance:
('FullDescription_0', 0.050541516)
('FullDescription_1', 0.13718411)
('FullDescription_2', 0.074007221)
('FullDescription_3', 0.045126352)
('FullDescription_4', 0.03068592)
('FullDescription_5', 0.090252705)
('FullDescription_6', 0.11371841)
('FullDescription_7', 0.093862817)
('FullDescription_8', 0.070397109)
('FullDescription_9', 0.079422385)
('FullDescription_10', 0.095667869)
('FullDescription_11', 0.05595668)
('ContractType_None', 0.0)
('ContractType_full_time', 0.016245488)
('ContractType_part_time', 0.016245488)
('ContractTime_permanent', 0.010830325)
('ContractTime_None', 0.0036101083)
('ContractTime_contract', 0.0)
('Category_Engineering Jobs', 0.0)
('Category_HR & Recruitment Jobs', 0.0)
('Category_Accounting & Finance Jobs', 0.0)
('Category_Healthcare & Nursing Jobs', 0.0)
('Category_Other/General Jobs', 0.0)
('Category_Hospitality & Catering Jobs', 0.0)
('Category_IT Jobs', 0.

In [33]:
# README Example:

In [36]:
import datapot as dp 
datapot = dp.DataPot()

In [35]:
f = open('../data/job.jsonlines', 'r')

In [37]:
datapot.detect(f, limit=100)

DataPot class instance
 - number of features without transformation: 9
 - number of new features: Unknown
features to transform: 
	('Id', [NumericTransformer])
	('FullDescription', [TfidfTransformer])
	('ContractType', [SVDOneHotTransformer])
	('ContractTime', [SVDOneHotTransformer])
	('Company', [SVDOneHotTransformer])
	('Category', [SVDOneHotTransformer])
	('SalaryNormalized', [NumericTransformer])

In [39]:
datapot.fit(f)

DataPot class instance
 - number of features without transformation: 9
 - number of new features: 82
features to transform: 
	('Id', [NumericTransformer])
	('FullDescription', [TfidfTransformer])
	('ContractType', [SVDOneHotTransformer])
	('ContractTime', [SVDOneHotTransformer])
	('Company', [SVDOneHotTransformer])
	('Category', [SVDOneHotTransformer])
	('SalaryNormalized', [NumericTransformer])

In [40]:
df = datapot.transform(f)

  return pd.DataFrame(data=np.hstack(columns), columns=names).convert_objects(convert_numeric=True)


In [42]:
datapot

DataPot class instance
 - number of features without transformation: 9
 - number of new features: 82
features to transform: 
	('Id', [NumericTransformer])
	('FullDescription', [TfidfTransformer])
	('ContractType', [SVDOneHotTransformer])
	('ContractTime', [SVDOneHotTransformer])
	('Company', [SVDOneHotTransformer])
	('Category', [SVDOneHotTransformer])
	('SalaryNormalized', [NumericTransformer])

In [41]:
df 

Unnamed: 0,Id,FullDescription_0,FullDescription_1,FullDescription_2,FullDescription_3,FullDescription_4,FullDescription_5,FullDescription_6,FullDescription_7,FullDescription_8,...,Category_Sales Jobs,Category_Manufacturing Jobs,Category_Teaching Jobs,Category_Creative & Design Jobs,Category_Trade & Construction Jobs,Category_Property Jobs,Category_Admin Jobs,Category_Legal Jobs,Category_Retail Jobs,SalaryNormalized
0,12612628.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.150140,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25000.0
1,12612830.0,0.013116,0.000000,0.000000,0.007216,0.010779,0.016545,0.000000,0.221731,0.016948,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30000.0
2,12612844.0,0.040393,0.000186,0.000000,0.000000,0.003482,0.000264,0.000000,0.097974,0.011785,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30000.0
3,12613049.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.142848,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27500.0
4,12613647.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.116838,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25000.0
5,13179816.0,0.000000,0.000000,0.000000,0.004856,0.000000,0.000000,0.000000,0.242765,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25000.0
6,14131336.0,0.000243,0.000000,0.006632,0.005179,0.000000,0.001550,0.000631,0.277994,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,75000.0
7,14663196.0,0.051950,0.008785,0.008139,0.000000,0.007445,0.029633,0.000000,0.130873,0.000572,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22000.0
8,14663197.0,0.047322,0.003395,0.000000,0.013564,0.007465,0.024145,0.000000,0.116203,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23000.0
9,15395797.0,0.022046,0.005672,0.001950,0.000506,0.000000,0.000000,0.019242,0.177823,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,85000.0


In [44]:
df.shape

(2000, 82)