# Complex Data Type Detection Using the Data Labeler Component of Data Profiler

In [1]:
import sys
sys.path.insert(0, '../..')
import os
import dataprofiler as dp
import json
import pandas as pd

## Structured data

In [2]:
# Load the sample CSV through Data Profiler's reader and preview its first rows.
data = dp.Data("../data/structured/aws_honeypot_marx_geo.csv")
df_data = data.data  # the loaded table; previewed with .head() below
df_data.head()

Unnamed: 0,datetime,host,src,proto,type,srcport,destport,srcip,locale,localeabbr,postalcode,latitude,longitude,owner,comment,int_col
0,3/3/13 21:53,groucho-oregon,1032051418.0,TCP,,6000,1433,61.131.218.218,Jiangxi Sheng,36,,28.55,115.9333,,He my polite be object oh change. Consider no ...,9464.0
1,3/3/13 21:57,groucho-oregon,1347834426.0,UDP,,5270,5060,80.86.82.58,,,,51.0,9.0,,,3731.0
2,3/3/13 21:58,groucho-oregon,2947856490.0,TCP,,2489,1080,175.180.184.106,Taipei,,,25.0392,121.525,,Of on affixed civilly moments promise explain ...,3963.0
3,3/3/13 21:58,,,UDP,,43235,1900,,Oregon,OR,97124.0,45.5848,-122.9117,,,1422.0
4,3/3/13 21:58,groucho-singapore,3587648279.0,TCP,,56577,80,213.215.43.23,,,,48.86,2.35,,Affronting everything discretion men now own d...,9271.0


In [3]:
# Disable the text/int/float/order/category/datetime profilers so that,
# as the original note puts it, only the data labeler runs.
disabled_profilers = ("text", "int", "float", "order", "category", "datetime")
profile_options = dp.ProfilerOptions()
profile_options.set(
    {"{}.is_enabled".format(name): False for name in disabled_profilers}
)

# Profile the dataset loaded above with the restricted options.
profile = dp.Profiler(data, profiler_options=profile_options)

  .format(variable_path))
  .format(variable_path))
  .format(variable_path))
  return func(self, *args, **kwargs)
 62%|██████▎   | 10/16 [00:02<00:00,  9.44it/s]



 75%|███████▌  | 12/16 [00:02<00:00, 10.24it/s]



100%|██████████| 16/16 [00:02<00:00,  6.27it/s]


In [4]:
# get the prediction from data profiler
def get_structured_results(results):
    """Summarise a profiler report as a two-column table.

    Parameters
    ----------
    results : mapping
        Report whose ``'data_stats'`` entry maps each column name to a
        mapping that contains a ``'data_label'`` value.

    Returns
    -------
    pandas.DataFrame
        One row per column, with the column name under ``'Column'`` and its
        ``'data_label'`` under ``'Prediction'``, in iteration order of
        ``results['data_stats']``.
    """
    stats = results['data_stats']
    column_names = list(stats)
    labels = [stats[name]['data_label'] for name in column_names]
    return pd.DataFrame({'Column': column_names, 'Prediction': labels})

# Build the report from the profile and print the per-column label predictions.
results = profile.report()    
print(get_structured_results(results))

        Column             Prediction
0     datetime  DATETIME|PHONE_NUMBER
1         host                UNKNOWN
2          src       BAN|PHONE_NUMBER
3        proto                UNKNOWN
4         type                INTEGER
5      srcport                ADDRESS
6     destport                INTEGER
7        srcip                   IPV4
8       locale                UNKNOWN
9   localeabbr                INTEGER
10  postalcode                INTEGER
11    latitude                  FLOAT
12   longitude                  FLOAT
13       owner                   None
14     comment                UNKNOWN
15     int_col                  FLOAT


### Train a data labeler from scratch

In [5]:
# In the retrain CSV the 'comment' column has been renamed to UNKNOWN: per the
# original author's note, the data labeler requires at least one column that
# carries the background label (the note calls it BACKGROUND).
data = dp.Data("../data/structured/aws_honeypot_marx_geo_retrain.csv")

# Shuffle with a fixed seed so the train/test split is identical on every
# Restart & Run All. (Model training itself may still be non-deterministic.)
RANDOM_SEED = 0
split_ratio = 0.2  # fraction of rows held out for testing
df = data.data.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# Compute the boundary once so both slices are guaranteed to agree on it.
split_index = int((1 - split_ratio) * len(df))
data_train = df[:split_index]
data_test = df[split_index:]

# train new data labeler with column names as labels
# exist_ok=True creates the directory only when it is missing, with no separate
# existence check beforehand.
os.makedirs('data_labeler_saved', exist_ok=True)

data_labeler = dp.train_structured_labeler(
    data=data_train,
    save_dirpath="data_labeler_saved",
    epochs=10
)



EPOCH 0 (3s), loss: 4.576456 - acc: 0.102176 - f1_score 0.102176 -- val_f1: 0.043341 - val_precision: 0.030926 - val_recall 0.128533
EPOCH 1 (0s), loss: 3.118322 - acc: 0.159318 - f1_score 0.159318 -- val_f1: 0.057850 - val_precision: 0.053500 - val_recall 0.146838
EPOCH 2 (0s), loss: 2.257035 - acc: 0.277035 - f1_score 0.277035 -- val_f1: 0.114242 - val_precision: 0.116730 - val_recall 0.202152
EPOCH 3 (0s), loss: 1.674754 - acc: 0.491612 - f1_score 0.491612 -- val_f1: 0.217278 - val_precision: 0.229349 - val_recall 0.286557
EPOCH 4 (0s), loss: 1.319068 - acc: 0.626576 - f1_score 0.626576 -- val_f1: 0.256808 - val_precision: 0.354095 - val_recall 0.322086
EPOCH 5 (0s), loss: 1.068586 - acc: 0.712753 - f1_score 0.712753 -- val_f1: 0.264931 - val_precision: 0.500542 - val_recall 0.320535
EPOCH 6 (0s), loss: 0.901625 - acc: 0.755659 - f1_score 0.755659 -- val_f1: 0.294112 - val_precision: 0.536594 - val_recall 0.338628
EPOCH 7 (0s), loss: 0.770896 - acc: 0.788141 - f1_score 0.788141 -- v

In [6]:
# predict with data labeler object
# Updates the profile_options created earlier in the notebook so that it
# carries the in-memory labeler trained above, then profiles the held-out rows.
profile_options.set({'data_labeler.data_labeler_object': data_labeler})
profile = dp.Profiler(data_test, profiler_options=profile_options)

# get the prediction from data profiler
results = profile.report()
print(get_structured_results(results))

  .format(variable_path))
  .format(variable_path))
  .format(variable_path))
  return func(self, *args, **kwargs)
100%|██████████| 16/16 [00:00<00:00, 29.83it/s]


        Column Prediction
0     datetime        src
1         host       host
2          src        src
3        proto      proto
4         type      proto
5      srcport        src
6     destport        src
7        srcip      srcip
8       locale        src
9   localeabbr      proto
10  postalcode        src
11    latitude        src
12   longitude   latitude
13       owner       None
14     UNKNOWN    UNKNOWN
15     int_col    int_col


In [7]:
# predict with data labeler loaded from path
# profile_options is the same object mutated by the previous cell, so it still
# holds the in-memory labeler set there. Clear that entry so this cell really
# exercises loading the saved labeler from disk instead of reusing the object.
# NOTE(review): assumes a set data_labeler_object takes precedence over
# data_labeler_dirpath - confirm against the Data Profiler options docs.
profile_options.set({
    'data_labeler.data_labeler_object': None,
    'data_labeler.data_labeler_dirpath': 'data_labeler_saved',
})
profile = dp.Profiler(data_test, profiler_options=profile_options)

# get the prediction from data profiler
results = profile.report()
print(get_structured_results(results))

  .format(variable_path))
  .format(variable_path))
  .format(variable_path))
  return func(self, *args, **kwargs)
100%|██████████| 16/16 [00:00<00:00, 31.06it/s]

        Column Prediction
0     datetime        src
1         host       host
2          src        src
3        proto      proto
4         type      proto
5      srcport        src
6     destport        src
7        srcip      srcip
8       locale        src
9   localeabbr      proto
10  postalcode        src
11    latitude        src
12   longitude   latitude
13       owner       None
14     UNKNOWN    UNKNOWN
15     int_col    int_col





## Unstructured data

In [8]:
# load data
# NOTE: this rebinds `data`, which earlier cells used for the structured CSV.
data = dp.Data("../data/unstructured/email-enron-sample")
print(data.data[0])  # show the first loaded entry



Message-ID: <14332367.1075858794078.JavaMail.evans@thyme>
Date: Mon, 15 Oct 2001 10:51:17 -0700 (PDT)
From: w..white@enron.com
To: john.postlethwaite@enron.com
Subject: RE: PGE
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: White, Stacey W. </O=ENRON/OU=NA/CN=RECIPIENTS/CN=SWHITE>
X-To: Postlethwaite, John </O=ENRON/OU=NA/CN=RECIPIENTS/CN=JPOSTLE>
X-cc: 
X-bcc: 
X-Folder: \SWHITE (Non-Privileged)\Sent Items
X-Origin: White-S
X-FileName: SWHITE (Non-Privileged).pst

All I ever saw was the e-mail from the Office of the Chair.

Stacey

 -----Original Message-----
From:   Postlethwaite, John  
Sent:   Monday, October 15, 2001 12:47 PM
To:     White, Stacey W.
Subject:        PGE

Have you heard any more regarding the PGE sale? It's funny, here nobody is talking about it. I guess that means that it's no big deal here, but you think they would have send something.

By the way, when I saw Casey, that girl is getting skinnier by the day. 

In [9]:
# Create an unstructured-text labeler.
# NOTE: this rebinds `data_labeler`, which earlier cells used for the trained structured labeler.
data_labeler = dp.DataLabeler(labeler_type='unstructured')

# make predictions and get labels per character
predictions = data_labeler.predict(data)

# display results
print(predictions['pred'])



[array([ 1.,  1.,  1., ..., 16.,  1.,  1.])]


In [10]:
# Switch the postprocessor to word-level argmax and NER-style output, where
# each prediction is (start position, end position, label).
data_labeler.set_params(
    {'postprocessor': {'output_format': 'ner', 'use_word_level_argmax': True}}
)

# Re-run prediction; results now come back as labelled spans.
predictions = data_labeler.predict(data)

# Print each labelled span of the first entry next to its predicted label.
text = data.data[0]
print('\n')
print('=======================Prediction======================\n')
for pred in predictions['pred'][0]:
    start, end, label = pred[0], pred[1], pred[2]
    print('{}: {}'.format(text[start: end], label))
    print('--------------------------------------------------------')




<14332367: QUANTITY
--------------------------------------------------------
evans@thyme>: EMAIL_ADDRESS
--------------------------------------------------------
Mon, 15: DATE
--------------------------------------------------------
Oct 2001 10: DATETIME
--------------------------------------------------------
-0700: TIME
--------------------------------------------------------
white@enron.com: EMAIL_ADDRESS
--------------------------------------------------------
john.postlethwaite@enron.com: EMAIL_ADDRESS
--------------------------------------------------------
7bit: QUANTITY
--------------------------------------------------------
White, Stacey W: PERSON
--------------------------------------------------------
</O=ENRON/OU=NA/CN=RECIPIENTS/CN=SWHITE>: HASH_OR_KEY
--------------------------------------------------------
Postlethwaite, John: PERSON
--------------------------------------------------------
</O=ENRON/OU=NA/CN=RECIPIENTS/CN=JPOSTLE>: HASH_OR_KEY
-----------------------