In [66]:
from faker import Faker
import faker
import random
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics, model_selection
from sklearn.externals import joblib

In [9]:
class_skips = ['automotive', 'barcode', 'color', 'company', 'date_time', 'job', 'lorem', 'misc', 'profile', 'python']
method_skips = ['ascii_company_email',
                'ascii_email',
                'ascii_free_email',
                'ascii_safe_email',
                'bban',
                'bank_country',
                'bothify',
                'building_number',
                'city_suffix',
                'company_email',
                'country_code',
                'currency_name',
                'currency_code',
                'street_address',
                'credit_card_number',
                'credit_card_expire',
                'credit_card_full',
                'credit_card_provider',
                'credit_card_security_code',
                'cryptocurrency',
                'cryptocurrency_code',
                'cryptocurrency_name',
                'currency',
                'domain_word',
                'domain_name',
                'file_extension',
                'file_name',
                'first_name_female',
                'first_name_male',
                'first_name',
                'free_email',
                'free_email_domain',
                'geo_coordinate',
                'hexify',
                'iban',
                'image_url',
                'isbn10',
                'isbn13',
                'ipv4_network_class',
                'ipv4_private',
                'ipv4_public',
                'last_name_female',
                'last_name_male',
                'last_name',
                'latitude',
                'longitude',
                'lexify',
                'linux_processor',
                'linux_platform_token',
                'mac_platform_token',
                'mac_processor',
                'mime_type',
                'msisdn',
                'name_female',
                'name_male',
                'numerify',
                'postcode',
                'prefix',
                'prefix_male',
                'prefix_female',
                'random_digit',
                'random_digit_not_null',
                'random_digit_not_null_or_empty',
                'random_digit_or_empty',
                'random_element',
                'random_int',
                'random_letter',
                'random_lowercase_letter',
                'random_number',
                'random_sample',
                'random_sample_unique',
                'random_uppercase_letter',
                'randomize_nb_elements',
                'safe_email',
                'slug',
                'street_name',
                'street_suffix',
                'suffix',
                'suffix_female',
                'suffix_male',
                'tld',
                'uri',
                'unix_device',
                'unix_partition',
                'uri_extension',
                'uri_page',
                'windows_platform_token',
                'safari',
                'firefox',
                'chrome',
                'opera',
                'internet_explorer',
                'user_name'
               ]

def faker_examples():
    provider_class = []
    providers = []
    examples = []
    f = Faker()
    for provider in dir(faker.providers):
        num = random.randint(15000,20000)
        if provider[0].islower():
            if provider in class_skips:
                continue
            try:
                for fake in dir(getattr(faker.providers, provider).Provider):
                    if fake in method_skips:
                        continue
                    if fake[0].islower():
                        for i in range(num):
                            try:
                                examples.append(str(getattr(f, fake)()))
                                providers.append(fake)
                                provider_class.append(provider)
                            except Exception as e:
                                continue
            except AttributeError:
                continue
    return provider_class, providers, examples

In [10]:
provider_class, providers, examples = faker_examples()

In [185]:
def words_and_char_bigrams(text):
    yield text
    for i in range(len(text)-2):
         yield text[i:i+2]

In [188]:
text_clf = Pipeline([('vect', CountVectorizer(analyzer=words_and_char_bigrams)),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LogisticRegression())
                    ])

In [189]:
X_train, X_test, y_train, y_test = train_test_split(examples, providers, 
                                                    test_size=0.33, random_state=919)

In [190]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer=<function words_and_char_bigrams at 0x10b083950>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preproc...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [191]:
predicted = text_clf.predict(X_test)
np.mean(predicted == y_test)

0.9860149491054325

In [225]:
# testers = ['121-71-8228']
testers = ['https://www.google.com', 'https://www.facebook.com/login', 'http://docs.readthedocs.io', 'https://bleacherreport.com/mlb/yankees']
# testers = ['michaelahirsch@gmail.com', 'lh685@georgetown.edu', 'michael.hirsch@elastic.co', 'a@queryunderstanding.ai']
#testers = ['364 south 1st street, brooklyn, ny, USA, 10012', '45 west 27th street', '2167 33rd st, apt 4c, astoria, ny, 11105', '1 world trade center, ny, ny', ]
# testers = ['london', 'new york', 'paris', 'reno']
# testers = ['Florida', 'New York', 'Illinois', 'New Hampshire', 'California', 'Texas', 'tennessee', 'indiana', 'kentucky']
# testers = ['/usr/bin/sudo', '/usr/lib/jvm/java-8-oracle/jre/bin/java', '/usr/lib/jvm/java-1.8-openjdk/jre/bin/java', '/bin/cat', '/bin/bash', '/usr/bin/env', '/usr/bin/python2.7', '/usr/bin/python3.5']
# testers = ['michael hirsch', 'christopher serna', 'linda huber', 'jenna hirsch', 'steve dodson']
classes = []
for tester in testers:
    X = text_clf.predict_proba([tester])[0]
    Y = text_clf.classes_
    xy = [{y: x} for x, y in sorted(zip(X,Y), reverse=True)]
    for th in xy[:3]:
        classes.append(th)

for t in testers:
    print(t)
for aa in sorted(classes, key=lambda x: list(x.values())[0], reverse=True)[:6]:
    print(aa)

https://www.google.com
https://www.facebook.com/login
http://docs.readthedocs.io
https://bleacherreport.com/mlb/yankees
{'url': 0.9965090533389461}
{'url': 0.9718786787305261}
{'url': 0.9249570250106766}
{'url': 0.5510485766521571}
{'file_path': 0.436548769015368}
{'file_path': 0.061472157282970535}


In [193]:
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

     address       1.00      1.00      1.00      5187
        city       0.99      0.99      0.99      5130
     country       0.99      1.00      0.99      5049
       email       1.00      1.00      1.00      6184
   file_path       1.00      1.00      1.00      5343
        ipv4       0.98      1.00      0.99      6141
        ipv6       1.00      1.00      1.00      6076
 mac_address       1.00      1.00      1.00      6126
        name       1.00      0.99      1.00      5729
phone_number       0.96      0.83      0.89      4975
         ssn       0.89      0.97      0.93      5882
    uri_path       1.00      1.00      1.00      6174
         url       1.00      1.00      1.00      6114
  user_agent       1.00      1.00      1.00      5761

 avg / total       0.99      0.99      0.99     79871



In [143]:
metrics.confusion_matrix(y_test, predicted)

array([[5187,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0],
       [   0, 3429,  510,    0,    0,    0,    0,    0, 1091,    0,    0,
         100,    0,    0],
       [   0,  569, 3300,    0,    0,    0,    0,    0,  927,    0,    0,
         253,    0,    0],
       [   0,    0,    0, 6184,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0],
       [   0,    0,    0,    0, 5325,    0,    0,    0,    0,    0,    0,
          16,    2,    0],
       [   0,    0,    0,    0,    0, 6141,    0,    0,    0,    0,    0,
           0,    0,    0],
       [   0,    0,    0,    0,    0,    0, 6076,    0,    0,    0,    0,
           0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0, 6126,    0,    0,    0,
           0,    0,    0],
       [   0,  762,  356,    0,    0,    0,    0,    0, 4608,    0,    0,
           3,    0,    0],
       [   0,    0,    0,    0,    0,  117,    0,    0,    0, 4022,  836,
           0,    

In [57]:
filename = 'faker.classifier'
joblib.dump(text_clf, filename)

['faker.classifier']

In [58]:
new_clf = joblib.load(filename)

In [67]:
new_pred  = model_selection.cross_val_predict(new_clf, examples, providers, cv=10)

In [68]:
metrics.accuracy_score(providers, new_pred)

0.9535885898913775

In [78]:
print(metrics.classification_report(providers, new_pred))

              precision    recall  f1-score   support

     address       1.00      1.00      1.00     15430
        city       0.99      0.61      0.76     15430
     country       1.00      1.00      1.00     15430
       email       1.00      1.00      1.00     18615
   file_path       1.00      1.00      1.00     15905
        ipv4       0.92      0.98      0.95     18615
        ipv6       0.72      1.00      0.83     18615
 mac_address       1.00      1.00      1.00     18615
        name       1.00      0.99      1.00     17356
phone_number       0.99      0.80      0.88     15304
         ssn       0.89      0.92      0.91     17914
    uri_path       1.00      1.00      1.00     18615
         url       1.00      1.00      1.00     18615
  user_agent       1.00      1.00      1.00     17572

 avg / total       0.96      0.95      0.95    242031



In [186]:
mike = CountVectorizer(analyzer=words_and_char_bigrams)
mike.fit(X_test)

CountVectorizer(analyzer=<function words_and_char_bigrams at 0x10b083950>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None,
        stop_words=None, strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)

In [187]:
mike.vocabulary_

{'http://walton.com/': 60130,
 'ht': 58988,
 'tt': 68305,
 'tp': 68255,
 'p:': 66384,
 ':/': 35028,
 '//': 1630,
 '/w': 6690,
 'wa': 68582,
 'al': 52092,
 'lt': 65338,
 'to': 68224,
 'on': 66328,
 'n.': 66056,
 '.c': 1608,
 'co': 55204,
 'om': 66319,
 '29.173.249.7': 18270,
 '29': 18267,
 '9.': 33264,
 '.1': 1597,
 '17': 13446,
 '73': 29063,
 '3.': 18776,
 '.2': 1598,
 '24': 16813,
 '49': 23096,
 'Lake Madisonville': 40293,
 'La': 40110,
 'ak': 52089,
 'ke': 64728,
 'e ': 56487,
 ' M': 49,
 'Ma': 40755,
 'ad': 51898,
 'di': 56368,
 'is': 63981,
 'so': 67628,
 'nv': 66240,
 'vi': 68496,
 'il': 63951,
 'll': 65277,
 'http://www.fisher.com/': 60508,
 'ww': 68952,
 'w.': 68570,
 '.f': 1611,
 'fi': 58389,
 'sh': 67469,
 'he': 58836,
 'er': 57360,
 'r.': 66819,
 '146.63.45.38': 12421,
 '14': 12121,
 '46': 22464,
 '6.': 25982,
 '.6': 1602,
 '63': 26649,
 '.4': 1600,
 '45': 22272,
 '5.': 23607,
 '.3': 1599,
 '252.217.5.251': 17385,
 '25': 17260,
 '52': 24022,
 '2.': 15026,
 '21': 15498,
 '7.':