In [55]:
from faker import Faker
import faker
import random
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.externals import joblib

In [9]:
# Names under ``faker.providers`` that are excluded wholesale: the corpus
# builder skips an entry outright when its name appears here.
class_skips = [
    'automotive', 'barcode', 'color', 'company', 'date_time',
    'job', 'lorem', 'misc', 'profile', 'python',
]

# Individual Provider attribute names that are excluded: the corpus builder
# never samples an attribute whose name appears here.  Order is kept exactly
# as originally written.
method_skips = [
    'ascii_company_email', 'ascii_email', 'ascii_free_email', 'ascii_safe_email',
    'bban', 'bank_country', 'bothify', 'building_number',
    'city_suffix', 'company_email', 'country_code', 'currency_name',
    'currency_code', 'street_address', 'credit_card_number', 'credit_card_expire',
    'credit_card_full', 'credit_card_provider', 'credit_card_security_code', 'cryptocurrency',
    'cryptocurrency_code', 'cryptocurrency_name', 'currency', 'domain_word',
    'domain_name', 'file_extension', 'file_name', 'first_name_female',
    'first_name_male', 'first_name', 'free_email', 'free_email_domain',
    'geo_coordinate', 'hexify', 'iban', 'image_url',
    'isbn10', 'isbn13', 'ipv4_network_class', 'ipv4_private',
    'ipv4_public', 'last_name_female', 'last_name_male', 'last_name',
    'latitude', 'longitude', 'lexify', 'linux_processor',
    'linux_platform_token', 'mac_platform_token', 'mac_processor', 'mime_type',
    'msisdn', 'name_female', 'name_male', 'numerify',
    'postcode', 'prefix', 'prefix_male', 'prefix_female',
    'random_digit', 'random_digit_not_null', 'random_digit_not_null_or_empty', 'random_digit_or_empty',
    'random_element', 'random_int', 'random_letter', 'random_lowercase_letter',
    'random_number', 'random_sample', 'random_sample_unique', 'random_uppercase_letter',
    'randomize_nb_elements', 'safe_email', 'slug', 'street_name',
    'street_suffix', 'suffix', 'suffix_female', 'suffix_male',
    'tld', 'uri', 'unix_device', 'unix_partition',
    'uri_extension', 'uri_page', 'windows_platform_token', 'safari',
    'firefox', 'chrome', 'opera', 'internet_explorer',
    'user_name',
]

def faker_examples(min_samples=15000, max_samples=20000):
    """Build a labelled corpus of synthetic strings from Faker providers.

    Every lowercase-initial name in ``dir(faker.providers)`` that is not in
    ``class_skips`` and that exposes a ``Provider`` attribute is visited.  For
    each lowercase-initial attribute name of that ``Provider`` not listed in
    ``method_skips``, the same-named attribute of a ``Faker()`` instance is
    called repeatedly and ``str()`` of each result is recorded.

    Args:
        min_samples: Inclusive lower bound of the per-provider sample count.
        max_samples: Inclusive upper bound of the per-provider sample count.
            One count is drawn with ``random.randint`` per visited name and
            shared by all methods of that provider.

    Returns:
        A tuple ``(provider_class, providers, examples)`` of three lists of
        equal length: the provider module name, the method name, and the
        generated string for each sample.

    Raises:
        ValueError: Propagated from ``random.randint`` when
            ``min_samples > max_samples``.
    """
    provider_class = []
    providers = []
    examples = []
    f = Faker()
    for provider in dir(faker.providers):
        # Drawn for every entry (even ones skipped below) so that the sequence
        # of values taken from ``random`` matches the previous implementation.
        num = random.randint(min_samples, max_samples)
        if not provider[0].islower() or provider in class_skips:
            continue
        # Entries without a ``Provider`` attribute (helper modules, functions,
        # ...) are not data providers and are skipped.
        provider_cls = getattr(getattr(faker.providers, provider), 'Provider', None)
        if provider_cls is None:
            continue
        for fake in dir(provider_cls):
            if fake in method_skips or not fake[0].islower():
                continue
            for _ in range(num):
                try:
                    value = str(getattr(f, fake)())
                except TypeError:
                    # Not callable, or needs arguments: every retry would fail
                    # the same way, so stop instead of looping ``num`` times.
                    break
                except Exception:
                    # Best effort: drop this one sample and keep sampling.
                    continue
                examples.append(value)
                providers.append(fake)
                provider_class.append(provider)
    return provider_class, providers, examples

In [10]:
# Generate the synthetic corpus: three parallel lists holding, per sample, the
# provider module name, the Faker method name, and the generated string.
provider_class, providers, examples = faker_examples()

In [11]:
# Text classifier: token counts -> TF-IDF re-weighting -> logistic regression,
# each stage constructed with its library-default settings.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
])

In [21]:
# Hold out 33% of the generated strings for evaluation.  The labels are the
# Faker method names; the fixed random_state makes the shuffle repeatable for
# a given corpus.
X_train, X_test, y_train, y_test = train_test_split(
    examples,
    providers,
    test_size=0.33,
    random_state=919,
)

In [22]:
# Fit every stage of the pipeline on the training split.
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [23]:
# Label every held-out string, then report plain accuracy: the fraction of
# predictions that equal the true label.
predicted = text_clf.predict(X_test)
np.mean(np.asarray(y_test) == predicted)

0.9497064015725357

In [47]:
# Probe the fitted pipeline with hand-written strings.  For each probe the five
# most probable labels are collected as single-entry {label: probability}
# dicts; the pooled entries are then printed from most to least probable.
testers = ['Florida', 'New York', 'Illinois', 'New Hampshire', 'California', 'Texas', 'tennessee', 'indiana', 'kentucky']
classes = []
labels = text_clf.classes_
for tester in testers:
    probabilities = text_clf.predict_proba([tester])[0]
    # Pair each probability with its label and rank from high to low.
    ranked = sorted(zip(probabilities, labels), reverse=True)
    classes.extend({label: probability} for probability, label in ranked[:5])
print(sorted(classes, key=lambda entry: list(entry.values())[0], reverse=True))

[{'city': 0.8668028457216694}, {'city': 0.8543483348952056}, {'ipv6': 0.41376536254285323}, {'ipv6': 0.41376536254285323}, {'ipv6': 0.41376536254285323}, {'ipv6': 0.41376536254285323}, {'ipv6': 0.41376536254285323}, {'ipv6': 0.41376536254285323}, {'ipv6': 0.41376536254285323}, {'city': 0.15986923406839768}, {'city': 0.15986923406839768}, {'city': 0.15986923406839768}, {'city': 0.15986923406839768}, {'city': 0.15986923406839768}, {'city': 0.15986923406839768}, {'city': 0.15986923406839768}, {'phone_number': 0.1508661426093504}, {'phone_number': 0.1508661426093504}, {'phone_number': 0.1508661426093504}, {'phone_number': 0.1508661426093504}, {'phone_number': 0.1508661426093504}, {'phone_number': 0.1508661426093504}, {'phone_number': 0.1508661426093504}, {'ssn': 0.1201362272949441}, {'ssn': 0.1201362272949441}, {'ssn': 0.1201362272949441}, {'ssn': 0.1201362272949441}, {'ssn': 0.1201362272949441}, {'ssn': 0.1201362272949441}, {'ssn': 0.1201362272949441}, {'address': 0.11751602525847793}, {'

In [48]:
# Count how many held-out strings were assigned to each predicted label,
# listed from most to least frequent.
Counter(predicted).most_common()

[('ipv6', 8623),
 ('ipv4', 6654),
 ('email', 6184),
 ('uri_path', 6174),
 ('url', 6114),
 ('mac_address', 6101),
 ('ssn', 6011),
 ('user_agent', 5764),
 ('name', 5648),
 ('file_path', 5344),
 ('address', 5167),
 ('country', 5059),
 ('phone_number', 3980),
 ('city', 3048)]

In [49]:
# Per-label precision, recall, F1 and support on the held-out split.
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

     address       1.00      1.00      1.00      5187
        city       0.99      0.59      0.74      5130
     country       1.00      1.00      1.00      5049
       email       1.00      1.00      1.00      6184
   file_path       1.00      1.00      1.00      5343
        ipv4       0.90      0.98      0.94      6141
        ipv6       0.70      1.00      0.82      6076
 mac_address       1.00      1.00      1.00      6126
        name       1.00      0.98      0.99      5729
phone_number       0.99      0.79      0.88      4975
         ssn       0.88      0.90      0.89      5882
    uri_path       1.00      1.00      1.00      6174
         url       1.00      1.00      1.00      6114
  user_agent       1.00      1.00      1.00      5761

 avg / total       0.96      0.95      0.95     79871



In [50]:
# Confusion matrix on the held-out split (true labels passed first, predictions
# second); per scikit-learn's convention rows are true labels and columns are
# predicted labels.
metrics.confusion_matrix(y_test, predicted)

array([[5163,   18,    0,    0,    1,    0,    0,    0,    5,    0,    0,
           0,    0,    0],
       [   0, 3019,    0,    0,    0,    0, 2111,    0,    0,    0,    0,
           0,    0,    0],
       [   0,    0, 5049,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0],
       [   0,    0,    0, 6184,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0],
       [   1,    0,    0,    0, 5342,    0,    0,    0,    0,    0,    0,
           0,    0,    0],
       [   0,    0,    0,    0,    0, 6015,    0,    0,    0,    0,  126,
           0,    0,    0],
       [   0,    0,    0,    0,    0,    1, 6050,    0,    0,   19,    6,
           0,    0,    0],
       [   0,    0,    0,    0,    0,    9,    0, 6101,    0,    0,   16,
           0,    0,    0],
       [   3,   11,   10,    0,    1,    0,   61,    0, 5643,    0,    0,
           0,    0,    0],
       [   0,    0,    0,    0,    0,   93,  401,    0,    0, 3927,  551,
           0,    

In [57]:
# Persist the fitted pipeline to a file in the current working directory.
# NOTE(review): `sklearn.externals.joblib` was deprecated in scikit-learn 0.21
# and removed in 0.23; newer environments need the standalone `joblib` package.
filename = 'faker.classifier'
joblib.dump(text_clf, filename)

['faker.classifier']

In [58]:
# Reload the pipeline from the file written above.
# NOTE(review): joblib.load unpickles arbitrary objects and can execute code,
# so only load files that come from a trusted source.
new_clf = joblib.load(filename)