In [7]:
import pandas as pd
import numpy as np
import joblib
import numpy as np
import matplotlib.pyplot as plt
import sklearn.feature_extraction.text
import sklearn.naive_bayes
import sklearn.metrics
import sklearn.model_selection
import sklearn.pipeline
from sklearn.model_selection import train_test_split

In [4]:
import zipfile
path = "/content/"
file = "Base de Treino Pessoas.zip"
with zipfile.ZipFile(path + "Base de Treino Pessoas.zip", 'r') as zip_ref:
          zip_ref.extractall(path)

In [None]:
# colocar código para pegar a base do hub de dados

In [5]:
def PrepareData():
  """
  Function to prepare the dataset for model
  """

  df = pd.read_csv("Base de Treino Pessoas.csv")
  df['TIPO_PESSOA'] = df['TIPO_PESSOA'].replace('PESSOA FISICA (CPF)' ,'PF')
  df['TIPO_PESSOA'] = df['TIPO_PESSOA'].replace('PESSOA JURIDICA (CNPJ)' ,'PJ')
  df['TIPO_PESSOA'] = df['TIPO_PESSOA'].replace('PESSOA JURIDICA (CNP' ,'PJ')
  df= df[~df["TIPO_PESSOA"].isna()]
  df = df.drop_duplicates()
  dinamica = df.groupby(['NOME_PESSOA']).size().reset_index().rename(columns={0:'contagem'})
  remove_list = dinamica[dinamica["contagem"]>1]["NOME_PESSOA"]
  df = df[~df['NOME_PESSOA'].isin(remove_list)]

  print(df["TIPO_PESSOA"].unique())

  Y = df["TIPO_PESSOA"].values
  X = df["NOME_PESSOA"].values
  
  # handling dtype issuse, in NOME_PESSOA, there are numbers 
  x=[]
  y=[]
  for i, j in zip(X.tolist(),Y.tolist()):
    if type(i)!=str:
      continue 
    else:
      x.append(i)
      y.append(j)
  x= np.array(x)
  y = np.array(y)
  return x, y

def train_and_evaluate(train,ytrain):
    
    # Convert to bag of words
    count_vect = sklearn.feature_extraction.text.CountVectorizer(strip_accents='ascii', stop_words='english', lowercase=True, ngram_range=(1,1))
    X = count_vect.fit_transform(train)
    # Convert from occurrences to frequencies
    # Occurrence count is a good start but there is an issue: longer documents will have higher average count values than shorter documents, even though they might talk about the same topics.
    # To avoid these potential discrepancies it suffices to divide the number of occurrences of each word in a document by the total number of words in the document: these new features are called tf for Term Frequencies.
    transformer = sklearn.feature_extraction.text.TfidfTransformer()
    X = transformer.fit_transform(X)
    # Create a model
    model = sklearn.naive_bayes.MultinomialNB(alpha=0.3, fit_prior=True, class_prior=None)
    # Train the model
    model.fit(X, ytrain)
    # Save models
    joblib.dump(count_vect, 'vectorizer.jbl')
    joblib.dump(transformer, 'transformer.jbl')
    joblib.dump(model, 'model.jbl')
    # Evaluate on training data
    print('-- Training data --')
    predictions = model.predict(X)
    accuracy = sklearn.metrics.accuracy_score(ytrain, predictions)
    print('Accuracy: {0:.2f}'.format(accuracy * 100.0))
    print('Classification Report:')
    print(sklearn.metrics.classification_report(ytrain, predictions))
    print('')
    # Evaluate with 10-fold CV
    print('-- 10-fold CV --')
    predictions = sklearn.model_selection.cross_val_predict(model, X, ytrain, cv=10)
    accuracy = sklearn.metrics.accuracy_score(ytrain, predictions)
    print('Accuracy: {0:.2f}'.format(accuracy * 100.0))
    print('Classification Report:')
    print(sklearn.metrics.classification_report(ytrain, predictions))

# The main entry point for this module
def Start_Training():
    # Train and evaluate
    x,y = PrepareData()
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)
    train_and_evaluate(X_train,y_train)

In [8]:
# If you want to train then uncomment these below line
Start_Training()

['PF' 'PJ']
-- Training data --
Accuracy: 99.89
Classification Report:
              precision    recall  f1-score   support

          PF       1.00      1.00      1.00   1738637
          PJ       1.00      0.97      0.98     57303

    accuracy                           1.00   1795940
   macro avg       1.00      0.98      0.99   1795940
weighted avg       1.00      1.00      1.00   1795940


-- 10-fold CV --
Accuracy: 99.80
Classification Report:
              precision    recall  f1-score   support

          PF       1.00      1.00      1.00   1738637
          PJ       0.97      0.96      0.97     57303

    accuracy                           1.00   1795940
   macro avg       0.99      0.98      0.98   1795940
weighted avg       1.00      1.00      1.00   1795940



In [9]:
file = open("Classifier_PF_PJ.py", "w") 
file.write("""
import joblib
def is_PF_PJ(texto):
    vectorizer = joblib.load('vectorizer.jbl')
    transformer = joblib.load('transformer.jbl')
    model = joblib.load('model.jbl')
    X = vectorizer.transform([texto])
    X = transformer.transform(X)
    predictions = model.predict(X)
    return predictions[0]
""") 
file.close() 

In [10]:
import Classifier_PF_PJ

In [11]:
Classifier_PF_PJ.is_PF_PJ("Amilcar S Sampaio")

'PF'

In [12]:
Classifier_PF_PJ.is_PF_PJ("BCO BTG PACTUAL S.A.") 

'PJ'