# Criação da base contendo apenas o NCM mais comum

Etapa de criação de uma nova base, a partir da base limpa anteriormente, contendo apenas os registros que pertencem ao NCM mais comum. Essa etapa tem como objetivo reduzir a base para facilitar a execução dos modelos.

In [1]:
# Bibliotecas de manipulação de dados
import pandas as pd
import numpy as np

# Biblioteca para desligar os avisos do Python
import warnings

# Notebook-wide settings: silence every Python warning and let pandas
# render all columns of a frame instead of truncating wide tables.
warnings.filterwarnings(action="ignore")
pd.options.display.max_columns = None

In [2]:
# Load the previously cleaned dataset.
# Forward slashes keep the relative path valid on Windows, Linux and macOS;
# a backslash-separated path is only interpreted as a directory path on Windows.
df = pd.read_csv('../../datasets/nfs_cleaned.csv', encoding='latin-1')
# `display.max_columns` is already set to None in the imports cell, so it is
# not repeated here.
# verbose/show_counts request the full per-column report with non-null counts,
# which pandas may abbreviate by default on very large frames.
df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2065547 entries, 0 to 2065546
Data columns (total 19 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   nf_numero         2065547 non-null  int64  
 1   nf_item           2065547 non-null  int64  
 2   nf_datahora       2065547 non-null  object 
 3   nf_timestamp      2065547 non-null  int64  
 4   nf_valor_total    2065547 non-null  float64
 5   emit_nome         2065547 non-null  object 
 6   emit_cnpj         2065547 non-null  int64  
 7   emit_bairro       2065547 non-null  object 
 8   emit_municipio    2065547 non-null  object 
 9   emit_cep          2065547 non-null  int64  
 10  emit_lat          2065547 non-null  float64
 11  emit_long         2065547 non-null  float64
 12  prod_desc         2065547 non-null  object 
 13  prod_ncm          2065547 non-null  int64  
 14  prod_cfop         2065547 non-null  int64  
 15  prod_quant        2065547 non-null  float64
 16  

In [10]:
# Count the rows per NCM code, then pick the code with the highest count
ncm_counts = df['prod_ncm'].value_counts()
most_common_ncm = ncm_counts.idxmax()
most_common_ncm

30049099

In [11]:
# Keep only the rows whose NCM matches the most common one, then preview them
is_most_common = df['prod_ncm'].eq(most_common_ncm)
df = df.loc[is_most_common]
df.head(5)

Unnamed: 0,nf_numero,nf_item,nf_datahora,nf_timestamp,nf_valor_total,emit_nome,emit_cnpj,emit_bairro,emit_municipio,emit_cep,emit_lat,emit_long,prod_desc,prod_ncm,prod_cfop,prod_quant,prod_unid,prod_valor_unit,prod_valor_total
494,3332,7,2016-04-13 08:52:33,1460537553,1415.34,dimedont distr. de medic. e equipamentos ltda,4064641000160,centro,cajazeiras,58900000,-6.889785,-38.557039,citalopram 20mg comp,30049099,5403,240.0,und,0.48,115.2
495,3332,8,2016-04-13 08:52:33,1460537553,1415.34,dimedont distr. de medic. e equipamentos ltda,4064641000160,centro,cajazeiras,58900000,-6.889785,-38.557039,citalopram 20mg comp,30049099,5403,168.0,und,0.48,80.64
512,2099,16,2016-08-25 08:41:21,1472114481,4490.26,biomed dist. hosp. e lab. nossa senhora da con...,7936090000176,remedios,cajazeiras,58900000,-6.889785,-38.557039,"soro fisiologico 0,9p 100ml cx c/80amp",30049099,5102,158.0,und,2.55,402.9
666,8361,1,2016-12-28 10:02:53,1482919373,8000.7,dimedont distr. de medic. e equipamentos ltda,4064641000160,centro,cajazeiras,58900000,-6.889785,-38.557039,soro fisiologico 500ml c/24,30049099,5403,70.0,cx,61.0,4270.0
667,8361,2,2016-12-28 10:02:53,1482919373,8000.7,dimedont distr. de medic. e equipamentos ltda,4064641000160,centro,cajazeiras,58900000,-6.889785,-38.557039,soro glicosado 5p 500ml,30049099,5403,200.0,fr,3.4,680.0


In [12]:
# Summarise the filtered frame (entry count, dtypes, non-null counts) to
# check the result of the NCM filter applied in the previous cell.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 76457 entries, 494 to 2065414
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   nf_numero         76457 non-null  int64  
 1   nf_item           76457 non-null  int64  
 2   nf_datahora       76457 non-null  object 
 3   nf_timestamp      76457 non-null  int64  
 4   nf_valor_total    76457 non-null  float64
 5   emit_nome         76457 non-null  object 
 6   emit_cnpj         76457 non-null  int64  
 7   emit_bairro       76457 non-null  object 
 8   emit_municipio    76457 non-null  object 
 9   emit_cep          76457 non-null  int64  
 10  emit_lat          76457 non-null  float64
 11  emit_long         76457 non-null  float64
 12  prod_desc         76457 non-null  object 
 13  prod_ncm          76457 non-null  int64  
 14  prod_cfop         76457 non-null  int64  
 15  prod_quant        76457 non-null  float64
 16  prod_unid         76457 non-null  ob

In [13]:
# Save the reduced dataset in the same datasets directory as the input file.
# Forward slashes keep the relative path valid on Windows, Linux and macOS;
# with backslashes, POSIX systems would create a file whose *name* contains
# the backslashes instead of writing into ../../datasets.
# index=False leaves the DataFrame index out of the CSV.
# NOTE(review): the input was read with encoding='latin-1', while this call
# uses pandas' default output encoding — confirm downstream readers expect that.
df.to_csv('../../datasets/nfs_cleaned_most_common_ncm.csv', index=False)