In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from statsmodels.robust.scale import mad as median_abs_deviation
import warnings

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv(r'..\..\datasets\nfs_cleaned_most_common_ncm.csv', encoding='latin-1')
pd.set_option('display.max_columns', None)
df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76457 entries, 0 to 76456
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   nf_numero         76457 non-null  int64  
 1   nf_item           76457 non-null  int64  
 2   nf_datahora       76457 non-null  object 
 3   nf_timestamp      76457 non-null  int64  
 4   nf_valor_total    76457 non-null  float64
 5   emit_nome         76457 non-null  object 
 6   emit_cnpj         76457 non-null  int64  
 7   emit_bairro       76457 non-null  object 
 8   emit_municipio    76457 non-null  object 
 9   emit_cep          76457 non-null  int64  
 10  emit_lat          76457 non-null  float64
 11  emit_long         76457 non-null  float64
 12  prod_desc         76457 non-null  object 
 13  prod_ncm          76457 non-null  int64  
 14  prod_cfop         76457 non-null  int64  
 15  prod_quant        76457 non-null  float64
 16  prod_unid         76457 non-null  object

In [3]:
df = df[[
    'nf_timestamp',
    'emit_lat',
    'emit_long',
    'prod_quant',
    'prod_valor_unit',
]]

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76457 entries, 0 to 76456
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   nf_timestamp     76457 non-null  int64  
 1   emit_lat         76457 non-null  float64
 2   emit_long        76457 non-null  float64
 3   prod_quant       76457 non-null  float64
 4   prod_valor_unit  76457 non-null  float64
dtypes: float64(4), int64(1)
memory usage: 2.9 MB


In [4]:
robust_scaler = RobustScaler()
robust_scaler.fit(df)
normalized_data = robust_scaler.transform(df)
df_rob_norm = pd.DataFrame(data=normalized_data, columns=df.columns)
df_rob_norm.to_csv('..\..\datasets\df_rob_norm_most_common_ncm.csv', index=False)
df_rob_norm.head(5)

Unnamed: 0,nf_timestamp,emit_lat,emit_long,prod_quant,prod_valor_unit
0,-0.527934,0.897163,-1.188215,1.567568,-0.3205
1,-0.527934,0.897163,-1.188215,1.081081,-0.3205
2,0.270203,0.897163,-1.188215,1.013514,-0.270693
3,1.015115,0.897163,-1.188215,0.418919,1.135707
4,1.015115,0.897163,-1.188215,1.297297,-0.250241


In [5]:
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(df)
df_minmax_norm = pd.DataFrame(data=normalized_data, columns=df.columns)
df_minmax_norm.to_csv('..\..\datasets\df_minmax_norm_most_common_ncm.csv', index=False)
df_minmax_norm.head(5)

Unnamed: 0,nf_timestamp,emit_lat,emit_long,prod_quant,prod_valor_unit
0,0.276635,0.7153,0.871734,0.000799,4.1e-05
1,0.276635,0.7153,0.871734,0.000559,4.1e-05
2,0.64594,0.7153,0.871734,0.000526,0.000219
3,0.990616,0.7153,0.871734,0.000233,0.005231
4,0.990616,0.7153,0.871734,0.000666,0.000292


In [6]:
z_scores = {}

for column in df_rob_norm.columns:
    column_data = df_rob_norm[column].values
    mean = np.mean(column_data)
    std = np.std(column_data)
    z_scores[column] = (column_data - mean) / std

df_rob_zscores = pd.DataFrame(z_scores)
df_rob_zscores.to_csv('..\..\datasets\df_rob_zscores_most_common_ncm.csv', index=False)
df_rob_zscores

Unnamed: 0,nf_timestamp,emit_lat,emit_long,prod_quant,prod_valor_unit
0,-0.885006,0.173138,-0.230206,-0.075961,-0.271612
1,-0.885006,0.173138,-0.230206,-0.093301,-0.271612
2,0.443764,0.173138,-0.230206,-0.095710,-0.257285
3,1.683922,0.173138,-0.230206,-0.116903,0.147255
4,1.683922,0.173138,-0.230206,-0.085595,-0.251402
...,...,...,...,...,...
76452,1.457786,-3.930407,-2.737205,0.006403,-0.228493
76453,1.457786,-3.930407,-2.737205,-0.132797,-0.228493
76454,-1.835384,-4.517759,-1.365819,-0.119311,-0.166341
76455,-1.835384,-4.517759,-1.365819,0.058904,-0.220257


In [7]:
z_scores = {}

for column in df_minmax_norm.columns:
    column_data = df_minmax_norm[column].values
    mean = np.mean(column_data)
    std = np.std(column_data)
    z_scores[column] = (column_data - mean) / std

df_minmax_zscores = pd.DataFrame(z_scores)
df_minmax_zscores.to_csv('..\..\datasets\df_minmax_zscores_most_common_ncm.csv', index=False)
df_minmax_zscores

Unnamed: 0,nf_timestamp,emit_lat,emit_long,prod_quant,prod_valor_unit
0,-0.885006,0.173138,-0.230206,-0.075961,-0.271612
1,-0.885006,0.173138,-0.230206,-0.093301,-0.271612
2,0.443764,0.173138,-0.230206,-0.095710,-0.257285
3,1.683922,0.173138,-0.230206,-0.116903,0.147255
4,1.683922,0.173138,-0.230206,-0.085595,-0.251402
...,...,...,...,...,...
76452,1.457786,-3.930407,-2.737205,0.006403,-0.228493
76453,1.457786,-3.930407,-2.737205,-0.132797,-0.228493
76454,-1.835384,-4.517759,-1.365819,-0.119311,-0.166341
76455,-1.835384,-4.517759,-1.365819,0.058904,-0.220257


In [8]:
z_scores = {}

for column in df_rob_norm.columns:
    column_data = df_rob_norm[column].values
    median = np.median(column_data)
    mad = median_abs_deviation(column_data)
    z_scores[column] = (column_data - median) / (0.6745 * mad)

df_rob_rzscores = pd.DataFrame(z_scores)
df_rob_rzscores.to_csv('..\..\datasets\df_rob_rzscores_most_common_ncm.csv', index=False)
df_rob_rzscores

Unnamed: 0,nf_timestamp,emit_lat,emit_long,prod_quant,prod_valor_unit
0,-1.055105,1.684141,-2.693062,33.142353,-1.009076
1,-1.055105,1.684141,-2.693062,22.856796,-1.009076
2,0.540016,1.684141,-2.693062,21.428246,-0.852260
3,2.028763,1.684141,-2.693062,8.857008,3.575703
4,2.028763,1.684141,-2.693062,27.428155,-0.787867
...,...,...,...,...,...
76452,1.757298,-68.129725,-13.549798,81.998754,-0.537113
76453,1.757298,-68.129725,-13.549798,-0.571420,-0.537113
76454,-2.195985,-78.122380,-7.610913,7.428459,0.143180
76455,-2.195985,-78.122380,-7.610913,113.141138,-0.446963


In [9]:
df_norm = df.copy()

minmax_scaler = MinMaxScaler()
df_norm['nf_timestamp'] = minmax_scaler.fit_transform(df[['nf_timestamp']])

robust_scaler = RobustScaler()
df_norm['emit_lat'] = robust_scaler.fit_transform(df[['emit_lat']])
df_norm['emit_long'] = robust_scaler.fit_transform(df[['emit_long']])

df_norm['prod_quant'] = np.log(df['prod_quant'] + 1)
df_norm['prod_valor_unit'] = np.log(df['prod_valor_unit'] + 1)

df_norm.to_csv('..\..\datasets\df_norm_most_common_ncm.csv', index=False)

df_norm.head(5)

Unnamed: 0,nf_timestamp,emit_lat,emit_long,prod_quant,prod_valor_unit
0,0.276635,0.897163,-1.188215,5.484797,0.392042
1,0.276635,0.897163,-1.188215,5.129899,0.392042
2,0.64594,0.897163,-1.188215,5.068904,1.266948
3,0.990616,0.897163,-1.188215,4.26268,4.127134
4,0.990616,0.897163,-1.188215,5.303305,1.481605


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76457 entries, 0 to 76456
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   nf_timestamp     76457 non-null  float64
 1   emit_lat         76457 non-null  float64
 2   emit_long        76457 non-null  float64
 3   prod_quant       76457 non-null  float64
 4   prod_valor_unit  76457 non-null  float64
dtypes: float64(5)
memory usage: 2.9 MB


In [11]:
z_scores = {}

for column in df_norm.columns:
    column_data = df_norm[column].values
    mean = np.mean(column_data)
    std = np.std(column_data)
    z_scores[column] = (column_data - mean) / std

df_zscores = pd.DataFrame(z_scores)
df_zscores.to_csv('..\..\datasets\df_zscores_most_common_ncm.csv', index=False)
df_zscores

Unnamed: 0,nf_timestamp,emit_lat,emit_long,prod_quant,prod_valor_unit
0,-0.885006,0.173138,-0.230206,0.987516,-1.371881
1,-0.885006,0.173138,-0.230206,0.840297,-1.371881
2,0.443764,0.173138,-0.230206,0.814996,-0.816732
3,1.683922,0.173138,-0.230206,0.480557,0.998128
4,1.683922,0.173138,-0.230206,0.912230,-0.680527
...,...,...,...,...,...
76452,1.457786,-3.930407,-2.737205,1.353965,-0.324613
76453,1.457786,-3.930407,-2.737205,-0.620062,-0.324613
76454,-1.835384,-4.517759,-1.365819,0.417585,0.165425
76455,-1.835384,-4.517759,-1.365819,1.485742,-0.233538


In [None]:
z_scores = {}

for column in df_norm.columns:
    column_data = df_norm[column].values
    median = np.median(column_data)
    mad = median_abs_deviation(column_data)
    z_scores[column] = (column_data - median) / (0.6745 * mad)

df_rzscores = pd.DataFrame(z_scores)
df_rzscores.to_csv('..\..\datasets\df_rzscores_most_common_ncm.csv', index=False)
df_rzscores