In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# File paths
diario_file = "../data_input/diario/Diario_Vendas_Atual_2025 oficial2.xlsx"
mescladas_file = "../data_input/diario/Mescladas2.xlsx"


In [2]:

print("Reading data from Excel files...")

# Read from Diario_Vendas_Atual_2025 oficial2, sheet=BD HT
try:
    diario_df = pd.read_excel(diario_file, sheet_name='BD HT')
    print(f"✓ Diario data loaded: {len(diario_df)} rows")
    print(f"  Columns: {list(diario_df.columns)}")
except Exception as e:
    print(f"✗ Error reading Diario file: {e}")
    diario_df = None

# Read from Mescladadas2, sheet=BD HT
try:
    mescladas_df = pd.read_excel(mescladas_file, sheet_name='BD HT')
    print(f"✓ Mescladas data loaded: {len(mescladas_df)} rows")
    print(f"  Columns: {list(mescladas_df.columns)}")
except Exception as e:
    print(f"✗ Error reading Mescladas file: {e}")
    mescladas_df = None


Reading data from Excel files...
✓ Diario data loaded: 460005 rows
  Columns: ['Cliente', 'Loja', 'Nome', 'Nom Paciente', 'DT Emissao', 'Tipo da nota', 'Numero', 'Serie Docto.', 'NF Eletr.', 'Vend. 1', 'Médico', 'Cliente.1', 'Opr ', 'Operador', 'Produto', 'Descricao', 'Qntd.', 'Valor Unitario', 'Valor Mercadoria', 'Total', 'Custo', 'Custo Unit', 'Desconto', 'Unidade', 'Mês', 'Ano', 'Cta-Ctbl', 'Cta-Ctbl Eugin', 'Interno/Externo', 'Descrição Gerencial', 'Descrição Mapping Actividad', 'Ciclos', 'Qnt Cons.']
✓ Mescladas data loaded: 597742 rows
  Columns: ['Grp', 'Filial', 'Doc', 'Data', 'TES', 'Descr.TES', 'Cod Cli', 'MedSof Cli', 'Nom Cliente', 'MedSof Pac.', 'Paciente', 'Medico', 'Nom Medico', 'Produto', 'Descr.Prod.', 'Cta.Contab.', 'NFSe', 'Serie', 'Qtde', 'Vlr Venda', 'Vlr Desconto', 'Ciclos', 'Descrição Mapping Actividad', 'Descrição Gerencial', 'Lead Time', 'Data do Ciclo', 'Fez Ciclo?']


In [8]:
colunas = []
for coluna in diario_df.columns:
    colunas.append(coluna.strip())
diario_df.columns = colunas

colunas = []
for coluna in mescladas_df.columns:
    colunas.append(coluna.strip())
mescladas_df.columns = colunas


In [9]:
diario_df.columns

Index(['Cliente', 'Loja', 'Nome', 'Nom Paciente', 'DT Emissao', 'Tipo da nota',
       'Numero', 'Serie Docto.', 'NF Eletr.', 'Vend. 1', 'Médico', 'Cliente.1',
       'Opr', 'Operador', 'Produto', 'Descricao', 'Qntd.', 'Valor Unitario',
       'Valor Mercadoria', 'Total', 'Custo', 'Custo Unit', 'Desconto',
       'Unidade', 'Mês', 'Ano', 'Cta-Ctbl', 'Cta-Ctbl Eugin',
       'Interno/Externo', 'Descrição Gerencial', 'Descrição Mapping Actividad',
       'Ciclos', 'Qnt Cons.'],
      dtype='object')

In [None]:

if diario_df is not None and mescladas_df is not None:
    print("\n" + "="*60)
    print("DATA COMPARISON ANALYSIS")
    print("="*60)
    
    # Date range filter: January 2024 to June 2025
    start_date = pd.to_datetime('2024-01-01')
    end_date = pd.to_datetime('2025-06-30')
    print(f"\n📅 Date filter applied: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    
    # Clean and prepare Diario data
    print("\n1. Preparing Diario data...")
    diario_clean = diario_df[['Cliente', 'Nome', 'Nom Paciente', 'DT Emissao', 'Total']].copy()
    diario_clean['Cliente'] = diario_clean['Cliente'].astype(str).str.strip().str.replace('nan', '')
    diario_clean['Total'] = pd.to_numeric(diario_clean['Total'], errors='coerce')
    
    # Convert DT Emissao to datetime and apply date filter
    diario_clean['DT Emissao'] = pd.to_datetime(diario_clean['DT Emissao'], errors='coerce')
    diario_clean = diario_clean[(diario_clean['DT Emissao'] >= start_date) & (diario_clean['DT Emissao'] <= end_date)]
    print(f"   Rows after date filter: {len(diario_clean)}")
    
    # Group by Cliente and sum Total
    diario_grouped = diario_clean.groupby('Cliente')['Total'].sum().reset_index()
    diario_grouped.columns = ['Cliente', 'Total_Diario']
    print(f"   Unique clients in Diario: {len(diario_grouped)}")
    
    # Clean and prepare Mescladas data
    print("\n2. Preparing Mescladas data...")
    mescladas_clean = mescladas_df[['Cod Cli', 'MedSof Cli', 'Nom Cliente', 'MedSof Pac.', 'Paciente', 'Data', 'Vlr Venda', 'Vlr Desconto']].copy()
    mescladas_clean['Cod Cli'] = mescladas_clean['Cod Cli'].astype(str).str.strip()
    mescladas_clean['Vlr Venda'] = pd.to_numeric(mescladas_clean['Vlr Venda'], errors='coerce')
    mescladas_clean['Vlr Desconto'] = pd.to_numeric(mescladas_clean['Vlr Desconto'], errors='coerce')
    
    # Convert Data to datetime and apply date filter
    mescladas_clean['Data'] = pd.to_datetime(mescladas_clean['Data'], errors='coerce')
    mescladas_clean = mescladas_clean[(mescladas_clean['Data'] >= start_date) & (mescladas_clean['Data'] <= end_date)]
    print(f"   Rows after date filter: {len(mescladas_clean)}")
    
    # Calculate net value (Venda - Desconto)
    mescladas_clean['Net_Value'] = mescladas_clean['Vlr Venda'] #- mescladas_clean['Vlr Desconto']
    
    # Group by Cod Cli and sum Net_Value
    mescladas_grouped = mescladas_clean.groupby('Cod Cli')['Net_Value'].sum().reset_index()
    mescladas_grouped.columns = ['Cod Cli', 'Total_Mescladas']
    print(f"   Unique clients in Mescladas: {len(mescladas_grouped)}")
    
    # Check client overlap before merging
    print("\n3. Analyzing client overlap...")
    
    # Convert client codes to integers for proper comparison
    diario_clients = set(diario_grouped['Cliente'].astype(str).str.strip())
    mescladas_clients = set(mescladas_grouped['Cod Cli'].astype(str).str.strip())
    
    # Find clients only in each dataset
    only_in_diario = diario_clients - mescladas_clients
    only_in_mescladas = mescladas_clients - diario_clients
    in_both = diario_clients & mescladas_clients
    
    print(f"   Clients only in Diario: {len(only_in_diario)}")
    print(f"   Clients only in Mescladas: {len(only_in_mescladas)}")
    print(f"   Clients in both datasets: {len(in_both)}")
    print(f"   Total unique clients: {len(diario_clients | mescladas_clients)}")
    
    # Show some examples of clients only in each dataset
    if len(only_in_diario) > 0:
        print(f"\n   Examples of clients only in Diario (first 10):")
        for client in list(only_in_diario)[:10]:
            print(f"     - {client}")
    
    if len(only_in_mescladas) > 0:
        print(f"\n   Examples of clients only in Mescladas (first 10):")
        for client in list(only_in_mescladas)[:10]:
            print(f"     - {client}")
    
    # Merge the data for comparison
    print("\n4. Merging data for comparison...")
    comparison_df = pd.merge(
        diario_grouped, 
        mescladas_grouped, 
        left_on='Cliente', 
        right_on='Cod Cli', 
        how='outer'
    )
    
    # Fill NaN values with 0 for clients that don't exist in one of the datasets
    comparison_df['Total_Diario'] = comparison_df['Total_Diario'].fillna(0)
    comparison_df['Total_Mescladas'] = comparison_df['Total_Mescladas'].fillna(0)
    
    # Calculate differences
    comparison_df['Difference'] = comparison_df['Total_Diario'] - comparison_df['Total_Mescladas']
    comparison_df['Abs_Difference'] = abs(comparison_df['Difference'])
    
    # Sort by absolute difference (biggest differences first)
    comparison_df = comparison_df.sort_values('Abs_Difference', ascending=False)
    
    print(f"   Total clients compared: {len(comparison_df)}")
    
    # Display results
    print("\n" + "="*80)
    print("BIGGEST DIFFERENCES (Top 20)")
    print("="*80)
    print(f"{'Cliente':<15} {'Total_Diario':<15} {'Total_Mescladas':<15} {'Difference':<15} {'Abs_Diff':<15}")
    print("-" * 80)
    
    for idx, row in comparison_df.head(20).iterrows():
        cliente = str(row['Cliente'])[:14] if pd.notna(row['Cliente']) else 'N/A'
        print(f"{cliente:<15} {row['Total_Diario']:<15.2f} {row['Total_Mescladas']:<15.2f} {row['Difference']:<15.2f} {row['Abs_Difference']:<15.2f}")
    
    # Summary statistics
    print("\n" + "="*60)
    print("SUMMARY STATISTICS")
    print("="*60)
    print(f"Total clients in Diario only: {len(comparison_df[comparison_df['Total_Mescladas'] == 0])}")
    print(f"Total clients in Mescladas only: {len(comparison_df[comparison_df['Total_Diario'] == 0])}")
    print(f"Total clients in both datasets: {len(comparison_df[(comparison_df['Total_Diario'] > 0) & (comparison_df['Total_Mescladas'] > 0)])}")
    print(f"Average absolute difference: {comparison_df['Abs_Difference'].mean():.2f}")
    print(f"Maximum absolute difference: {comparison_df['Abs_Difference'].max():.2f}")
    print(f"Total difference (Diario - Mescladas): {comparison_df['Difference'].sum():.2f}")
    
    # Clients with exact matches
    exact_matches = len(comparison_df[comparison_df['Abs_Difference'] == 0])
    print(f"Exact matches: {exact_matches}")
    
    # # Save detailed results to CSV
    # output_file = "finops/data_input/comparison_results.csv"
    # comparison_df.to_csv(output_file, index=False)
    # print(f"\nDetailed results saved to: {output_file}")
    
    # Display the comparison dataframe
    print("\n" + "="*60)
    print("COMPARISON DATAFRAME (First 10 rows)")
    print("="*60)
    display(comparison_df.head(20))
    
else:
    print("Cannot proceed with comparison due to data loading errors.")


DATA COMPARISON ANALYSIS

📅 Date filter applied: 2024-01-01 to 2025-06-30

1. Preparing Diario data...
   Rows after date filter: 284240
   Unique clients in Diario: 24915

2. Preparing Mescladas data...
   Rows after date filter: 284019
   Unique clients in Mescladas: 25127

3. Merging data for comparison...
   Total clients compared: 50042

BIGGEST DIFFERENCES (Top 20)
Cliente         Total_Diario    Total_Mescladas Difference      Abs_Diff       
--------------------------------------------------------------------------------
                3928599.26      0.00            3928599.26      3928599.26     
N/A             0.00            917762.26       -917762.26      917762.26      
N/A             0.00            508489.00       -508489.00      508489.00      
N/A             0.00            441096.56       -441096.56      441096.56      
N/A             0.00            259088.60       -259088.60      259088.60      
N/A             0.00            197966.34       -197966.34      

Unnamed: 0,Cliente,Total_Diario,Cod Cli,Total_Mescladas,Difference,Abs_Difference
0,,3928599.26,,0.0,3928599.26,3928599.26
1598,,0.0,50087.0,917762.26,-917762.26,917762.26
10942,,0.0,505730.0,508489.0,-508489.0,508489.0
294,,0.0,18583.0,441096.56,-441096.56,441096.56
3401,,0.0,70210.0,259088.6,-259088.6,259088.6
41631,,0.0,879876.0,197966.34,-197966.34,197966.34
28674,838907.0,197966.34,,0.0,197966.34,197966.34
24849,821297.0,194482.4,,0.0,194482.4,194482.4
37823,,0.0,873287.0,194482.4,-194482.4,194482.4
39165,,0.0,875859.0,193839.33,-193839.33,193839.33


In [20]:
# Additional analysis: Convert client codes to integers and check overlap
print("\n" + "="*60)
print("DETAILED CLIENT OVERLAP ANALYSIS")
print("="*60)

# Convert client codes to integers for proper comparison
def safe_int_convert(series):
    """Safely convert series to integers, handling non-numeric values"""
    converted = []
    for val in series:
        try:
            # Remove any non-numeric characters and convert
            clean_val = str(val).strip().replace('.0', '')
            if clean_val and clean_val != 'nan' and clean_val != '':
                converted.append(int(float(clean_val)))
            else:
                converted.append(None)
        except (ValueError, TypeError):
            converted.append(None)
    return converted

# Convert client codes to integers
diario_int_clients = [x for x in safe_int_convert(diario_grouped['Cliente']) if x is not None]
mescladas_int_clients = [x for x in safe_int_convert(mescladas_grouped['Cod Cli']) if x is not None]

print(f"Diario clients (as integers): {len(diario_int_clients)}")
print(f"Mescladas clients (as integers): {len(mescladas_int_clients)}")

# Find overlap using integer comparison
diario_int_set = set(diario_int_clients)
mescladas_int_set = set(mescladas_int_clients)

only_in_diario_int = diario_int_set - mescladas_int_set
only_in_mescladas_int = mescladas_int_set - diario_int_set
in_both_int = diario_int_set & mescladas_int_set

print(f"\nInteger-based comparison:")
print(f"   Clients only in Diario: {len(only_in_diario_int)}")
print(f"   Clients only in Mescladas: {len(only_in_mescladas_int)}")
print(f"   Clients in both datasets: {len(in_both_int)}")
print(f"   Total unique clients: {len(diario_int_set | mescladas_int_set)}")

# Show examples of clients only in each dataset (as integers)
if len(only_in_diario_int) > 0:
    print(f"\n   Examples of clients only in Diario (first 10):")
    for client in sorted(list(only_in_diario_int))[:10]:
        print(f"     - {client}")

if len(only_in_mescladas_int) > 0:
    print(f"\n   Examples of clients only in Mescladas (first 10):")
    for client in sorted(list(only_in_mescladas_int))[:10]:
        print(f"     - {client}")

# Calculate overlap percentage
total_clients = len(diario_int_set | mescladas_int_set)
overlap_percentage = (len(in_both_int) / total_clients) * 100 if total_clients > 0 else 0

print(f"\nOverlap Statistics:")
print(f"   Overlap percentage: {overlap_percentage:.2f}%")
print(f"   Diario coverage: {(len(in_both_int) / len(diario_int_set)) * 100:.2f}% of Diario clients are in Mescladas")
print(f"   Mescladas coverage: {(len(in_both_int) / len(mescladas_int_set)) * 100:.2f}% of Mescladas clients are in Diario")

# Additional comparison: Cliente vs MedSof Cli
print("\n" + "="*60)
print("COMPARISON: Cliente vs MedSof Cli")
print("="*60)

# Prepare MedSof Cli data
mescladas_medsoft_clean = mescladas_df[['MedSof Cli', 'Nom Cliente', 'Data', 'Vlr Venda', 'Vlr Desconto']].copy()
mescladas_medsoft_clean['MedSof Cli'] = mescladas_medsoft_clean['MedSof Cli'].astype(str).str.strip()
mescladas_medsoft_clean['Vlr Venda'] = pd.to_numeric(mescladas_medsoft_clean['Vlr Venda'], errors='coerce')
mescladas_medsoft_clean['Vlr Desconto'] = pd.to_numeric(mescladas_medsoft_clean['Vlr Desconto'], errors='coerce')

# Apply date filter
mescladas_medsoft_clean['Data'] = pd.to_datetime(mescladas_medsoft_clean['Data'], errors='coerce')
mescladas_medsoft_clean = mescladas_medsoft_clean[(mescladas_medsoft_clean['Data'] >= start_date) & (mescladas_medsoft_clean['Data'] <= end_date)]

# Calculate net value and group by MedSof Cli
mescladas_medsoft_clean['Net_Value'] = mescladas_medsoft_clean['Vlr Venda'] #- mescladas_medsoft_clean['Vlr Desconto']
mescladas_medsoft_grouped = mescladas_medsoft_clean.groupby('MedSof Cli')['Net_Value'].sum().reset_index()
mescladas_medsoft_grouped.columns = ['MedSof Cli', 'Total_MedSof']

print(f"MedSof Cli clients: {len(mescladas_medsoft_grouped)}")

# Convert to integers for comparison
diario_clients_medsoft = [x for x in safe_int_convert(diario_grouped['Cliente']) if x is not None]
medsoft_int_clients = [x for x in safe_int_convert(mescladas_medsoft_grouped['MedSof Cli']) if x is not None]

print(f"Diario clients (as integers): {len(diario_clients_medsoft)}")
print(f"MedSof Cli clients (as integers): {len(medsoft_int_clients)}")

# Find overlap between Cliente and MedSof Cli
diario_medsoft_set = set(diario_clients_medsoft)
medsoft_int_set = set(medsoft_int_clients)

only_in_diario_medsoft = diario_medsoft_set - medsoft_int_set
only_in_medsoft = medsoft_int_set - diario_medsoft_set
in_both_medsoft = diario_medsoft_set & medsoft_int_set

print(f"\nCliente vs MedSof Cli comparison:")
print(f"   Clients only in Diario: {len(only_in_diario_medsoft)}")
print(f"   Clients only in MedSof Cli: {len(only_in_medsoft)}")
print(f"   Clients in both datasets: {len(in_both_medsoft)}")
print(f"   Total unique clients: {len(diario_medsoft_set | medsoft_int_set)}")

# Show examples
if len(only_in_diario_medsoft) > 0:
    print(f"\n   Examples of clients only in Diario (first 10):")
    for client in sorted(list(only_in_diario_medsoft))[:10]:
        print(f"     - {client}")

if len(only_in_medsoft) > 0:
    print(f"\n   Examples of clients only in MedSof Cli (first 10):")
    for client in sorted(list(only_in_medsoft))[:10]:
        print(f"     - {client}")

# Calculate overlap percentage for MedSof comparison
total_clients_medsoft = len(diario_medsoft_set | medsoft_int_set)
overlap_percentage_medsoft = (len(in_both_medsoft) / total_clients_medsoft) * 100 if total_clients_medsoft > 0 else 0

print(f"\nMedSof Cli Overlap Statistics:")
print(f"   Overlap percentage: {overlap_percentage_medsoft:.2f}%")
print(f"   Diario coverage: {(len(in_both_medsoft) / len(diario_medsoft_set)) * 100:.2f}% of Diario clients are in MedSof Cli")
print(f"   MedSof Cli coverage: {(len(in_both_medsoft) / len(medsoft_int_set)) * 100:.2f}% of MedSof Cli clients are in Diario")



DETAILED CLIENT OVERLAP ANALYSIS
Diario clients (as integers): 24914
Mescladas clients (as integers): 25123

Integer-based comparison:
   Clients only in Diario: 17861
   Clients only in Mescladas: 17897
   Clients in both datasets: 7053
   Total unique clients: 42811

   Examples of clients only in Diario (first 10):
     - 17
     - 519
     - 578
     - 801
     - 1261
     - 1581
     - 1875
     - 2125
     - 2169
     - 2174

   Examples of clients only in Mescladas (first 10):
     - 1
     - 42
     - 106
     - 225
     - 354
     - 440
     - 447
     - 497
     - 501
     - 610

Overlap Statistics:
   Overlap percentage: 16.47%
   Diario coverage: 28.31% of Diario clients are in Mescladas
   Mescladas coverage: 28.27% of Mescladas clients are in Diario

COMPARISON: Cliente vs MedSof Cli
MedSof Cli clients: 20600
Diario clients (as integers): 24914
MedSof Cli clients (as integers): 20599

Cliente vs MedSof Cli comparison:
   Clients only in Diario: 4464
   Clients only in Me

In [22]:
# Financial comparison: Cliente vs MedSof Cli
print("\n" + "="*60)
print("FINANCIAL COMPARISON: Cliente vs MedSof Cli")
print("="*60)

# Merge Diario with MedSof Cli for financial comparison
comparison_medsoft_df = pd.merge(
    diario_grouped, 
    mescladas_medsoft_grouped, 
    left_on='Cliente', 
    right_on='MedSof Cli', 
    how='outer'
)

# Fill NaN values with 0
comparison_medsoft_df['Total_Diario'] = comparison_medsoft_df['Total_Diario'].fillna(0)
comparison_medsoft_df['Total_MedSof'] = comparison_medsoft_df['Total_MedSof'].fillna(0)

# Calculate differences
comparison_medsoft_df['Difference'] = comparison_medsoft_df['Total_Diario'] - comparison_medsoft_df['Total_MedSof']
comparison_medsoft_df['Abs_Difference'] = abs(comparison_medsoft_df['Difference'])

# Sort by absolute difference (biggest differences first)
comparison_medsoft_df = comparison_medsoft_df.sort_values('Abs_Difference', ascending=False)

print(f"Total clients compared (Cliente vs MedSof Cli): {len(comparison_medsoft_df)}")

# Display results
print("\n" + "="*80)
print("BIGGEST DIFFERENCES: Cliente vs MedSof Cli (Top 20)")
print("="*80)
print(f"{'Cliente':<15} {'Total_Diario':<15} {'Total_MedSof':<15} {'Difference':<15} {'Abs_Diff':<15}")
print("-" * 80)

for idx, row in comparison_medsoft_df.head(20).iterrows():
    cliente = str(row['Cliente'])[:14] if pd.notna(row['Cliente']) else 'N/A'
    print(f"{cliente:<15} {row['Total_Diario']:<15.2f} {row['Total_MedSof']:<15.2f} {row['Difference']:<15.2f} {row['Abs_Difference']:<15.2f}")

# Summary statistics for MedSof comparison
print("\n" + "="*60)
print("SUMMARY STATISTICS: Cliente vs MedSof Cli")
print("="*60)
print(f"Total clients in Diario only: {len(comparison_medsoft_df[comparison_medsoft_df['Total_MedSof'] == 0])}")
print(f"Total clients in MedSof Cli only: {len(comparison_medsoft_df[comparison_medsoft_df['Total_Diario'] == 0])}")
print(f"Total clients in both datasets: {len(comparison_medsoft_df[(comparison_medsoft_df['Total_Diario'] > 0) & (comparison_medsoft_df['Total_MedSof'] > 0)])}")
print(f"Average absolute difference: {comparison_medsoft_df['Abs_Difference'].mean():.2f}")
print(f"Maximum absolute difference: {comparison_medsoft_df['Abs_Difference'].max():.2f}")
print(f"Total difference (Diario - MedSof Cli): {comparison_medsoft_df['Difference'].sum():.2f}")

# Clients with exact matches
exact_matches_medsoft = len(comparison_medsoft_df[comparison_medsoft_df['Abs_Difference'] == 0])
print(f"Exact matches: {exact_matches_medsoft}")

# # Save detailed results to CSV
# output_file_medsoft = "finops/data_input/comparison_medsoft_results.csv"
# comparison_medsoft_df.to_csv(output_file_medsoft, index=False)
# print(f"\nDetailed MedSof Cli results saved to: {output_file_medsoft}")

# Display the comparison dataframe
print("\n" + "="*60)
print("COMPARISON DATAFRAME: Cliente vs MedSof Cli (First 10 rows)")
print("="*60)
display(comparison_medsoft_df.head(10))



FINANCIAL COMPARISON: Cliente vs MedSof Cli
Total clients compared (Cliente vs MedSof Cli): 25065

BIGGEST DIFFERENCES: Cliente vs MedSof Cli (Top 20)
Cliente         Total_Diario    Total_MedSof    Difference      Abs_Diff       
--------------------------------------------------------------------------------
N/A             0.00            49459479.54     -49459479.54    49459479.54    
                3928599.26      0.00            3928599.26      3928599.26     
41394.0         185024.62       0.00            185024.62       185024.62      
49852.0         162268.00       0.00            162268.00       162268.00      
810849.0        155453.07       0.00            155453.07       155453.07      
821879.0        148138.10       0.00            148138.10       148138.10      
825203.0        133228.93       0.00            133228.93       133228.93      
N/A             0.00            130016.95       -130016.95      130016.95      
839283.0        126424.64       0.00           

Unnamed: 0,Cliente,Total_Diario,MedSof Cli,Total_MedSof,Difference,Abs_Difference
25064,,0.0,,49459479.54,-49459479.54,49459479.54
0,,3928599.26,,0.0,3928599.26,3928599.26
1624,41394.0,185024.62,,0.0,185024.62,185024.62
2364,49852.0,162268.0,,0.0,162268.0,162268.0
12394,810849.0,155453.07,,0.0,155453.07,155453.07
13696,821879.0,148138.1,,0.0,148138.1,148138.1
14166,825203.0,133228.93,,0.0,133228.93,133228.93
24891,,0.0,889553.0,130016.95,-130016.95,130016.95
16930,839283.0,126424.64,,0.0,126424.64,126424.64
18193,845370.0,124056.95,,0.0,124056.95,124056.95


In [23]:
# List of Cliente that are NOT in MedSof Cli
print("="*80)
print("CLIENTS IN DIARIO BUT NOT IN MEDSOF CLI")
print("="*80)

# Get clients only in Diario (not in MedSof Cli)
clients_only_in_diario = sorted(list(only_in_diario_medsoft))

print(f"Total clients in Diario but not in MedSof Cli: {len(clients_only_in_diario)}")
print(f"\nFirst 50 clients (sorted by client code):")
print("-" * 50)

for i, client in enumerate(clients_only_in_diario[:50], 1):
    print(f"{i:3d}. {client}")

if len(clients_only_in_diario) > 50:
    print(f"\n... and {len(clients_only_in_diario) - 50} more clients")

# Show some statistics about these clients
print(f"\n" + "="*60)
print("STATISTICS FOR CLIENTS ONLY IN DIARIO")
print("="*60)

# Get financial data for these clients
diario_only_financial = diario_grouped[diario_grouped['Cliente'].astype(str).str.strip().isin([str(c) for c in clients_only_in_diario])]

if len(diario_only_financial) > 0:
    total_revenue = diario_only_financial['Total_Diario'].sum()
    avg_revenue = diario_only_financial['Total_Diario'].mean()
    max_revenue = diario_only_financial['Total_Diario'].max()
    min_revenue = diario_only_financial['Total_Diario'].min()
    
    print(f"Total revenue from these clients: R$ {total_revenue:,.2f}")
    print(f"Average revenue per client: R$ {avg_revenue:,.2f}")
    print(f"Highest revenue client: R$ {max_revenue:,.2f}")
    print(f"Lowest revenue client: R$ {min_revenue:,.2f}")
    
    # Show top 10 revenue clients
    top_revenue = diario_only_financial.nlargest(10, 'Total_Diario')
    print(f"\nTop 10 revenue clients (only in Diario):")
    print("-" * 50)
    for idx, row in top_revenue.iterrows():
        print(f"Client {row['Cliente']}: R$ {row['Total_Diario']:,.2f}")

# Create a DataFrame for easy export
clients_not_in_medsoft_df = pd.DataFrame({
    'Cliente': clients_only_in_diario,
    'In_MedSof_Cli': False
})

# Add financial data if available
if len(diario_only_financial) > 0:
    clients_not_in_medsoft_df = clients_not_in_medsoft_df.merge(
        diario_only_financial[['Cliente', 'Total_Diario']], 
        on='Cliente', 
        how='left'
    )
    clients_not_in_medsoft_df['Total_Diario'] = clients_not_in_medsoft_df['Total_Diario'].fillna(0)

print(f"\n" + "="*60)
print("EXPORT INFORMATION")
print("="*60)
print(f"DataFrame created with {len(clients_not_in_medsoft_df)} clients")
print("Columns: Cliente, In_MedSof_Cli, Total_Diario (if available)")
print("\nTo export this list, run:")
print("clients_not_in_medsoft_df.to_csv('clients_not_in_medsoft.csv', index=False)")

# Display the first 20 rows
print(f"\n" + "="*60)
print("SAMPLE DATA (First 20 rows)")
print("="*60)
display(clients_not_in_medsoft_df.head(20))


CLIENTS IN DIARIO BUT NOT IN MEDSOF CLI
Total clients in Diario but not in MedSof Cli: 4464

First 50 clients (sorted by client code):
--------------------------------------------------
  1. 2
  2. 52
  3. 182
  4. 543
  5. 2401
  6. 3469
  7. 3641
  8. 3701
  9. 3713
 10. 3860
 11. 4411
 12. 4598
 13. 5376
 14. 5557
 15. 5579
 16. 5584
 17. 5606
 18. 5931
 19. 6019
 20. 6310
 21. 6400
 22. 6455
 23. 6587
 24. 6843
 25. 6959
 26. 7313
 27. 7483
 28. 7650
 29. 7656
 30. 7718
 31. 7899
 32. 8031
 33. 8103
 34. 8156
 35. 8296
 36. 8305
 37. 8368
 38. 8431
 39. 8621
 40. 8636
 41. 8662
 42. 8664
 43. 8995
 44. 9025
 45. 9031
 46. 9156
 47. 9208
 48. 9296
 49. 9300
 50. 9387

... and 4414 more clients

STATISTICS FOR CLIENTS ONLY IN DIARIO

EXPORT INFORMATION
DataFrame created with 4464 clients
Columns: Cliente, In_MedSof_Cli, Total_Diario (if available)

To export this list, run:
clients_not_in_medsoft_df.to_csv('clients_not_in_medsoft.csv', index=False)

SAMPLE DATA (First 20 rows)


Unnamed: 0,Cliente,In_MedSof_Cli
0,2,False
1,52,False
2,182,False
3,543,False
4,2401,False
5,3469,False
6,3641,False
7,3701,False
8,3713,False
9,3860,False


In [None]:
# Count lines where Cod Cli != MedSof Cli (both not null) - Convert to numbers first
print("="*80)
print("ANALYSIS: Cod Cli vs MedSof Cli DIFFERENCES IN MESCLADAS (NUMERIC COMPARISON)")
print("="*80)

# Filter for rows where both Cod Cli and MedSof Cli are not null
mescladas_both_not_null = mescladas_df[
    (mescladas_df['Cod Cli'].notna()) & 
    (mescladas_df['MedSof Cli'].notna()) &
    (mescladas_df['Cod Cli'].astype(str).str.strip() != '') &
    (mescladas_df['MedSof Cli'].astype(str).str.strip() != '') &
    (mescladas_df['Cod Cli'].astype(str).str.strip() != 'nan') &
    (mescladas_df['MedSof Cli'].astype(str).str.strip() != 'nan')
].copy()

print(f"Total rows in Mescladas: {len(mescladas_df)}")
print(f"Rows where both Cod Cli and MedSof Cli are not null: {len(mescladas_both_not_null)}")

# Convert to numbers for comparison
def safe_numeric_convert(series):
    """Safely convert series to numeric, handling non-numeric values"""
    converted = []
    for val in series:
        try:
            # Remove any non-numeric characters and convert
            clean_val = str(val).strip().replace('.0', '')
            if clean_val and clean_val != 'nan' and clean_val != '':
                converted.append(float(clean_val))
            else:
                converted.append(None)
        except (ValueError, TypeError):
            converted.append(None)
    return converted

# Convert both columns to numeric
mescladas_both_not_null['Cod Cli_num'] = safe_numeric_convert(mescladas_both_not_null['Cod Cli'])
mescladas_both_not_null['MedSof Cli_num'] = safe_numeric_convert(mescladas_both_not_null['MedSof Cli'])

# Filter out rows where conversion failed
mescladas_numeric = mescladas_both_not_null[
    (mescladas_both_not_null['Cod Cli_num'].notna()) & 
    (mescladas_both_not_null['MedSof Cli_num'].notna())
].copy()

print(f"Rows where both can be converted to numbers: {len(mescladas_numeric)}")
print(f"Rows where conversion failed: {len(mescladas_both_not_null) - len(mescladas_numeric)}")

# Count where they are different (numeric comparison)
different_codes = mescladas_numeric[
    mescladas_numeric['Cod Cli_num'] != mescladas_numeric['MedSof Cli_num']
]

same_codes = mescladas_numeric[
    mescladas_numeric['Cod Cli_num'] == mescladas_numeric['MedSof Cli_num']
]

print(f"\nRows where Cod Cli == MedSof Cli (numeric): {len(same_codes)}")
print(f"Rows where Cod Cli != MedSof Cli (numeric): {len(different_codes)}")
print(f"Percentage different: {(len(different_codes) / len(mescladas_numeric)) * 100:.2f}%")

# Show some examples of different codes
if len(different_codes) > 0:
    print(f"\n" + "="*60)
    print("EXAMPLES OF DIFFERENT CODES (First 20) - NUMERIC COMPARISON")
    print("="*60)
    print(f"{'Cod Cli':<15} {'MedSof Cli':<15} {'Data':<12} {'Vlr Venda':<12}")
    print("-" * 60)
    
    sample_different = different_codes.head(20)
    for idx, row in sample_different.iterrows():
        cod_cli = f"{row['Cod Cli_num']:.0f}" if pd.notna(row['Cod Cli_num']) else 'N/A'
        medsoft_cli = f"{row['MedSof Cli_num']:.0f}" if pd.notna(row['MedSof Cli_num']) else 'N/A'
        data = str(row['Data'])[:10] if pd.notna(row['Data']) else 'N/A'
        vlr_venda = f"{row['Vlr Venda']:,.2f}" if pd.notna(row['Vlr Venda']) else 'N/A'
        print(f"{cod_cli:<15} {medsoft_cli:<15} {data:<12} {vlr_venda:<12}")

# Show some examples of same codes for comparison
if len(same_codes) > 0:
    print(f"\n" + "="*60)
    print("EXAMPLES OF SAME CODES (First 10) - NUMERIC COMPARISON")
    print("="*60)
    print(f"{'Cod Cli':<15} {'MedSof Cli':<15} {'Data':<12} {'Vlr Venda':<12}")
    print("-" * 60)
    
    sample_same = same_codes.head(10)
    for idx, row in sample_same.iterrows():
        cod_cli = f"{row['Cod Cli_num']:.0f}" if pd.notna(row['Cod Cli_num']) else 'N/A'
        medsoft_cli = f"{row['MedSof Cli_num']:.0f}" if pd.notna(row['MedSof Cli_num']) else 'N/A'
        data = str(row['Data'])[:10] if pd.notna(row['Data']) else 'N/A'
        vlr_venda = f"{row['Vlr Venda']:,.2f}" if pd.notna(row['Vlr Venda']) else 'N/A'
        print(f"{cod_cli:<15} {medsoft_cli:<15} {data:<12} {vlr_venda:<12}")

# Financial impact of different codes
if len(different_codes) > 0:
    print(f"\n" + "="*60)
    print("FINANCIAL IMPACT OF DIFFERENT CODES")
    print("="*60)
    
    total_venda_different = different_codes['Vlr Venda'].sum()
    total_venda_same = same_codes['Vlr Venda'].sum()
    
    print(f"Total Vlr Venda for different codes: R$ {total_venda_different:,.2f}")
    print(f"Total Vlr Venda for same codes: R$ {total_venda_same:,.2f}")
    print(f"Percentage of revenue from different codes: {(total_venda_different / (total_venda_different + total_venda_same)) * 100:.2f}%")

# Create summary DataFrame
summary_df = pd.DataFrame({
    'Category': ['Same Codes (Numeric)', 'Different Codes (Numeric)', 'Total Numeric', 'Conversion Failed'],
    'Count': [len(same_codes), len(different_codes), len(mescladas_numeric), len(mescladas_both_not_null) - len(mescladas_numeric)],
    'Percentage': [
        (len(same_codes) / len(mescladas_numeric)) * 100 if len(mescladas_numeric) > 0 else 0,
        (len(different_codes) / len(mescladas_numeric)) * 100 if len(mescladas_numeric) > 0 else 0,
        100.0,
        ((len(mescladas_both_not_null) - len(mescladas_numeric)) / len(mescladas_both_not_null)) * 100 if len(mescladas_both_not_null) > 0 else 0
    ]
})

print(f"\n" + "="*60)
print("SUMMARY TABLE")
print("="*60)
display(summary_df)


ANALYSIS: Cod Cli vs MedSof Cli DIFFERENCES IN MESCLADAS (NUMERIC COMPARISON)
Total rows in Mescladas: 597742
Rows where both Cod Cli and MedSof Cli are not null: 454197
Rows where both can be converted to numbers: 454064
Rows where conversion failed: 133

Rows where Cod Cli == MedSof Cli (numeric): 37566
Rows where Cod Cli != MedSof Cli (numeric): 416498
Percentage different: 91.73%

EXAMPLES OF DIFFERENT CODES (First 20) - NUMERIC COMPARISON
Cod Cli         MedSof Cli      Data         Vlr Venda   
------------------------------------------------------------
896724          889591          2025-09-01   1,100.00    
877711          833397          2025-09-01   160.00      
877711          833397          2025-09-01   160.00      
33146           35748           2025-09-01   125.00      
36022           38323           2025-09-01   131.00      
51369           53278           2025-09-01   131.00      
53576           55772           2025-09-01   131.00      
57825           60124      

Unnamed: 0,Category,Count,Percentage
0,Same Codes,37566,8.27086
1,Different Codes,416498,91.699857
2,Total,454197,100.0
