# Obtaining RFCs (RFC format validation and name cleaning)
The goal is to read a CSV file with 3 columns ("RFC", "RAZON" and "AÑO"), validate the RFC format, clean the "RAZON" column with regular expressions and save the cleaned text in the "NOMBRE" column, and add the "PERSONA" column indicating whether the RFC corresponds to a "FISICA" or "MORAL" person. Finally, an SLM (Small Language Model) is used to tokenize and lemmatize the text in the "NOMBRE" column, and the rows are grouped by the similarity of this text. The cleaned DataFrame is saved as "NuevosRFC.csv". This DataFrame can optionally be concatenated with another DataFrame.

**Remark:** This Jupyter Notebook can be run in Google-Colab.

In [79]:
%pip install thefuzz



In [80]:
# Import libraries
import pandas as pd
import re
from typing import Optional
import re
import spacy # Recommended for Spanish
from rapidfuzz import fuzz

## Read non-processed csv file

In [81]:
column_names = ['RFC','RAZON','AÑO']
year = 2025

# Sample data, one (RFC, RAZON) pair per row, so each RFC sits next to its name.
records = [
    ('AAA08091161A', 'APOYANDO A ANGELITOS CON AUTISMO, A. C.'),
    ('AAA1002249W5', 'LA PASADITA'),
    ('AAA090924HJ4', 'ARGUELLES, ALVAREZ & ASOCIADOS SA DE CV'),
    ('AVM090924HJ4', 'ADMINISTRATIVAS APLICACIONES AVM SC'),
    ('AAA1002249W6', 'ADAIR ALONSO ARQUITECTOS SA DE CV'),
    ('AAA1002249W7', 'Waltmart'),
    ('BBB08091161A', 'BBVA BANCOMER'),
    ('FEM1002249W5', 'COCA COLA FEMSA CV'),
    ('BBV08091161A', 'BANCOMER BBVA S.A.'),
    ('AAA1002249W8', 'Waltmart de Mexico'),
    ('AAA1002249W9', 'apoyando angelitos con autismo ac'),
    ('AAA1002249W1', 'MC SA DE CV'),
    ('AAA1002249W2', 'S.A. DE C.V. MC'),
    ('BBB08091161A', 'BBVA BANCOMER'),
    ('FEM1002249W5', 'FEMSA COCA-COLA'),
    ('AAA1002249W5', 'ABARROTES LA PASADITA'),
    ('AVM090924HJ4', 'AGROINDUSTRIAS APLICACIONES ADMINISTRATIVAS AVM SC'),
]

# Create DataFrame
df = pd.DataFrame(records, columns=['RFC', 'RAZON'])

# Add year (the same value for every row)
df['AÑO'] = year

df.head()

Unnamed: 0,RFC,RAZON,AÑO
0,AAA08091161A,"APOYANDO A ANGELITOS CON AUTISMO, A. C.",2025
1,AAA1002249W5,LA PASADITA,2025
2,AAA090924HJ4,"ARGUELLES, ALVAREZ & ASOCIADOS SA DE CV",2025
3,AVM090924HJ4,ADMINISTRATIVAS APLICACIONES AVM SC,2025
4,AAA1002249W6,ADAIR ALONSO ARQUITECTOS SA DE CV,2025


In [28]:
"""
csv_file = 'prueba.csv'
column_names = ['RFC','RAZON','AÑO']
df = pd.read_csv(csv_file, sep=',', header=None, names=column_names)
df.head(5)
"""

"\ncsv_file = 'prueba.csv'\ncolumn_names = ['RFC','RAZON']\ndf = pd.read_csv(csv_file, sep=',', header=None, names=column_names)\ndf.head(5)\n"

In [82]:
# Report how many rows were loaded, before any cleaning
initial_rows = len(df)
print(f"El archivo tiene {initial_rows} filas.")

# Discard every row that has a missing value in any column
df = df.dropna(how='any')

# Store all the working columns as text
df[column_names] = df[column_names].astype('str')

El archivo tiene 17 filas.


## Validate RFC

In [83]:
# Regular expressions
# Both RFC kinds end the same way: two digits, a month (01-12), a day (01-31)
# and three upper-case alphanumeric characters. Only the length of the leading
# letter prefix differs, so the common tail is written once.
_RFC_TAIL = (
    r"\d{2}(0[1-9]|1[0-2])"
    r"(0[1-9]|[12]\d|3[01])"
    r"[A-Z0-9]{3}$"
)

# Four leading characters (letters, Ñ or &)
RFC_FISICA_REGEX = re.compile(r"^[A-ZÑ&]{4}" + _RFC_TAIL)

# Three leading characters (letters, Ñ or &)
RFC_MORAL_REGEX = re.compile(r"^[A-ZÑ&]{3}" + _RFC_TAIL)

def normalize_rfc(rfc: str) -> str:
    """Trim and upper-case an RFC.

    Surrounding whitespace is removed first, then leading/trailing periods,
    then any whitespace exposed by removing the periods. The result is
    returned in upper case.
    """
    trimmed = rfc.strip()
    without_dots = trimmed.strip(".")
    return without_dots.strip().upper()

# Normalize RFC: clean every value first, then write the column back
rfc_normalized = df['RFC'].apply(normalize_rfc)
df['RFC'] = rfc_normalized

In [85]:
def is_persona_fisica(rfc: str) -> bool:
    """Return True when *rfc* has the "persona física" RFC format.

    ``fullmatch`` is used instead of ``match``: the pattern ends in ``$``,
    which also matches right before a trailing line break, so ``match`` would
    accept an otherwise valid RFC followed by a newline character.
    """
    return RFC_FISICA_REGEX.fullmatch(rfc) is not None


def is_persona_moral(rfc: str) -> bool:
    """Return True when *rfc* has the "persona moral" RFC format.

    ``fullmatch`` is used instead of ``match``: the pattern ends in ``$``,
    which also matches right before a trailing line break, so ``match`` would
    accept an otherwise valid RFC followed by a newline character.
    """
    return RFC_MORAL_REGEX.fullmatch(rfc) is not None


def get_rfc_type(rfc: str) -> Optional[str]:
    """Classify an RFC by its format.

    Returns "FISICA" or "MORAL" for the first format check that accepts
    *rfc* (the "FISICA" check runs first), or None when neither accepts it.
    """
    checks = (
        ("FISICA", is_persona_fisica),
        ("MORAL", is_persona_moral),
    )
    for label, has_format in checks:
        if has_format(rfc):
            return label
    return None

# Validate RFC: label each row as "FISICA", "MORAL" or None (invalid format)
df['PERSONA'] = df['RFC'].apply(get_rfc_type)

# Filter: keep only the rows whose RFC was recognized
has_valid_rfc = df['PERSONA'].isin(['FISICA', 'MORAL'])
df = df[has_valid_rfc]

df.head()

Unnamed: 0,RFC,RAZON,AÑO,PERSONA
0,AAA08091161A,"APOYANDO A ANGELITOS CON AUTISMO, A. C.",2025,MORAL
1,AAA1002249W5,LA PASADITA,2025,MORAL
2,AAA090924HJ4,"ARGUELLES, ALVAREZ & ASOCIADOS SA DE CV",2025,MORAL
3,AVM090924HJ4,ADMINISTRATIVAS APLICACIONES AVM SC,2025,MORAL
4,AAA1002249W6,ADAIR ALONSO ARQUITECTOS SA DE CV,2025,MORAL


This process is applied to "MORAL" persons only.

In [87]:
# Keep only "MORAL" persons. The explicit copy makes df an independent
# DataFrame, so the columns assigned to it afterwards are not written to a
# slice of the previous frame (which pandas reports as SettingWithCopyWarning).
df = df[df['PERSONA'] == 'MORAL'].copy()

final_rows = df.shape[0]
print(f"El archivo tiene {final_rows} RFCs válidos.")

El archivo tiene 17 RFCs válidos.


## Column "RAZON" preprocessing

In [88]:
def normalize_text(text: str, pattern: str, new_value: str) -> str:
    """Clean a name and apply one regex substitution to it.

    Steps, in order: convert *text* to ``str``; trim periods, commas,
    semicolons and spaces from both ends; delete every remaining comma,
    semicolon and period; upper-case; replace each match of *pattern* with
    *new_value*; turn slashes, backslashes and hyphens into spaces; collapse
    whitespace runs into single spaces and trim the result.
    """
    cleaned = str(text).strip(".,; ")
    # Delete the three punctuation characters in a single pass
    cleaned = cleaned.translate(str.maketrans("", "", ",;.")).upper()
    cleaned = re.sub(pattern, new_value, cleaned)
    cleaned = re.sub(r'[/\\-]', ' ', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned.strip()

# Patterns like S.A., C.V.
# Maps each regex to its replacement; the rules are applied in the order
# they are listed here.
norm_rules = {
    r'\bS\.?\s?A\.? ': "SA ",
    r'\bS\.?\s?A\.?\b': "SA",
    r'\bC\.?\s?V\.?\b': "CV",
    r'\bS\.?\s?C\.?\b': "SC",
    r'\bS\.?\s?A\.?P\.?\s?I\.?\s?\b': "SAPI ",
    r'\bA\.?\s?C\.? ': "AC ",
    r'\bA\.?\s?C\.?\b': "AC",
}


def _apply_norm_rules(value):
    """Run every rule of ``norm_rules`` over one value, in order."""
    for pattern, replacement in norm_rules.items():
        value = normalize_text(value, pattern=pattern, new_value=replacement)
    return value


# Normalize "RAZON": the cleaned text goes to "NOMBRE", "RAZON" is kept as is
df['NOMBRE'] = df[column_names[1]].apply(_apply_norm_rules)

df.sample(5)

Unnamed: 0,RFC,RAZON,AÑO,PERSONA,NOMBRE
12,AAA1002249W2,S.A. DE C.V. MC,2025,MORAL,SA DE CV MC
4,AAA1002249W6,ADAIR ALONSO ARQUITECTOS SA DE CV,2025,MORAL,ADAIR ALONSO ARQUITECTOS SA DE CV
15,AAA1002249W5,ABARROTES LA PASADITA,2025,MORAL,ABARROTES LA PASADITA
9,AAA1002249W8,Waltmart de Mexico,2025,MORAL,WALTMART DE MEXICO
11,AAA1002249W1,MC SA DE CV,2025,MORAL,MC SA DE CV


In [35]:
# Download the model
!python -m spacy download es_core_news_sm

Collecting es-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.8.0/es_core_news_sm-3.8.0-py3-none-any.whl (12.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m54.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [36]:
# Install the model
# !pip install "es_core_news_sm-3"

In [89]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _load_spanish_model():
    """Load the "es_core_news_sm" spaCy pipeline once and reuse it afterwards."""
    return spacy.load("es_core_news_sm")


def lemmatize_text(text: str) -> str:
    """Tokenize and lemmatize *text*.

    The text is lower-cased before it is processed; the lemmas of all tokens
    are joined with single spaces and returned in upper case.

    The pipeline comes from a cached loader, so it is loaded on the first
    call only instead of once per call (this function is applied to every
    row of a column).
    """
    nlp = _load_spanish_model()

    # Tokenization
    doc = nlp(text.lower())

    # Lemmatization
    return ' '.join(token.lemma_ for token in doc).upper()

# Lemmatize: compute the lemmatized names first, then store them
lemma_column = df['NOMBRE'].apply(lemmatize_text)
df['LEMMA_SPA'] = lemma_column

df[['NOMBRE', 'LEMMA_SPA']].head(5)

Unnamed: 0,NOMBRE,LEMMA_SPA
0,APOYANDO A ANGELITOS CON AUTISMO AC,APOYAR A ANGELITO CON AUTISMO AC
1,LA PASADITA,EL PASADITA
2,ARGUELLES ALVAREZ & ASOCIADOS SA DE CV,ARGUELL ALVAREZ & ASOCIADOS SA DE CV
3,ADMINISTRATIVAS APLICACIONES AVM SC,ADMINISTRATIVO APLICACIÓN AVM SC
4,ADAIR ALONSO ARQUITECTOS SA DE CV,ADAIR ALONSO ARQUITECTOS SA DE CV


## Group by word similarity with SLM (Small Language Model)

In [90]:
# Sort by 'RFC' (ascending) and renumber the rows from 0
df = df.sort_values("RFC")
df = df.reset_index(drop=True)

def group_by_rfc_and_similarity(df: pd.DataFrame, umbral=80) -> pd.DataFrame:
    """Reorder *df* so rows with the same RFC or a similar name are adjacent.

    Rows are visited from top to bottom. Each row that has no group yet
    starts a new group, and every later ungrouped row joins it when it has
    the same "RFC" or when ``fuzz.token_sort_ratio`` of the two "LEMMA_SPA"
    values is at least *umbral*. Later rows are compared only with the row
    that started the group, not with the rows that joined it.

    Args:
        df: DataFrame with "RFC" and "LEMMA_SPA" columns. It is not modified
            and may have any index.
        umbral: Minimum similarity score needed to join a group.

    Returns:
        A new DataFrame with the same columns, ordered by group, with a
        fresh 0..n-1 index. Rows keep their input order inside each group.
    """
    # Work on a renumbered copy so positions and index labels coincide,
    # whatever index the caller's frame has.
    df_temp = df.reset_index(drop=True)

    # Plain lists avoid a scalar DataFrame lookup on every comparison.
    rfcs = df_temp['RFC'].tolist()
    nombres = df_temp['LEMMA_SPA'].tolist()
    total = len(rfcs)

    grupos = [-1] * total  # -1 marks a row that has no group yet
    grupo_actual = 0

    for i in range(total):
        if grupos[i] != -1:
            continue

        grupos[i] = grupo_actual
        rfc_base = rfcs[i]
        nombre_base = nombres[i]

        # Compare with all following rows
        for j in range(i + 1, total):
            if grupos[j] != -1:
                continue
            # Same RFC groups directly; otherwise compare the names while
            # ignoring word order (token_sort_ratio)
            if (rfcs[j] == rfc_base
                    or fuzz.token_sort_ratio(nombre_base, nombres[j]) >= umbral):
                grupos[j] = grupo_actual

        grupo_actual += 1

    df_temp['group'] = grupos

    # Order by "group" and drop this column; the stable sort keeps the
    # original row order inside each group.
    return (
        df_temp.sort_values('group', kind='stable')
        .drop(columns=['group'])
        .reset_index(drop=True)
    )

In [93]:
# Group by RFC and text similarity
df_ordered = group_by_rfc_and_similarity(df, umbral=75)

# Reorder columns
output_columns = ['RFC','NOMBRE','PERSONA','AÑO','LEMMA_SPA','RAZON']
df1 = df_ordered[output_columns].copy()

# Save DataFrame (UTF-8, without the index column)
file_name = "NuevosRFC"
# file_name = f"NuevosRFC_{csv_file.split('.')[0]}"
df1.to_csv(file_name + '.csv', encoding='utf-8', index=False)

### Concat 2 DataFrames (optional)

For simplicity, the same DataFrame will be concatenated with itself.

In [94]:
# Second DataFrame: same rows as df1, dated one year later
next_year = year + 1
df2 = df1.copy()
df2['AÑO'] = next_year
df2

Unnamed: 0,RFC,NOMBRE,PERSONA,AÑO,LEMMA_SPA,RAZON
0,AAA08091161A,APOYANDO A ANGELITOS CON AUTISMO AC,MORAL,2026,APOYAR A ANGELITO CON AUTISMO AC,"APOYANDO A ANGELITOS CON AUTISMO, A. C."
1,AAA1002249W9,APOYANDO ANGELITOS CON AUTISMO AC,MORAL,2026,APOYAR ANGELITO CON AUTISMO AC,apoyando angelitos con autismo ac
2,AAA090924HJ4,ARGUELLES ALVAREZ & ASOCIADOS SA DE CV,MORAL,2026,ARGUELL ALVAREZ & ASOCIADOS SA DE CV,"ARGUELLES, ALVAREZ & ASOCIADOS SA DE CV"
3,AAA1002249W1,MC SA DE CV,MORAL,2026,MC SA DE CV,MC SA DE CV
4,AAA1002249W2,SA DE CV MC,MORAL,2026,SA DE CV MC,S.A. DE C.V. MC
5,AAA1002249W5,LA PASADITA,MORAL,2026,EL PASADITA,LA PASADITA
6,AAA1002249W5,ABARROTES LA PASADITA,MORAL,2026,ABARROT EL PASADITA,ABARROTES LA PASADITA
7,AAA1002249W6,ADAIR ALONSO ARQUITECTOS SA DE CV,MORAL,2026,ADAIR ALONSO ARQUITECTOS SA DE CV,ADAIR ALONSO ARQUITECTOS SA DE CV
8,AAA1002249W7,WALTMART,MORAL,2026,WALTMART,Waltmart
9,AAA1002249W8,WALTMART DE MEXICO,MORAL,2026,WALTMART DE MEXICO,Waltmart de Mexico


In [95]:
# Concat DataFrames: the rows of df2 are stacked below the rows of df1
frames = [df1, df2]
df_final = pd.concat(frames)
df_final

Unnamed: 0,RFC,NOMBRE,PERSONA,AÑO,LEMMA_SPA,RAZON
0,AAA08091161A,APOYANDO A ANGELITOS CON AUTISMO AC,MORAL,2025,APOYAR A ANGELITO CON AUTISMO AC,"APOYANDO A ANGELITOS CON AUTISMO, A. C."
1,AAA1002249W9,APOYANDO ANGELITOS CON AUTISMO AC,MORAL,2025,APOYAR ANGELITO CON AUTISMO AC,apoyando angelitos con autismo ac
2,AAA090924HJ4,ARGUELLES ALVAREZ & ASOCIADOS SA DE CV,MORAL,2025,ARGUELL ALVAREZ & ASOCIADOS SA DE CV,"ARGUELLES, ALVAREZ & ASOCIADOS SA DE CV"
3,AAA1002249W1,MC SA DE CV,MORAL,2025,MC SA DE CV,MC SA DE CV
4,AAA1002249W2,SA DE CV MC,MORAL,2025,SA DE CV MC,S.A. DE C.V. MC
5,AAA1002249W5,LA PASADITA,MORAL,2025,EL PASADITA,LA PASADITA
6,AAA1002249W5,ABARROTES LA PASADITA,MORAL,2025,ABARROT EL PASADITA,ABARROTES LA PASADITA
7,AAA1002249W6,ADAIR ALONSO ARQUITECTOS SA DE CV,MORAL,2025,ADAIR ALONSO ARQUITECTOS SA DE CV,ADAIR ALONSO ARQUITECTOS SA DE CV
8,AAA1002249W7,WALTMART,MORAL,2025,WALTMART,Waltmart
9,AAA1002249W8,WALTMART DE MEXICO,MORAL,2025,WALTMART DE MEXICO,Waltmart de Mexico


In [98]:
# Sort by 'RFC' and renumber the rows, then regroup the combined data
df_final = df_final.sort_values("RFC")
df_final = df_final.reset_index(drop=True)
df_ordered = group_by_rfc_and_similarity(df_final, umbral=75)

final_columns = ['RFC','NOMBRE','PERSONA','AÑO','LEMMA_SPA','RAZON']
df_final = df_ordered[final_columns].copy()
print(df_final[['RFC','NOMBRE']])

             RFC                                             NOMBRE
0   AAA08091161A                APOYANDO A ANGELITOS CON AUTISMO AC
1   AAA08091161A                APOYANDO A ANGELITOS CON AUTISMO AC
2   AAA1002249W9                  APOYANDO ANGELITOS CON AUTISMO AC
3   AAA1002249W9                  APOYANDO ANGELITOS CON AUTISMO AC
4   AAA090924HJ4             ARGUELLES ALVAREZ & ASOCIADOS SA DE CV
5   AAA090924HJ4             ARGUELLES ALVAREZ & ASOCIADOS SA DE CV
6   AAA1002249W1                                        MC SA DE CV
7   AAA1002249W1                                        MC SA DE CV
8   AAA1002249W2                                        SA DE CV MC
9   AAA1002249W2                                        SA DE CV MC
10  AAA1002249W5                                        LA PASADITA
11  AAA1002249W5                              ABARROTES LA PASADITA
12  AAA1002249W5                                        LA PASADITA
13  AAA1002249W5                              AB

In [99]:
# Save DataFrame (UTF-8, without the index column)
file_name = "NuevosRFC_final"
# file_name = f"NuevosRFC_{csv_file.split('.')[0]}"
df_final.to_csv(file_name + '.csv', encoding='utf-8', index=False)

## Appendix: Spacy Models (Recommended for Spanish)
Download the corresponding model from "https://github.com/explosion/spacy-models/releases/tag/es_core_news_sm-3.8.0" ("es_core_news_sm-3.8.0-py3-none-any.whl" file) manually and paste it in this project. After that, run:
```
%pip install "es_core_news_sm-3.8.0-py3-none-any.whl"
```
Or run:
```
python -m spacy download es_core_news_sm
```
Notice that the model name is different. If you are using "es_core_news_sm-3.8.0-py3-none-any.whl", run:


In [42]:
# %pip install "es_core_news_sm-3.8.0-py3-none-any.whl"

In [43]:
import spacy # Recommended for Spanish
from thefuzz import fuzz

def calcular_similitud(cadena1, cadena2):
    """
    Compare two strings and return their similarity score.

    The score comes from fuzz.token_sort_ratio. If either argument is not a
    string, 0 is returned and no comparison is made.
    """
    # Only real strings are compared
    both_are_text = isinstance(cadena1, str) and isinstance(cadena2, str)
    if not both_are_text:
        return 0

    # Fuzzy comparison
    return fuzz.token_sort_ratio(cadena1, cadena2)

In [44]:
nlp = spacy.load("es_core_news_sm")

text1 = 'ABARROTES LA PASADITA'
text2 = 'LA PASADITA'

# Process text (lower-cased before it goes through the pipeline)
doc1, doc2 = nlp(text1.lower()), nlp(text2.lower())

# Tokenization and lemmatization
lemmas1 = [token.lemma_ for token in doc1]
lemmas2 = [token.lemma_ for token in doc2]

limpio1 = ' '.join(lemmas1)
limpio2 = ' '.join(lemmas2)
print(limpio1)
print(limpio2)

# Similarity of the two lemmatized names
score = calcular_similitud(limpio1, limpio2)
print(f"La calificación de similitud es: {score}")

abarrot el pasadita
el pasadita
La calificación de similitud es: 73
