## Imports, input directory, and class definition

In [24]:
import pandas as pd
import os
from unidecode import unidecode
import re
import chardet
from datetime import datetime

# Directory that will be searched for the input CSV: the current working
# directory of the process at the time this line runs.
file_path = os.getcwd()

class repair_emails:
    """Clean the ``email`` column of a CSV export and split valid/invalid rows.

    Intended call order:
    ``get_file_path`` -> ``load_csv`` -> ``split_valid_emails`` -> ``save_file``.

    Attributes:
        file: Path of the CSV selected by ``get_file_path``; ``None`` until a
            file has been found.
        dd: DataFrame loaded by ``load_csv``; ``None`` until then.
        valid_emails: Rows of ``dd`` whose email matches the validation
            pattern; ``None`` until ``split_valid_emails`` has run.
        invalid_emails: The remaining rows of ``dd``; ``None`` until
            ``split_valid_emails`` has run.
    """

    def __init__(self):
        self.dd = None
        self.file = None
        self.valid_emails = None
        self.invalid_emails = None

    def get_file_path(self, vFile_path):
        """Select the first CSV file found in a directory.

        The result is stored in ``self.file``. When the directory is missing
        or contains no ``.csv`` file, a message is printed and ``self.file``
        is left as ``None`` instead of failing while building the report line.

        Args:
            vFile_path (str): Directory to search for CSV files.

        Returns:
            None. The selected path is available as ``self.file``.
        """
        # Forget any earlier selection so a failed lookup can never be
        # reported (or later loaded) as if it were the current result.
        self.file = None

        # isdir rather than exists: a path pointing at a regular file would
        # pass an existence check and then make os.listdir raise.
        if not os.path.isdir(vFile_path):
            print('Path does not exist.')
            return

        # "First" follows os.listdir order, which is not guaranteed to be
        # sorted.
        csv_files = [x for x in os.listdir(vFile_path) if x.endswith('.csv')]
        if not csv_files:
            print('There are no CSV files in the directory.')
            return

        self.file = os.path.join(vFile_path, csv_files[0])
        print('--- Identified file:\n' + self.file)

    def load_csv(self):
        """Load ``self.file`` and normalise its ``email`` column.

        The file is read as a ``;``-separated, latin-1 encoded CSV with every
        column kept as ``object``. The ``email`` column then has spaces
        removed and two known defects repaired (see inline comments).

        Returns:
            None. The DataFrame is available as ``self.dd``.

        Raises:
            ValueError: If no file has been selected yet.
            KeyError: If the file has no ``email`` column.
        """
        if self.file is None:
            raise ValueError('No file selected; call get_file_path() first.')

        self.dd = pd.read_csv(self.file, sep=';', encoding='latin-1', dtype=object)

        # regex=False: all three patterns are literal text. Being explicit
        # keeps the result independent of the pandas version, whose default
        # for ``regex`` changed over time.
        emails = self.dd['email']
        emails = emails.str.replace(' ', '', regex=False)
        # 'ï¿½' is how the UTF-8 bytes of U+FFFD (the replacement character)
        # appear when decoded as latin-1. Mapping it to 'ç' is the original
        # author's choice -- presumably the character most often lost in
        # this data; TODO confirm.
        emails = emails.str.replace('ï¿½', 'ç', regex=False)
        # Restore the dot dropped from the gmail domain.
        emails = emails.str.replace('@gmailcom', '@gmail.com', regex=False)
        self.dd['email'] = emails

        print('Dataframe loaded in: ' + self.file)

    def split_valid_emails(self, email_column):
        """Split ``self.dd`` into rows with valid and invalid email addresses.

        An address is valid when the whole value matches
        ``^[\\w\\.-]+@[\\w\\.-]+\\.\\w+$``. Missing values count as invalid.

        Args:
            email_column (str): Name of the column holding the addresses.

        Returns:
            None. Results are stored in ``self.valid_emails`` and
            ``self.invalid_emails``.

        Raises:
            ValueError: If no DataFrame has been loaded yet.
        """
        if self.dd is None:
            raise ValueError('No dataframe loaded; call load_csv() first.')

        # Regular expression pattern to validate email addresses
        email_pattern = r'^[\w\.-]+@[\w\.-]+\.\w+$'

        # Evaluate the pattern once and reuse the mask for both halves.
        is_valid = self.dd[email_column].str.match(email_pattern, na=False)
        self.valid_emails = self.dd[is_valid]
        self.invalid_emails = self.dd[~is_valid]

        print('--- Generated two dataframes.')

    def save_file(self, output_dir=None):
        """Write the valid and invalid rows to two dated CSV files.

        File names are ``<YYYYMMDD>_Base_Newsletter_Estabs.csv`` (valid rows)
        and ``<YYYYMMDD>_invalid_emails.csv`` (invalid rows), using today's
        local date. Both are written with ``;`` as separator and no index.

        Args:
            output_dir (str, optional): Directory to write into; created if
                missing. Defaults to the current working directory.

        Returns:
            None.

        Raises:
            ValueError: If ``split_valid_emails`` has not been run yet.
        """
        if self.valid_emails is None or self.invalid_emails is None:
            raise ValueError('Nothing to save; call split_valid_emails() first.')

        # file name
        formatted_datetime = datetime.now().strftime("%Y%m%d")
        file_name = f"{formatted_datetime}_Base_Newsletter_Estabs.csv"
        invalid_file_name = f"{formatted_datetime}_invalid_emails.csv"

        if output_dir is not None:
            os.makedirs(output_dir, exist_ok=True)
            file_name = os.path.join(output_dir, file_name)
            invalid_file_name = os.path.join(output_dir, invalid_file_name)

        # save files
        self.valid_emails.to_csv(file_name, sep=';', index=False)
        self.invalid_emails.to_csv(invalid_file_name, sep=';', index=False)

        # NOTE: an extra copy of the valid rows used to be written to a
        # SharePoint-synced folder; pass that folder as ``output_dir`` to get
        # the files there instead of hard-coding a user-specific path.

        print('--- Saved!')

## Run the pipeline

In [25]:
# Run the four steps in order; each one relies on state stored on the
# instance by the step before it.
RunRepair_emails = repair_emails()
# Look for a CSV file in the directory held in file_path.
RunRepair_emails.get_file_path(file_path)
# Read the selected file and clean up its 'email' column.
RunRepair_emails.load_csv()
# Separate rows by whether the 'email' value matches the validation pattern.
RunRepair_emails.split_valid_emails('email')
# Write the valid and invalid rows to two dated CSV files.
RunRepair_emails.save_file()

--- Identified file:
C:\Users\patricia.dasilva.ext\SODEXO\Data Operations - General\Chamados\Recorrências\Mailling Recorrente (Mensal) - Base Newsletter  Estabelecimentos\20231003_Base_Newsletter_Estabs.csv
Dataframe loaded in: C:\Users\patricia.dasilva.ext\SODEXO\Data Operations - General\Chamados\Recorrências\Mailling Recorrente (Mensal) - Base Newsletter  Estabelecimentos\20231003_Base_Newsletter_Estabs.csv
--- Generated two dataframes.
--- Saved!
