In [26]:
# import all the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torchvision
import torchvision.transforms as transforms
import torchvision.models as models
import logging

In [27]:
def clean_data(df):
    """Clean a merged message DataFrame and return the cleaned result.

    Steps, in order:
      1. Drop exact duplicate rows.
      2. Replace every missing value with the string 'Unknown'.
      3. Re-format the ``date`` column as 'YYYY-MM-DD' strings. Values that
         cannot be parsed as dates (including the 'Unknown' placeholder
         written by step 2) end up missing again.
      4. Coerce ``message_id`` to a numeric dtype and drop the rows where
         that fails (including rows whose id was filled with 'Unknown').

    The caller's frame is never modified: every step produces a new frame,
    so calling this function repeatedly on the same input is safe.

    Args:
        df: pandas DataFrame that has ``date`` and ``message_id`` columns.

    Returns:
        The cleaned DataFrame, or None if any step raised. The failure is
        logged (with traceback) rather than propagated.
    """
    logging.info("Starting data cleaning process")

    try:
        # Remove duplicates (returns a new frame; the input stays untouched)
        initial_rows = len(df)
        cleaned = df.drop_duplicates()
        logging.info(f"Removed {initial_rows - len(cleaned)} duplicate rows")

        # Handle missing values
        cleaned = cleaned.fillna('Unknown')
        logging.info("Filled missing values with 'Unknown'")

        # Standardize formats (example: date).
        # errors='coerce' turns unparseable entries into NaT, which strftime
        # then renders as a missing value instead of raising.
        cleaned['date'] = (
            pd.to_datetime(cleaned['date'], errors='coerce').dt.strftime('%Y-%m-%d')
        )
        logging.info("Standardized date format")

        # Data validation (example: ensure message_id is numeric).
        # Non-numeric ids become NaN and their rows are discarded.
        cleaned['message_id'] = pd.to_numeric(cleaned['message_id'], errors='coerce')
        cleaned = cleaned.dropna(subset=['message_id'])
        logging.info("Validated message_id column")

        return cleaned
    except Exception as e:
        # Best-effort contract: report the failure and signal it with None.
        logging.exception(f"Error during data cleaning: {str(e)}")
        return None


In [30]:
def main(data_dir='../../data'):
    """Load the four source CSV files, merge them, and clean the result.

    The function reads each source file from ``data_dir``, logs and prints
    the first rows of each, concatenates them into one frame, writes that
    frame to ``merged_data.csv`` in the same directory, reloads it, and
    passes it to ``clean_data``. The head of the cleaned frame is printed.

    Args:
        data_dir: Directory that holds the source CSV files and receives
            ``merged_data.csv``. Defaults to the original relative path.

    Returns:
        None. If ``clean_data`` reports a failure (returns None), an error
        is logged and nothing further is printed.
    """
    source_files = (
        'DoctorsET.csv',
        'EAHCI.csv',
        'lobelia4cosmetics.csv',
        'yetenaweg.csv',
    )

    # Read every source first so a missing file fails before any output.
    frames = [pd.read_csv(f'{data_dir}/{file_name}') for file_name in source_files]

    # Numbering starts at 1 to keep the original "df1".."df4" log labels.
    for position, frame in enumerate(frames, start=1):
        logging.info(f"Head of df{position}: {frame.head()}")
        print(frame.head())

    # merge the dataframes
    merged = pd.concat(frames, ignore_index=True)

    # export the merged dataframe to the dbt project
    merged_path = f'{data_dir}/merged_data.csv'
    merged.to_csv(merged_path, index=False)

    # NOTE(review): the frame is reloaded from the file just written,
    # presumably so cleaning sees the on-disk representation; confirm this
    # round trip is intended.
    merged = pd.read_csv(merged_path)

    # clean the data
    cleaned = clean_data(merged)
    if cleaned is None:
        # clean_data returns None on failure; calling .head() on it would
        # raise an unrelated AttributeError.
        logging.error("Data cleaning failed; no cleaned data to display")
        return
    print(cleaned.head())

# Run the pipeline only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

   message_id                       date      sender_id  \
0         864  2023-12-18 17:04:02+00:00 -1001102021238   
1         863  2023-11-03 16:14:39+00:00 -1001102021238   
2         862  2023-10-02 16:37:39+00:00 -1001102021238   
3         861  2023-09-16 07:54:32+00:00 -1001102021238   
4         860  2023-09-01 16:16:15+00:00 -1001102021238   

                                             message  
0  https://youtu.be/5DBoEm-8kmA?si=LDLuEecNfULJVD...  
1  ·ã∂·ä≠·â∞·à≠·àµ ·ä¢·âµ·ãÆ·åµ·ã´ ·â† ·ä†·ã≤·àµ ·ä†·âÄ·à´·à®·â• ·â† ·â¥·àå·â™·ã•·äï ·çï·àÆ·åç·à´·àô·äï ·àà·àò·åÄ·àò·à≠ ·ä®...  
2  ·àû·âµ ·â†·àµ·ä≥·à≠ \n\n·àà·àç·åÜ·âª·âΩ·äï ·ã®·àù·äì·à≤·ãò·ãç ·àù·à≥·âÉ ·à≥·äì·âÄ·ãç ·ä•·ãµ·àö·ã´·â∏·ãç·äï ·ã≠·âÄ·äï...  
3  ·ä® HIV ·ã®·â∞·çà·ãà·à∞ ·à∞·ãç ·ä†·åã·å•·àü·âΩ·àÅ ·ã´·âÉ·àç ? ·çà·ãç·àµ ·ä•·äì ·àÖ·ä≠·àù·äì ?\n\n·àô...  
4  ·â†·âÖ·à≠·â• ·åä·ãú ·â†·àÉ·åà·à´·âΩ·äï ·àã·ã≠ ·ä•·ã®·â∞·àµ·â∞·ãã·àà ·ã´·àà ·ã®·â∞·àò·à≥·à≥·ã≠ ·çÜ·â≥ ( Homos...  
   message_id                       date      sender_id