In [28]:
import pandas as pd
from sqlalchemy import create_engine
from configparser import ConfigParser

# Load DB creds from .ini
def get_sqlalchemy_engine(config_file, db_name):
    config = ConfigParser()
    config.read(config_file)
    db = config['mysql']
    return create_engine(
        f"mysql+mysqlconnector://{db['user']}:{db['password']}@{db['host']}/{db_name}"
    )

source_engine = get_sqlalchemy_engine('querycrew.ini', 'querycrew_db')
target_engine = get_sqlalchemy_engine('querycrew.ini', 'querycrew_wh')

In [29]:
def clean_generic(df):
    df = df.drop_duplicates()
    df.columns = df.columns.str.strip()
    for col in df.select_dtypes(include='object').columns:
        df[col] = df[col].astype(str).str.strip().str.title()
        df[col] = df[col].replace(['None', 'Nan'], pd.NA)
    return df.dropna()

def clean_customers(df):
    df = df.drop_duplicates()
    df['first_name'] = df['first_name'].str.strip().str.title()
    df['last_name'] = df['last_name'].str.strip().str.title()
    df['email'] = df['email'].str.lower().replace(['none', 'nan'], pd.NA)
    df['phone'] = df['phone'].replace(['None', 'none', 'nan'], pd.NA)
    df['address'] = df['address'].str.title()
    return df.dropna(subset=['email', 'phone'])

def clean_dealer_sales_summary(df):
    # Standardize dealer_name
    df['dealer_name'] = df['dealer_name'].astype(str).str.strip().str.title()

    # Drop missing dealer_name rows
    df = df[df['dealer_name'].notna()]
    df = df[df['dealer_name'].str.lower() != 'none']

    # Group by dealer_name and sum numeric fields
    numeric_cols = df.select_dtypes(include='number').columns
    df_clean = df.groupby('dealer_name', as_index=False)[numeric_cols].sum()

    return df_clean

In [30]:
def etl_table(table_name):
    print(f"\nProcessing: {table_name}")

    df = pd.read_sql(f"SELECT * FROM {table_name}", source_engine)

    if table_name == 'Customers':
        df_clean = clean_customers(df)
    elif table_name == 'dealer_sales_summary':
        df_clean = clean_dealer_sales_summary(df)
    else:
        df_clean = clean_generic(df)

    # Fast clean load to target WH
    df_clean.to_sql(name=table_name, con=target_engine, if_exists='replace', index=False, method='multi')

    print(f"{table_name} cleaned and loaded into querycrew_wh")

In [31]:
tables = ['Car', 'car_sales', 'Customers', 'Date', 'dealer_sales_summary', 'EV_arrivals']
for tbl in tables:
    etl_table(tbl)


Processing: Car


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(str).str.strip().str.title()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].replace(['None', 'Nan'], pd.NA)


Car cleaned and loaded into querycrew_wh

Processing: car_sales


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(str).str.strip().str.title()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].replace(['None', 'Nan'], pd.NA)


car_sales cleaned and loaded into querycrew_wh

Processing: Customers


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['first_name'] = df['first_name'].str.strip().str.title()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['last_name'] = df['last_name'].str.strip().str.title()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['email'] = df['email'].str.lower().replace(['none', 'nan'], pd.NA)
A value is trying

Customers cleaned and loaded into querycrew_wh

Processing: Date


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(str).str.strip().str.title()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].replace(['None', 'Nan'], pd.NA)


Date cleaned and loaded into querycrew_wh

Processing: dealer_sales_summary
dealer_sales_summary cleaned and loaded into querycrew_wh

Processing: EV_arrivals


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(str).str.strip().str.title()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].replace(['None', 'Nan'], pd.NA)


EV_arrivals cleaned and loaded into querycrew_wh
