# Email Data Pipeline

This notebook extracts data from qardio google sheets, cleans the data, and loads it into the Qardio SQL database

In [1]:
from google_apis import gsheets_functions as gs
import pandas as pd
import sqlalchemy

database = 'qardio'
host = '127.0.0.1'
user = 'root'
password = 'Party100'
url = f'mysql+mysqlconnector://{user}:{password}@{host}/{database}'
engine = sqlalchemy.create_engine(url, echo=True)
conn = engine.connect()

2023-06-22 08:23:52,454 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2023-06-22 08:23:52,455 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-06-22 08:23:52,458 INFO sqlalchemy.engine.Engine SELECT @@sql_mode
2023-06-22 08:23:52,459 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-06-22 08:23:52,461 INFO sqlalchemy.engine.Engine SELECT @@lower_case_table_names
2023-06-22 08:23:52,462 INFO sqlalchemy.engine.Engine [raw sql] {}


In [2]:
qardio_data = gs.gspread_read(wb='1dKiIp2ETyzfKHzR9fMeHrgf2N_UKcSa5uPWathJzgIw',
                              ws='22_+_23')

Clean column headers

In [3]:
qardio_data.columns = qardio_data.columns.str.strip()
qardio_data.columns = qardio_data.columns.str.lower()
qardio_data.columns = qardio_data.columns.str.replace(' ', '_')
qardio_data.columns = qardio_data.columns.str.replace('.', '')

Solve dtypes

In [4]:
qardio_data.columns

Index(['campaign_name', 'year', 'date_sent', 'day', 'time_pst', 'subject_line',
       'subscribers', 'country', 'recipients', 'opens', 'open_rate', 'clicks',
       'click_rate', 'true_click_rate', 'click_per_unique_opens', 'unsubs',
       '%_unsubs', '%_unsubs_openers', 'bounced', 'sessions',
       'sess_per_unique_opens', 'ecr', 'total_trans', 'revenue', 'aov',
       'qardioarm', 'qardiobase_2', 'qardiobase_x', 'qardiotemp', 'qardiospo2',
       'qa_case', 'qa+qbx_bundle', 'qa+qb_bundle', 'qa+spo2', 'qtemp+spo2',
       'ultimate_bundle', 'qardio_core', 'total_quantity'],
      dtype='object')

In [5]:
qardio_data = qardio_data.replace('#DIV/0!', 0)
qardio_data = qardio_data.replace('%', '')
qardio_data.date_sent = pd.to_datetime(qardio_data.date_sent)
# qardio_data.open_rate = qardio_data.open_rate.astype('float')
qardio_data.clicks = pd.to_numeric(qardio_data.clicks, errors='coerce', downcast='integer')

qardio_data.dtypes

campaign_name                     object
year                               int64
date_sent                 datetime64[ns]
day                               object
time_pst                          object
subject_line                      object
subscribers                       object
country                           object
recipients                         int64
opens                              int64
open_rate                         object
clicks                           float64
click_rate                        object
true_click_rate                   object
click_per_unique_opens            object
unsubs                             int64
%_unsubs                          object
%_unsubs_openers                  object
bounced                            int64
sessions                          object
sess_per_unique_opens             object
ecr                               object
total_trans                       object
revenue                           object
aov             

In [6]:
qardio_data.click_rate.unique()

array(['0.29%', '0.23%', '0.11%', '0.15%', '0.42%', '0.14%', '0.28%',
       '1.36%', '1.15%', '1.42%', '0.20%', '0.45%', '0.26%', '0.47%',
       '1.38%', '0.94%', '0.63%', '0.31%', '0.24%', '1.06%', '0.40%',
       '2.45%', '1.85%', '1.81%', '0.52%', '1.24%', '0.56%', '2.76%',
       '0.50%', '0.22%', '0.16%', '1.19%', '0.77%', '1.82%', '0.25%',
       '0.57%', '1.22%', '0.82%', '0.44%', '0.18%', '0.17%', '0.53%',
       '1.88%', '1.13%', '1.55%', '0.73%', '0.69%', '1.66%', '0.43%',
       '0.34%', '0.21%', '0.19%', '0.85%', '0.30%', '1.31%', '0.93%',
       '1.37%', '0.37%', '1.26%', '0.61%', '0.36%', '0.12%', '0.55%',
       '1.03%', '1.28%', '0.49%', '0.33%', '0.39%', '0.41%', '0.13%',
       '1.08%', '0.35%', '1.00%', '0.27%', '1.12%', '2.34%', '1.60%',
       '2.13%', '1.21%', '2.55%', '0.75%', '1.25%', '1.30%', '3.49%',
       '3.05%', '0.65%', '1.65%', '1.86%', '0.87%', '1.76%', '1.11%',
       '2.16%', '2.26%', '2.21%', '0.32%', '0.66%', '0.62%', '0.98%',
       '1.44%', '1.8

## Save data to SQL database

In [7]:
qardio_data.to_sql(name='email_data',
          con=conn,
          if_exists='replace')

2023-06-22 08:23:55,361 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2023-06-22 08:23:55,373 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %(table_schema)s AND table_name = %(table_name)s
2023-06-22 08:23:55,375 INFO sqlalchemy.engine.Engine [generated in 0.00304s] {'table_schema': 'qardio', 'table_name': 'email_data'}
2023-06-22 08:23:55,379 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %(table_schema)s AND table_name = %(table_name)s
2023-06-22 08:23:55,380 INFO sqlalchemy.engine.Engine [cached since 0.007673s ago] {'table_schema': 'qardio', 'table_name': 'email_data'}
2023-06-22 08:23:55,383 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `qardio`
2023-06-22 08:23:55,383 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-06-22 08:23:55,402 INFO sqlalchemy.engine.Engine SHOW CREATE TABLE `email_data`
2023-06-22 08:23:55,403 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-06-22 0