# Import the libraries that are necesaries

In [1]:
import pandas as pd
import getpass  # To get the password without showing the input
from sqlalchemy import create_engine
import yaml
import sys
import os

# Add the project root directory to the Python path
sys.path.append(os.path.abspath('../'))

# Load functions from ab_clean.py
from src.functions.ab_clean import *

In [2]:
# Load configuration from config.yaml
with open('../config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

## Get the necesary information to connect to mysql
 ** important note remmember the password that you use for mysql**

In [3]:
# Specify your MySQL database connection details
password = getpass.getpass()
user = 'root'
database_name = 'ab_test'
engine = connect_to_db(user, password, database_name)

### Read the CSVs

In [4]:
# Read CSVs using paths from config
data_demo = pd.read_csv(config['data_paths']['clean_demo'])
data_experiment_clients = pd.read_csv(config['data_paths']['clean_experiment'])
data_web_pt_1 = pd.read_csv(config['data_paths']['clean_web_data_1'])
data_web_pt_2 = pd.read_csv(config['data_paths']['clean_web_data_2'])

# Import the data of all datasets

### Insert the data to the final demo table

In [5]:
# Insert df_final_demo data
insert_to_table(data_demo, engine, 'df_final_demo')

### Put only the client_id that exist in the final demo

In [6]:
# Find client IDs in df_final_demo that are not missing in df_final_experiment_clients
non_missing_clients = data_experiment_clients[data_experiment_clients['client_id'].isin(data_demo['client_id'])]

# Insert non-missing clients into df_missing_clients table
insert_to_table(non_missing_clients, engine, 'df_final_experiment_clients')

### Merge and separate the data

In [7]:
# Merge df_final_web_data_pt_1 and df_final_web_data_pt_2
merged_data = pd.concat([data_web_pt_1, data_web_pt_2], ignore_index=True)

In [8]:
# Filter and insert to df_final_web_data table
unique_client_ids = data_demo['client_id'].unique()
filtered_data = filter_data(merged_data, unique_client_ids)
reset_index(filtered_data)
insert_to_table(filtered_data, engine, 'df_final_web_data')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.rename(columns={'index': 'id'}, inplace=True)


In [9]:
# Find and insert removed data to df_final_web_data_missing table
removed_data = merged_data[~merged_data['client_id'].isin(unique_client_ids)]
reset_index(removed_data)
insert_to_table(removed_data, engine, 'df_final_web_data_missing')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.rename(columns={'index': 'id'}, inplace=True)
