# Iterative updates

In [1]:
import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.exc import SQLAlchemyError

# Database connection parameters for PostgreSQL
DB_USER = 'postgres'
DB_PASSWORD = 'password'
DB_HOST = 'db'
DB_PORT = '5432'
DB_NAME = 'mydatabase'

# Connect to PostgreSQL
engine = create_engine(f'postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}')

# Function to check if the table exists in the database
def check_table_exists(engine, table_name):
    query = text(f"""
    SELECT EXISTS (
        SELECT FROM information_schema.tables 
        WHERE table_name = '{table_name}'
    );
    """)
    with engine.connect() as conn:
        result = conn.execute(query).scalar()
    return result

# Function to load CSV data
def load_csv_data(csv_file_path):
    new_data = pd.read_csv(csv_file_path)

    # Handling specific cases (rename column as needed for the employee dataset)
    if csv_file_path == 'data/employees.csv':
        new_data.rename(columns={'employe_id': 'employee_id'}, inplace=True)

    return new_data

# Main function for incremental update
def incremental_update(engine, csv_file_path, table_name, update_keys):
    # Ensure update_keys is a list
    if not isinstance(update_keys, list):
        update_keys = [update_keys]

    # Load the new data from the CSV file
    new_data = load_csv_data(csv_file_path)

    # First time: create the table and insert all data if the table doesn't exist
    if not check_table_exists(engine, table_name):
        print(f"Table '{table_name}' does not exist. Creating it now and uploading the data...")

        # Create the table with the schema based on the CSV data
        new_data.to_sql(table_name, con=engine, if_exists='replace', index=False)
        print(f"Table '{table_name}' created and data uploaded.")

        # Since this is the first run, the total incremental updates are all the rows in the new data
        total_updates = len(new_data)
        print(f"Total incremental updates (first run): {total_updates} records inserted.")
    else:
        print(f"Table '{table_name}' exists. Performing incremental update...")

        # Load the existing data from the database
        existing_data = pd.read_sql_table(table_name, con=engine)

        # Ensure all update_keys exist in both datasets
        for key in update_keys:
            if key not in new_data.columns or key not in existing_data.columns:
                raise KeyError(f"Both the new data and existing data must have the '{key}' column for the incremental update.")

        # Merge the existing data with the new data based on the update_keys
        # We update rows with the same update_keys and append new rows that don't exist
        merged_data = pd.concat([existing_data, new_data]).drop_duplicates(subset=update_keys, keep='last')
        # Count the number of new or updated records
        updated_or_new_data = merged_data[~merged_data[update_keys].apply(tuple, 1).isin(existing_data[update_keys].apply(tuple, 1))]
        total_updates = len(updated_or_new_data)

        # Write the merged DataFrame back to the database (replace existing data)
        updated_or_new_data.to_sql(table_name, con=engine, if_exists='append', index=False)

        print(f"Incremental update completed successfully.")
        print(f"Total incremental updates: {total_updates} records inserted or updated.")


csv_files = [
    {'path': 'data/employees.csv', 'table': 'employees_python', 'keys': ['employee_id','branch_id']},
    {'path': 'data/timesheets.csv', 'table': 'timesheets_python', 'keys': ['timesheet_id']}
]

for file in csv_files:
    incremental_update(engine, file['path'], file['table'], file['keys'])

Table 'employees_python' does not exist. Creating it now and uploading the data...
Table 'employees_python' created and data uploaded.
Total incremental updates (first run): 177 records inserted.
Table 'timesheets_python' does not exist. Creating it now and uploading the data...
Table 'timesheets_python' created and data uploaded.
Total incremental updates (first run): 39714 records inserted.


## EDA

## Timesheets Table

- check is there any duplicate for timesheets tables

In [2]:
import pandas as pd

timesheets = pd.read_sql_table("timesheets_python", con=engine)
# Count the total number of timesheet_id
raw_timesheet_id = timesheets['timesheet_id'].count()

# Count the number of unique timesheet_id
unique_timesheet_id = timesheets['timesheet_id'].nunique()

# Display the results
result = pd.DataFrame({
    'raw_timesheet_id': [raw_timesheet_id],
    'unique_timesheet_id': [unique_timesheet_id]
})

print(result)

if raw_timesheet_id == unique_timesheet_id:
    print("There is no duplicate data.")
else:
    print("There are duplicates in the data.")

# answer: there is no duplicate data

   raw_timesheet_id  unique_timesheet_id
0             39714                39714
There is no duplicate data.


- check is company implements two shifts

In [3]:
import pandas as pd

# Convert the 'checkin' column to datetime if it's not already
timesheets['checkin'] = pd.to_datetime(timesheets['checkin'], errors='coerce')

# Calculate the maximum and minimum checkin times
max_checkin = timesheets['checkin'].max()
min_checkin = timesheets['checkin'].min()

# Display the results
result = pd.DataFrame({
    'max_checkin': [max_checkin],
    'min_checkin': [min_checkin]
})

print(result)

# It seems the company implements two shifts, because the result of checkin time is 24 hours.

  timesheets['checkin'] = pd.to_datetime(timesheets['checkin'], errors='coerce')


          max_checkin         min_checkin
0 2024-10-02 23:59:00 2024-10-02 00:00:14


- check is there any missing value for checkin or checkout column

In [4]:
import pandas as pd

# Filter for rows where checkin or checkout is null
missing_values = timesheets[timesheets['checkin'].isnull() | timesheets['checkout'].isnull()]

# Get the first 5 rows with missing values
result = missing_values.head(5)

# Display the result
print(result)

# Answer: Check for missing values
if not result.empty:
    print("It seems there are missing values in checkin or checkout.")
else:
    print("There are no missing values in checkin or checkout.")

# it seems there is missing value on checkin or checkout

    timesheet_id  employee_id        date             checkin  checkout
3       23907435           63  2019-08-21 2024-10-02 09:55:47      None
7       23907445           60  2019-08-22                 NaT  18:04:33
16      23907459           31  2019-08-26                 NaT  17:57:45
20      23907468           22  2019-08-27                 NaT  18:25:52
21      23907470           21  2019-08-27                 NaT  18:35:22
It seems there are missing values in checkin or checkout.


- check is there any missing value for both of column (checkin and checkout)

In [5]:
import pandas as pd

missing_both = timesheets[timesheets['checkin'].isnull() & timesheets['checkout'].isnull()]

# Display the result
print(missing_both)

# Answer: Check for missing values in both columns
if not missing_both.empty:
    print("There are rows with missing values in both checkin and checkout.")
else:
    print("All good, there is no missing value for both checkin and checkout.")

# all good, there is no missing for both

Empty DataFrame
Columns: [timesheet_id, employee_id, date, checkin, checkout]
Index: []
All good, there is no missing value for both checkin and checkout.


## Employees Table

- check is there any duplicate for employees table

In [6]:
import pandas as pd

employees = pd.read_sql_table("employees_python", con=engine)

# Count the total number of employee_id
raw_total_employee = employees['employee_id'].count()

# Count the number of unique employee_id
unique_total_employee = employees['employee_id'].nunique()

# Display the results
result = pd.DataFrame({
    'unique_total_employee': [unique_total_employee],
    'raw_total_employee': [raw_total_employee]
})

print(result)

# Answer: Check for duplicates
if raw_total_employee > unique_total_employee:
    print("There is duplicate data.")
else:
    print("There are no duplicates in the data.")


# there is duplicate data, noted don't run the first code twice, because 

   unique_total_employee  raw_total_employee
0                    176                 177
There is duplicate data.


# salary_per_hour_calculation

steps:
- remove duplicate on employees table
- create new column for fill missing value from checkin and checkout, 
    assumption:
      - There is two shifts
      - Indonesia salary rates
      - Working day per month is 22
- create prorated calculation, since in the data itself, many of employee have not complete working day / month base on their join_date

In [7]:
import pandas as pd
def salary_per_hour_calculation():
    ### employees ###
    
    # employees = pd.read_csv('data/employees.csv')
    employees = pd.read_sql_table("employees_python", con=engine)
    # rename employe_id to employee_id
    employees.rename(columns={'employe_id': 'employee_id'}, inplace=True)
    
    # Calculate row number based on employee_id, ordered by join_date and salary
    employees['row_num'] = employees.sort_values(['join_date', 'salary'], ascending=[False, False]) \
                      .groupby('employee_id') \
                      .cumcount() + 1
    
    # Select relevant columns
    employees_remove_duplicates = employees[['employee_id', 'branch_id', 'salary', 'join_date', 'resign_date', 'row_num']]
    
    # Filter for rows where row_num is 1
    clean_employees = employees_remove_duplicates[employees_remove_duplicates['row_num'] == 1][['employee_id', 'branch_id', 'salary', 'join_date', 'resign_date']]

    ### timesheets ###
    
    # timesheets = pd.read_csv('data/timesheets.csv')
    timesheets = pd.read_sql_table("timesheets_python", con=engine)
    
    # Define a function to apply the CASE logic for checkin_new
    def get_checkin_new(row):
        if pd.isnull(row['checkin']):
            if pd.to_datetime(row['checkout']).time() > pd.to_datetime('12:00:00').time() and pd.to_datetime(row['checkout']).time() < pd.to_datetime('00:00:00').time():
                return '08:00:00'
            elif pd.to_datetime(row['checkout']).time() > pd.to_datetime('00:00:00').time() and pd.to_datetime(row['checkout']).time() < pd.to_datetime('12:00:00').time():
                return '17:00:00'
        return row['checkin']
    
    # Define a function to apply the CASE logic for checkout_new
    def get_checkout_new(row):
        if pd.isnull(row['checkout']):
            if pd.to_datetime(row['checkin']).time() > pd.to_datetime('00:00:00').time() and pd.to_datetime(row['checkin']).time() < pd.to_datetime('12:00:00').time():
                return '17:00:00'
            elif pd.to_datetime(row['checkin']).time() > pd.to_datetime('12:00:00').time() and pd.to_datetime(row['checkin']).time() < pd.to_datetime('00:00:00').time():
                return '08:00:00'
        return row['checkout']
    
    # Apply the functions to create new columns
    timesheets['checkin_new'] = timesheets.apply(get_checkin_new, axis=1)
    timesheets['checkout_new'] = timesheets.apply(get_checkout_new, axis=1)
    
    # Select relevant columns
    timesheets_modify = timesheets[['timesheet_id', 'employee_id', 'date', 'checkin', 'checkout', 'checkin_new', 'checkout_new']]
    
    def calculate_total_hours(row):
        # Handle NaN values
        checkin_new = pd.to_datetime(row['checkin_new'], format='%H:%M:%S', errors='coerce')
        checkout_new = pd.to_datetime(row['checkout_new'], format='%H:%M:%S', errors='coerce')
        
        # If either checkin_new or checkout_new is NaT, return 9.0
        if pd.isnull(checkin_new) or pd.isnull(checkout_new):
            return 9.0
    
        # Convert times to seconds for easier calculations
        checkin_seconds = (checkin_new.hour * 3600) + (checkin_new.minute * 60) + checkin_new.second
        checkout_seconds = (checkout_new.hour * 3600) + (checkout_new.minute * 60) + checkout_new.second
    
        # Calculate total hours based on conditions
        if checkin_seconds > 0 and checkin_seconds < (12 * 3600):
            return (checkout_seconds - checkin_seconds) / 3600.0
        elif checkin_seconds >= (12 * 3600) and checkin_seconds < (24 * 3600):
            return (checkin_seconds - checkout_seconds) / 3600.0
        elif checkout_seconds > (12 * 3600) and checkout_seconds < (24 * 3600):
            return (checkout_seconds - checkin_seconds) / 3600.0
        elif checkout_seconds > 0 and checkout_seconds < (12 * 3600):
            return (checkin_seconds - checkout_seconds) / 3600.0
        else:
            return 9.0
    
    # Apply the function to create the total_hours column
    timesheets_modify['total_hours'] = timesheets_modify.apply(calculate_total_hours, axis=1)
    
    # Select relevant columns
    timesheets_duration = timesheets_modify[['timesheet_id', 'employee_id', 'date', 'checkin', 'checkout', 'checkin_new', 'checkout_new', 'total_hours']]


    ### join ###

    # Step 1: Perform the left join
    merged_df = pd.merge(timesheets_duration, clean_employees, on='employee_id', how='left')
    
    # Step 2: Filter where salary is not null or not equal to zero
    filtered_df = merged_df[(merged_df['salary'].notnull()) & (merged_df['salary'] != 0)]
    
    filtered_df['date'] = pd.to_datetime(filtered_df['date'])
    # Step 3: Extract year and month from the date
    filtered_df['year'] = filtered_df['date'].dt.year
    filtered_df['month'] = filtered_df['date'].dt.month
    
    # Step 4: Group by employee_id, branch_id, year, and month
    gross_total_hours = (filtered_df.groupby(['employee_id', 'branch_id', 'year', 'month'])
                  .agg(total_day=('employee_id', 'count'),
                       total_hours=('total_hours', 'sum'),
                       salary=('salary', 'min'))
                  .reset_index())
    
    gross_total_hours['prorated_salary'] = gross_total_hours.apply(lambda row: row['salary'] if row['total_day'] > 22 else round((row['total_day'] / 22.0), 2) * row['salary'], axis=1)
    
    # Select relevant columns
    prorated_salary = gross_total_hours[['total_day', 'employee_id', 'branch_id', 'total_hours', 'salary', 'year', 'month', 'prorated_salary']]
    
    # Calculate salary per hour
    prorated_salary['salary_per_hour'] = prorated_salary['prorated_salary'] / prorated_salary['total_hours']
    
    # Select relevant columns
    salary_per_hour = prorated_salary[['total_day', 'employee_id', 'branch_id', 'total_hours', 'salary', 'year', 'month', 'salary_per_hour']]
    
    # Step 1: Group by branch_id, year, and month
    # Step 2: Calculate average salary_per_hour
    branch_salary_per_hour = salary_per_hour.groupby(['branch_id', 'year', 'month'], as_index=False)['salary_per_hour'].mean()
    
    return branch_salary_per_hour
    

In [8]:
df = salary_per_hour_calculation()
df.describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prorated_salary['salary_per_hour'] = prorated_salary['prorated_salary'] / prorated_salary['total_hours']


Unnamed: 0,branch_id,year,month,salary_per_hour
count,239.0,239.0,239.0,239.0
mean,2868.891213,2019.736402,7.523013,189698.6
std,2999.106036,0.441509,3.512058,3038951.0
min,1.0,2019.0,1.0,-22311330.0
25%,2590.0,2019.0,4.5,-38379.16
50%,2629.0,2020.0,9.0,50925.79
75%,2633.5,2020.0,10.5,175346.5
max,12722.0,2020.0,12.0,36844660.0


In [9]:
df.head()

Unnamed: 0,branch_id,year,month,salary_per_hour
0,1,2019,8,55178.299946
1,1,2019,9,49162.377224
2,1,2019,10,50028.529886
3,1,2019,11,48132.151362
4,1,2019,12,46341.928185


# Save transform result

In [10]:
import os
import pandas as pd

directory = 'data/transform/python'
file_name = 'salary_per_hours.csv'
if not os.path.exists(directory):
    os.makedirs(directory)

# Save the DataFrame to a CSV file in the directory
csv_file_path = os.path.join(directory, file_name)
df.to_csv(csv_file_path, index=False)

# Load to final table

In [11]:
csv_file_path = 'data/transform/python/salary_per_hours.csv'  # Change the path as needed
table_name = 'salary_per_hours_python'
update_keys = ['branch_id','year','month']
incremental_update(engine, csv_file_path, table_name, update_keys)

Table 'salary_per_hours_python' does not exist. Creating it now and uploading the data...
Table 'salary_per_hours_python' created and data uploaded.
Total incremental updates (first run): 239 records inserted.


In [12]:
final_df = pd.read_sql('SELECT year, month, branch_id, salary_per_hour FROM salary_per_hours_python',engine)

In [13]:
final_df.describe()

Unnamed: 0,year,month,branch_id,salary_per_hour
count,239.0,239.0,239.0,239.0
mean,2019.736402,7.523013,2868.891213,189698.6
std,0.441509,3.512058,2999.106036,3038951.0
min,2019.0,1.0,1.0,-22311330.0
25%,2019.0,4.5,2590.0,-38379.16
50%,2020.0,9.0,2629.0,50925.79
75%,2020.0,10.5,2633.5,175346.5
max,2020.0,12.0,12722.0,36844660.0


In [14]:
final_df.head()

Unnamed: 0,year,month,branch_id,salary_per_hour
0,2019,8,1,55178.299946
1,2019,9,1,49162.377224
2,2019,10,1,50028.529886
3,2019,11,1,48132.151362
4,2019,12,1,46341.928185
