In [1]:
import pandas as pd
from datetime import datetime
from dateutil.relativedelta import relativedelta
import importlib
import sqlite3

# keeping company information in additional file
import data_file
importlib.reload(data_file)

# Reporting period = the month that lies MONTHS_BACK months before today.
# 0 selects the current month; set it to 1 to process the previous month.
MONTHS_BACK = 0

now = datetime.now()

# The name `last_month` is kept for the rest of the notebook; with MONTHS_BACK = 0
# it is simply the current date.
last_month = now - relativedelta(months=MONTHS_BACK)

# Period key in YYYYMM form; it also names the monthly input workbook
year_month = last_month.strftime("%Y%m")

print(year_month)

# Monthly export is expected at data_files/<YYYYMM>.xlsx
file_path = f"data_files/{year_month}.xlsx"

# Read xlsx file into pandas DataFrame
df = pd.read_excel(file_path)

202309


In [2]:
# Company names used by the exclusion filter further down; they live in data_file
# so they are not hard-coded in the notebook.
(
    my_company_name,
    exclusion_company_one,
    exclusion_company_two,
    exclusion_company_three,
) = (
    data_file.my_company_name,
    data_file.exclusion_company_one,
    data_file.exclusion_company_two,
    data_file.exclusion_company_three,
)

last_column_name = 'order_intake_amount_eur'

# Give the last column a fixed name, store `bu` as text and add an upper-cased
# copy of the sold-to name as `customer_name`.
df = (
    df
    .rename(columns={df.columns[-1]: last_column_name})
    .assign(
        bu=lambda frame: frame['bu'].astype(str),
        customer_name=lambda frame: frame['sold_to_customer_n'].str.upper(),
    )
)

In [3]:
target_values = data_file.m_target_values
include_values = data_file.m_include_values

# Rows shipped to one of the target customers AND sold to the first "include" customer
ship_to_is_target = df['ship_to_customer_n'].isin(target_values)
sold_to_is_included = df['sold_to_customer_n'].eq(include_values[0])
condition = ship_to_is_target & sold_to_is_included

In [4]:
# For the rows selected by `condition`, copy ship-to values into the sold-to fields.
# NOTE(review): the name goes to a separate column `sold_to_customer_n_latest`,
# while the id overwrites `sold_to_customer` itself — confirm this asymmetry is intended.
for target_col, source_col in (
    ('sold_to_customer_n_latest', 'ship_to_customer_n'),
    ('sold_to_customer', 'ship_to_customer'),
):
    df.loc[condition, target_col] = df[source_col]

In [5]:
# Filter out rows whose customer_name contains one of the unwanted company names.
# Each name is still treated as a regular expression (str.contains default).
# na=False makes rows with a missing customer_name count as "no match", so they
# are kept instead of breaking the `~` negation with a TypeError.
excluded_names = [
    my_company_name,
    exclusion_company_one,
    exclusion_company_two,
    exclusion_company_three,
]
is_excluded = pd.Series(False, index=df.index)
for excluded_name in excluded_names:
    is_excluded |= df['customer_name'].str.contains(excluded_name, na=False)

df = df[~is_excluded]

In [6]:
# Visual check: order intake of the first `source` group divided by the second
# (groups are sorted by label). The ratio should be the same or very close ~ 1.
intake_by_source = df.groupby(['source'])['order_intake_amount_eur'].sum()
# The result is indexed by source label, so positions are taken with .iloc;
# plain [0] / [1] relies on the deprecated positional fallback of Series.__getitem__.
intake_by_source.iloc[0] / intake_by_source.iloc[1]

0.9995360873403911

In [7]:
# BU code lists kept in data_file (original note: combined BU product groups
# for the PRJ and BU sources). `prj_bu` is used further down to select rows by
# their `bu` value; both lists are used to check BU coverage of the input data.
prj_bu = data_file.prj_bu
prd_bu = data_file.prd_bu

In [8]:
# BU codes that occur in the data but are listed in neither prj_bu nor prd_bu
bu_from_source = set(df['bu'].unique())
bu_from_my_file = set(prj_bu + prd_bu)
unique_to_set1 = bu_from_source.difference(bu_from_my_file)
print('should be 999 only ->', unique_to_set1)

should be 999 only -> {'YY199', '999'}


In [9]:
# Split the rows by source system and collect the distinct sales-order numbers of each
df_bu = df.loc[df['source'] == 'BU']
df_prj = df.loc[df['source'] == 'PRJ']
unique_so_bu, unique_so_prj = (
    set(part['sales_order_so'].unique()) for part in (df_bu, df_prj)
)

In [10]:
# Sales orders present in only one of the two sources
diff_bu_not_prj = unique_so_bu.difference(unique_so_prj)
diff_prj_not_bu = unique_so_prj.difference(unique_so_bu)
combined_diff = diff_bu_not_prj | diff_prj_not_bu

# List versions of the three sets
diff_bu_not_prj_list, diff_prj_not_bu_list, combined_diff_list = map(
    list, (diff_bu_not_prj, diff_prj_not_bu, combined_diff)
)

In [None]:
# Inspect the rows of sales orders that appear in only one of the two sources (BU or PRJ)
df[df['sales_order_so'].isin(combined_diff_list)]

In [12]:
# Remove rows that are doubled between the PRJ and BU sources.
# Rows whose BU code is in prj_bu are kept as they are ...
is_prj_bu = df['bu'].isin(prj_bu)
df_with_prj_so = df[is_prj_bu]
list_prj_so = df_with_prj_so['sales_order_so'].to_list()

# ... and every other sales order is taken from the BU source only.
keep_from_bu = df['source'].eq('BU') & ~df['sales_order_so'].isin(list_prj_so)
df_wo_prj_so = df[keep_from_bu]

# keeping only unique sales orders
df_final = pd.concat([df_wo_prj_so, df_with_prj_so])

In [13]:
# Visual check: total order intake of the de-duplicated table divided by the
# intake of the first `source` group; should be the same or very close ~ 1.
# .iloc[0] is explicit positional access (the grouped result is indexed by label).
df_final['order_intake_amount_eur'].sum() / df.groupby(['source'])['order_intake_amount_eur'].sum().iloc[0]

1.0000000006134726

In [14]:
# Write the de-duplicated orders to the monthly result workbook, on a sheet
# named after the period. The context manager saves and closes the file even
# if to_excel raises.
with pd.ExcelWriter(f'data_files/outcome/month_results_table_{year_month}.xlsx') as writer:
    df_final.to_excel(writer, sheet_name=f'{year_month}', index=False)

In [15]:
# Number of rows in the de-duplicated order table (sanity check)
len(df_final)

44323

In [16]:
import numpy as np
import sqlite3
# Load the complete `customers` table from the SQLite customer database
conn2 = sqlite3.connect('data_files/customer_data.db')
query = "SELECT * FROM customers"
try:
    df_customers = pd.read_sql_query(query, conn2)
finally:
    # release the database file even when the query fails
    conn2.close()

print(len(df_customers))

15025


In [17]:
# Legal form of each known customer: what is left of the upper-cased full name
# (`sold_to_customer_n`) after removing the short `customer_name`; '' when the
# short name does not occur in the full name.
df_customers['sold_to_customer_n'] = df_customers['sold_to_customer_n'].str.upper()
df_customers['legal_form'] = [
    full_name.replace(short_name, '').strip() if short_name in full_name else ''
    for short_name, full_name in zip(
        df_customers['customer_name'], df_customers['sold_to_customer_n']
    )
]

# Distinct legal forms seen so far (may contain the empty string)
legal_forms = list(set(df_customers['legal_form'].tolist()))

In [18]:
# Order rows whose sold-to id is not present in the customer database yet.
# NOTE(review): isin compares values as they are — confirm that both
# `sold_to_customer` columns hold the same type (a later check cell casts the DB column to str).
list_of_existing_customer = df_customers['sold_to_customer'].tolist()
is_known_customer = df_final['sold_to_customer'].isin(list_of_existing_customer)
new_customers = df_final[~is_known_customer]

In [19]:
# Read the definition table for customer types; it is left-merged onto the
# new customers in a later step.
def_for_customers = pd.read_excel('data_files/def_for_customers.xlsx')

In [20]:
# Reduce the new-customer order rows to one row per distinct customer record
new_customer_columns = ['sold_to_customer', 'company_code_n', 'sold_to_customer_n',
                        'customer_name', 'customer_group_code', 'sold_to_country_n']
new_customers = new_customers.loc[:, new_customer_columns].drop_duplicates()
new_customers['customer_group_code'] = new_customers['customer_group_code'].fillna(0)

# Attach the customer-type definitions; the merge keys are the columns both frames share
new_customers = new_customers.merge(def_for_customers, how='left')

# Keep the unmodified name, then split the longest known legal form off customer_name
new_customers['customer_name_prev'] = new_customers['customer_name']
new_customers['legal_form'] = ''
for i, row in new_customers.iterrows():
    full_name = row['customer_name']
    matching_forms = [lf for lf in legal_forms if lf in full_name]

    # default='' keeps the name unchanged when no known legal form occurs in it
    # (max() on an empty list would raise ValueError)
    longest_element = max(matching_forms, key=len, default='')
    new_customers.at[i, 'customer_name'] = full_name.replace(longest_element, '').strip()
    new_customers.at[i, 'legal_form'] = longest_element

# Export for the manual review; the context manager closes the workbook even on error
with pd.ExcelWriter(f'data_files/outcome/new_customers_{year_month}.xlsx') as writer:
    new_customers.to_excel(writer, sheet_name='new_customers', index=False)

In [None]:
# MANUAL STEP: open data_files/outcome/new_customers_<YYYYMM>.xlsx, check how the legal forms were determined,
# correct them where necessary and rename countries, then save the file - the next cell reads it again.

In [25]:
# after visual check of legal forms separation
# Re-read the new-customer workbook (written earlier, reviewed by hand in between).
checked_new_customers = pd.read_excel(f'data_files/outcome/new_customers_{year_month}.xlsx')

new_customer_list = checked_new_customers['customer_name'].tolist()
print('new customers:' , len(new_customer_list))

# Fresh copy of the customer table from the database
conn2 = sqlite3.connect('data_files/customer_data.db')
query = "SELECT * FROM customers" 
df_customers = pd.read_sql_query(query, conn2)
conn2.close()

# DB rows whose customer_name also appears in the reviewed workbook
df_customers_to_check = df_customers[df_customers['customer_name'].isin(new_customer_list)]

# Distinct names that already exist in the database
existing_list = list(set(df_customers_to_check['customer_name'].tolist()))

# Workbook rows whose name is already known ...
existing_in_new = checked_new_customers[checked_new_customers['customer_name'].isin(existing_list)]
print('already exists:', len(existing_in_new))

# ... and workbook rows whose name is not in the database yet
checked_new_customers = checked_new_customers[~checked_new_customers['customer_name'].isin(existing_list)]
print('new:', len(checked_new_customers))

# NOTE(review): 'countries' is not one of the columns selected before the export
# ('sold_to_country_n' is) — presumably it comes from the manual edit or from
# def_for_customers; confirm.
existing_in_new = existing_in_new.loc[:, ['sold_to_customer', 'company_code_n', 'sold_to_customer_n',
       'customer_name', 'customer_group_code', 'countries']]

# Classification columns already stored per customer name
df_customers_for_use = df_customers.loc[:, ['customer_name', 'indirect_direct', 'channel',
       'type', 'tier']]

df_customers_for_use.drop_duplicates(inplace=True)

# Known names inherit the stored classification; the merge uses the shared column(s)
# as key, and a name with several distinct classifications yields several rows.
existing_in_new = existing_in_new.merge(df_customers_for_use, how='left')
print('existing', len(existing_in_new))

# Put truly new rows and enriched known-name rows back together
checked_new_customers = pd.concat([checked_new_customers, existing_in_new])
print('new customers added:' , len(checked_new_customers))

# Missing tier defaults to 'Direct'; tier_new starts as a copy of tier
checked_new_customers.loc[checked_new_customers['tier'].isna(), 'tier'] = 'Direct'
checked_new_customers['tier_new'] = checked_new_customers['tier']

# Keep the columns from 'sold_to_customer' to the last one (drops anything left of it)
checked_new_customers = checked_new_customers.loc[:, 'sold_to_customer':]

# New 0..n-1 index; the previous index labels are kept in a new 'index' column
checked_new_customers.reset_index(inplace=True)

new customers: 13


In [37]:
# Customer names that occur more than once need a manual look
duplicated_rows = checked_new_customers[checked_new_customers['customer_name'].duplicated(keep=False)]
if len(duplicated_rows) > 0:
    print('here')
    # NOTE(review): the whole table is exported (with its index), not only
    # `duplicated_rows` — confirm this is intended.
    # The context manager closes the workbook even if to_excel raises.
    with pd.ExcelWriter('data_files/outcome/check_doubled.xlsx') as writer:
        checked_new_customers.to_excel(writer, sheet_name='doubled')

In [37]:
# Index labels of the rows to remove (filled in by hand after reviewing the duplicates)
indexes_to_drop = []

# One call removes all listed labels. Rows are addressed by index label, so
# removing one row does not shift the labels of the others.
checked_new_customers = checked_new_customers.drop(index=indexes_to_drop)

In [40]:
# Final column set and order for the customers table
checked_new_customers = checked_new_customers[['sold_to_customer', 'company_code_n', 'sold_to_customer_n',
       'customer_name', 'customer_group_code', 'indirect_direct', 'channel',
       'type', 'tier', 'legal_form', 'countries',
       'tier_new']]

checked_new_customers = checked_new_customers.rename(columns={'customer_group_code':'code'})
# Tag the rows with the period in which they were added
checked_new_customers['Comments'] = year_month

df = checked_new_customers.copy()

# NOTE: this cell appends; running it twice inserts the same customers twice.
table_name = 'customers'
conn = sqlite3.connect('data_files/customer_data.db')
try:
    # Zero-row query: only the existing column names are needed
    existing_table = pd.read_sql_query(f"SELECT * FROM {table_name} LIMIT 0", conn)

    # Columns of the DataFrame that the table does not have yet
    new_columns = [column for column in df.columns if column not in existing_table.columns]

    # Add the missing columns to the existing table
    for column in new_columns:
        # NOTE(review): the pandas dtype name is used as the declared SQLite type
        # (e.g. `object`) — confirm the resulting type affinity is what is wanted.
        dtype = str(df[column].dtype)
        # Quote the identifier so names with spaces or keywords do not break the statement
        quoted_column = '"' + str(column).replace('"', '""') + '"'
        alter_query = f"ALTER TABLE {table_name} ADD COLUMN {quoted_column} {dtype}"
        conn.execute(alter_query)

    # Insert the DataFrame data into the existing table
    df.to_sql(table_name, conn, if_exists='append', index=False)
finally:
    # Close the connection even when one of the statements fails
    conn.close()

In [41]:
# Update the orders of the last period according to the initial order table:
# drop the helper name column, rename `bu` to `bu2`, add `Date` (first day of
# the row's YYYYMM period) and `FY` (that date moved back by three months).
df_final = (
    df_final
    .drop(columns='customer_name')
    .rename(columns={'bu': 'bu2'})
    .assign(Date=lambda orders: pd.to_datetime(orders['year_month'], format='%Y%m'))
    .assign(FY=lambda orders: orders['Date'] - pd.DateOffset(months=3))
)

In [42]:
# Load the complete `orders` table as it is before this period's update
conn = sqlite3.connect('data_files/order_data.db')
query = "SELECT * FROM orders"
try:
    df_orders = pd.read_sql_query(query, conn)
finally:
    # Close the connection to the database even when the query fails
    conn.close()

print(len(df_orders))

222794


In [55]:
def generate_year_month_list(year_month, start="202304"):
    """Return every 'YYYYMM' period from ``start`` through ``year_month``, inclusive.

    Parameters
    ----------
    year_month : str
        Last period to include, formatted as 'YYYYMM' (e.g. '202309').
    start : str, optional
        First period to include, same format. Defaults to '202304'.

    Returns
    -------
    list of str
        Consecutive periods in ascending order; empty when ``year_month`` lies
        before ``start``.

    Raises
    ------
    ValueError
        If either argument has a non-numeric part or a month outside 1..12.
    """
    def month_index(period):
        # Map 'YYYYMM' to a running month number so that year roll-over
        # (December -> January) becomes plain integer arithmetic.
        year = int(period[:4])
        month = int(period[4:6])
        if not 1 <= month <= 12:
            raise ValueError(f"invalid month in period {period!r}")
        return year * 12 + (month - 1)

    first = month_index(start)
    last = month_index(year_month)

    # range() is empty when last < first, so an early end period simply returns []
    # instead of looping forever.
    return [f"{idx // 12:04}{idx % 12 + 1:02}" for idx in range(first, last + 1)]

In [49]:
# Clean the database of entries for this period: build the list of YYYYMM
# periods (as returned by generate_year_month_list) whose rows are deleted below.

year_month_list = generate_year_month_list(year_month)

print(year_month)

def delete_rows(db_file, table_name, column_name, value):
    """Delete all rows of ``table_name`` whose ``column_name`` equals ``value``.

    Parameters
    ----------
    db_file : str
        Path of the SQLite database file.
    table_name : str
        Table to delete from. Inserted into the SQL text as is, so it must be
        a trusted identifier.
    column_name : str
        Column to compare. Inserted into the SQL text as is (trusted identifier).
    value
        Value to match; passed as a bound parameter.

    Raises
    ------
    sqlite3.Error
        If the statement cannot be executed (e.g. unknown table or column).
    """
    conn = sqlite3.connect(db_file)
    try:
        # Identifiers cannot be bound as parameters; only the value is.
        delete_query = f"DELETE FROM {table_name} WHERE {column_name} = ?"
        conn.execute(delete_query, (value,))
        conn.commit()
    finally:
        # Always release the database file, also when the DELETE fails
        conn.close()

# Target of the clean-up: the `year_month` column of the `orders` table
database_file = "data_files/order_data.db"
table_name = "orders"
column_name = "year_month"

# Delete the stored rows of every period in year_month_list, one period per call
for y in year_month_list:
    print(y)
    delete_rows(database_file, table_name, column_name, y)

202309


In [56]:
# Updating the orders database with this period's orders.
# NOTE: this cell appends; the delete step above must have removed the same
# periods first, otherwise rows are stored twice.

query = "SELECT * FROM orders"

# Work on a copy of the new results
df = df_final.copy()

table_name = 'orders'

conn = sqlite3.connect('data_files/order_data.db')
try:
    # Zero-row query: only the existing column names are needed
    existing_table = pd.read_sql_query(f"SELECT * FROM {table_name} LIMIT 0", conn)

    # Columns of the DataFrame that the table does not have yet
    new_columns = [column for column in df.columns if column not in existing_table.columns]

    # Add the missing columns to the existing table
    for column in new_columns:
        # NOTE(review): the pandas dtype name is used as the declared SQLite type
        # (e.g. `object`) — confirm the resulting type affinity is what is wanted.
        dtype = str(df[column].dtype)
        # Quote the identifier so names with spaces or keywords do not break the statement
        quoted_column = '"' + str(column).replace('"', '""') + '"'
        alter_query = f"ALTER TABLE {table_name} ADD COLUMN {quoted_column} {dtype}"
        conn.execute(alter_query)

    # Insert the DataFrame data into the existing table
    df.to_sql(table_name, conn, if_exists='append', index=False)
finally:
    # Close the connection even when one of the statements fails
    conn.close()

In [57]:
# Re-read the complete `orders` table after the update to compare row counts
conn = sqlite3.connect('data_files/order_data.db')
query = "SELECT * FROM orders"
try:
    df_orders = pd.read_sql_query(query, conn)
finally:
    # Close the connection to the database even when the query fails
    conn.close()

print(len(df_orders))

244475


In [58]:
# Dump the orders table to a workbook for a manual check; the context manager
# saves and closes the file even if to_excel raises.
with pd.ExcelWriter('data_files/outcome/df_orders_check.xlsx') as writer:
    df_orders.to_excel(writer, sheet_name='orders', index=False)

In [36]:
# Fill tier_new with the value from tier for every customer row where tier_new is NULL
conn2 = sqlite3.connect('data_files/customer_data.db')
update_query = "UPDATE customers SET tier_new = tier WHERE tier_new IS NULL"
try:
    conn2.execute(update_query)
    conn2.commit()
finally:
    # Close the connection even when the update fails (nothing is committed then)
    conn2.close()

In [None]:
# Optional check: orders whose sold_to_customer has no entry in the customers table

conn = sqlite3.connect('data_files/order_data.db')
query = "SELECT * FROM orders"
try:
    df_orders = pd.read_sql_query(query, conn)
finally:
    conn.close()

conn2 = sqlite3.connect('data_files/customer_data.db')
query = "SELECT * FROM customers"
try:
    df_customers = pd.read_sql_query(query, conn2)
finally:
    conn2.close()

# Compare the ids as text
df_customers['sold_to_customer'] = df_customers['sold_to_customer'].astype(str)
unique_values = df_customers['sold_to_customer'].unique().tolist()

# Displayed result: order rows without a matching customer
df_orders[~df_orders['sold_to_customer'].isin(unique_values)]

In [59]:
# Read the complete customers table ...
conn2 = sqlite3.connect('data_files/customer_data.db')
query = "SELECT * FROM customers"
try:
    df_customers = pd.read_sql_query(query, conn2)
finally:
    # release the database file even when the query fails
    conn2.close()

# ... and dump it to a workbook for a manual check; the context manager saves
# and closes the file even if to_excel raises.
with pd.ExcelWriter('data_files/outcome/df_customers_check.xlsx') as writer:
    df_customers.to_excel(writer, sheet_name='customers', index=False)