In [242]:
import os
import openpyxl
import pandas as pd

# Specify the folder path
folder_path = 'data_files/'

# Everything that sits directly inside the data folder (files and sub-folders)
all_files_and_folders = os.listdir(folder_path)

# Keep regular files only (exclude folders)
files = [f for f in all_files_and_folders if os.path.isfile(os.path.join(folder_path, f))]

# Print the list of files
print(files)

for file in files:
    # Test the real extension: a substring check would also accept names such
    # as 'report.xlsx.bak'. Excel's '~$' owner/lock files carry the .xlsx
    # extension too but are not workbooks, so they are skipped.
    if file.lower().endswith('.xlsx') and not file.startswith('~$'):
        # Only the sheet names are needed, so open in read-only mode instead of
        # parsing every cell of the workbook.
        workbook = openpyxl.load_workbook(os.path.join(folder_path, file), read_only=True)
        try:
            # Get all sheet names
            sheet_names = workbook.sheetnames
            for sheet_name in sheet_names:
                print(sheet_name)
        finally:
            # Read-only workbooks keep the file handle open until closed.
            workbook.close()

['.DS_Store', 'sales_gp_report.xlsx', 'definition_table.xlsx', 'sales_gp_last_month.xlsx']
sales_gp_report
custumer_definition_table
horn
sales_gp_last_month


In [243]:
# Load the first sheet of every workbook in the data folder; each frame is
# tagged with its file stem through a plain `.name` attribute.
working_dfs = []
for file in files:
    # Test the real extension (a substring check also accepts e.g.
    # 'report.xlsx.bak') and skip Excel's '~$' owner/lock files, which are not
    # readable workbooks.
    if file.lower().endswith('.xlsx') and not file.startswith('~$'):
        df = pd.read_excel(os.path.join(folder_path, file))
        # splitext removes only the final extension, so a name like
        # 'sales.v2.xlsx' keeps its full stem instead of collapsing to 'sales'.
        name_without_extension = os.path.splitext(file)[0]
        df.name = name_without_extension
        working_dfs.append(df)

# The 'horn' sheet of the definition workbook is loaded as an extra frame.
temp = pd.read_excel(os.path.join(folder_path, 'definition_table.xlsx'), sheet_name='horn')
temp.name = 'temp'
working_dfs.append(temp)

for dt_f in working_dfs:
    print(dt_f.name, ":", dt_f.columns)

sales_gp_report : Index(['fiscal_year', 'year_month', 'company_code_n', 'customer_group_cd',
       'sales_order_so', 'bu', 'bu_n', 'sales_person_n', 'sold_to_customer',
       'sold_to_customer_n', 'ec_eu_customer', 'ec_eu_customer_n',
       'ec_eu_industry', 'ec_eu_industry_n', 'ec_eu_industry_segment',
       'ec_eu_industry_segment_n', 'Sales (EURO)', 'GP (Euro)', 'BU/PJT',
       'BU Group', 'BU Sub Group', 'OpCo', 'Type', 'Type with Channel',
       'Company Type'],
      dtype='object')
definition_table : Index(['sold_to_customer', 'sales_person', 'company_code_n',
       'sold_to_customer_n', 'customer_name', 'code', 'customer_group_case',
       'indirect_direct', 'channel', 'type', 'Oscar i/d', 'Oscar Type', 'GB',
       'i/d_in_proc.', 'Correction_Channel', 'Correction_ Type', 'T&M', 'tier',
       'account', 'Partner page', 'Tableau', 'Comments', 'Link', 'additional',
       'Date of agreement'],
      dtype='object')
sales_gp_last_month : Index(['fiscal_year', 'year_month

In [252]:
# Pick the frames by the `.name` tag assigned when they were loaded rather than
# by list position: os.listdir() returns entries in arbitrary order, and the
# exported 'result_sent.xlsx' lands in the same folder, so positions are not
# stable between runs or machines.
frames_by_name = {frame.name: frame for frame in working_dfs}
sales_gp_report = frames_by_name['sales_gp_report']
definition_table = frames_by_name['definition_table']
sales_gp_last_month = frames_by_name['sales_gp_last_month']
temp = frames_by_name['temp']

In [253]:
# Column subsets kept from each source frame. Both sales extracts are trimmed
# to the same list, in the same order.
sales_columns = [
    'fiscal_year', 'year_month', 'company_code_n', 'sales_person_n',
    'sold_to_customer', 'sales_order_so', 'Sales (EURO)', 'GP (Euro)',
]
definition_columns = [
    'sold_to_customer', 'sales_person', 'company_code_n',
    'sold_to_customer_n', 'customer_name',
]
horn_columns = ['sold_to_customer', 'customer_name', 'temp']

sales_gp_report = sales_gp_report.loc[:, sales_columns]
definition_table = definition_table.loc[:, definition_columns]
sales_gp_last_month = sales_gp_last_month.loc[:, sales_columns]
temp = temp.loc[:, horn_columns]

In [254]:
# Reduce the memory usage of the dataframe and improve performance
def check_unique_values(df, max_unique=50):
    """Convert low-cardinality columns of ``df`` to the ``category`` dtype.

    The frame is modified in place and the names of the converted columns are
    printed; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected and, where eligible, converted.
    max_unique : int, default 50
        A column is converted when its number of distinct non-null values is
        strictly below this threshold.
    """
    changed_columns = []
    for col in df.columns:
        # A column that is already categorical has nothing to convert; skipping
        # it keeps the report truthful when the function is run again on the
        # same frame.
        if isinstance(df[col].dtype, pd.CategoricalDtype):
            continue
        if df[col].nunique() < max_unique:
            df[col] = df[col].astype('category')
            changed_columns.append(col)
    if changed_columns:
        print("The following columns were changed to categorical data type: ")
        for col in changed_columns:
            print(col)
    else:
        print("No columns were changed to categorical data type.")

# Apply the categorical conversion to each trimmed frame, in the same order.
for frame in (sales_gp_report, definition_table, sales_gp_last_month, temp):
    check_unique_values(frame)

The following columns were changed to categorical data type: 
fiscal_year
year_month
company_code_n
The following columns were changed to categorical data type: 
sales_person
company_code_n
The following columns were changed to categorical data type: 
fiscal_year
year_month
company_code_n
The following columns were changed to categorical data type: 
sold_to_customer
customer_name
temp


In [255]:
# Stack the main report and the last-month extract, printing the row counts
# before and after as a sanity check.
print(f'{len(sales_gp_report)} + {len(sales_gp_last_month)} = {len(sales_gp_report) + len(sales_gp_last_month)}')
# ignore_index gives the stacked frame a fresh 0..n-1 index; without it the row
# labels of the second frame repeat labels of the first.
sales_gp_report_full = pd.concat([sales_gp_report, sales_gp_last_month], ignore_index=True)
print(len(sales_gp_report_full))

250757 + 6329 = 257086
257086


In [256]:
# Drop rows with missing values in the 'sold_to_customer' column and report
# how many were removed. The frame is filtered once and the count is derived
# from the before/after lengths instead of running dropna a second time.
rows_before_drop = len(sales_gp_report_full)
sales_gp_report_full = sales_gp_report_full.dropna(subset=['sold_to_customer'])
dropped_rows = rows_before_drop - len(sales_gp_report_full)
print(dropped_rows)

13291


In [257]:
# Merge sales_gp_report_full and definition_table on 'sold_to_customer'
# A left merge emits one output row per matching lookup row, so a
# 'sold_to_customer' that appears more than once in the definition table would
# duplicate the sales rows (and their amounts). Keep one lookup row per key.
# NOTE(review): the first occurrence wins - confirm that is the intended name
# when a customer is listed several times.
customer_lookup = definition_table[['sold_to_customer', 'customer_name']].drop_duplicates(subset=['sold_to_customer'])
sales_gp_report_full_customers = sales_gp_report_full.merge(customer_lookup, on='sold_to_customer', how='left')
# With a unique lookup key the left merge must preserve the row count.
assert len(sales_gp_report_full_customers) == len(sales_gp_report_full)



In [258]:
# Tally the merged rows that did not receive a 'customer_name'
customer_name_missing = sales_gp_report_full_customers['customer_name'].isna()
num_missing = customer_name_missing.sum()
num_missing

202

In [259]:
# Merge sales_gp_report_full_customers and definition_table on matching columns
# ('sales_person_n' on the sales side against 'sold_to_customer_n' in the
# definition table). Both frames carry 'customer_name', so the output holds
# 'customer_name_x' (from the left frame) and 'customer_name_y' (this lookup).
# pandas matches null keys to each other and emits one row per matching lookup
# row, so blank or repeated 'sold_to_customer_n' values would multiply sales
# rows. Drop blank keys and keep one lookup row per key.
# NOTE(review): the first occurrence wins for repeated keys - confirm.
person_lookup = (
    definition_table[['sold_to_customer_n', 'customer_name']]
    .dropna(subset=['sold_to_customer_n'])
    .drop_duplicates(subset=['sold_to_customer_n'])
)
result = sales_gp_report_full_customers.merge(person_lookup, left_on='sales_person_n', right_on='sold_to_customer_n', how='left')
# With a unique, non-null lookup key the left merge must preserve the row count.
assert len(result) == len(sales_gp_report_full_customers)

In [261]:
# Attach the 'temp' column from the horn sheet by 'sold_to_customer'.
# One lookup row per key, so the left merge cannot duplicate sales rows when a
# customer is listed more than once in the sheet.
# NOTE(review): the first occurrence wins for repeated keys - confirm.
horn_lookup = temp[['sold_to_customer', 'temp']].drop_duplicates(subset=['sold_to_customer'])
final_result = result.merge(horn_lookup, on='sold_to_customer', how='left')
assert len(final_result) == len(result)
# Cast 'temp' to plain object dtype so later fillna calls can insert names
# that a categorical column would not accept.
final_result['temp'] = final_result['temp'].astype('object')

In [265]:
# Take the full definition table (all columns) by the `.name` tag assigned at
# load time; its position in working_dfs depends on directory listing order and
# on which workbooks are present in the folder.
definition_table_types = {frame.name: frame for frame in working_dfs}['definition_table']
definition_table_types = definition_table_types.loc[:,['customer_name', 'indirect_direct', 'channel', 'type']]
# Rows without a customer_name cannot describe any customer; if one is kept, a
# merge on this column pairs it with every row whose key is also null, because
# pandas matches null keys to each other.
definition_table_types = definition_table_types.dropna(subset=['customer_name'])
# One row per customer_name so the lookup cannot multiply rows in a left merge.
definition_table_types = definition_table_types.drop_duplicates(subset=['customer_name'])

In [267]:
# Build 'combined' by priority: 'temp' first, then 'customer_name_y', and
# finally 'customer_name_x' for whatever is still empty.
temp_or_name_y = final_result['temp'].fillna(final_result['customer_name_y'])
final_result['combined'] = temp_or_name_y.fillna(final_result['customer_name_x'])

In [268]:
# Left-join the per-customer type columns onto the combined customer name.
final_result_with_types = pd.merge(
    final_result,
    definition_table_types,
    how='left',
    left_on='combined',
    right_on='customer_name',
)

In [269]:
# Export the final table. The context manager saves and closes the workbook
# even if to_excel raises, and replaces the explicit writer.save() call, which
# is deprecated and no longer available in pandas 2.x.
# NOTE(review): the output is written into the same folder the notebook scans
# for input workbooks, so a later run will list it among the inputs.
with pd.ExcelWriter('data_files/result_sent.xlsx', engine='xlsxwriter') as writer:
    final_result_with_types.to_excel(writer, sheet_name='Sheet1')