# Introduction
In this notebook we present the code for our capstone project in an easy-to-read and understandable way.

* ___Say something about the project___

* ___Say something about the notebook structure___

* ___Any additional notes?___

# Setup

* ___Optional: Add subtext to every header___

## Basic setup

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import sqlalchemy
import psycopg2 # needed to get database exception errors when uploading dataframe
import matplotlib.pyplot as plt # needed for visualizations
import seaborn as sns # data visualization library based on matplotlib
import os # provides functions for interacting with the operating system
import math # provides access to the mathematical functions
import sql_functions as sf # own collection of functions to manage connections to our SQL database
import re
import math
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Define variables
# Name of the database schema that the SQL queries further down read from
schema = 'capstone_kueblbeck'
# Database engine obtained from the project-local sql_functions helper module
engine = sf.get_engine()

In [None]:
# Other settings
# Show up to 40 columns when a dataframe is rendered
pd.set_option("display.max_columns", 40)
# Render floats with thousands separators and two decimal places
pd.set_option("display.float_format", "{:,.2f}".format)

## Prepare data and export to SQL

### Import data from external files

In [None]:
# Import "Lagerbestand"
# The file is parsed with a decimal comma; the first data row is discarded right after reading.
# NOTE(review): presumably that row is a secondary header/units line - confirm against the file.
lagerbestand = (
    pd.read_csv('./data/Lagerbestand_02.Juni.2023.txt', decimal=',')
    .pipe(lambda frame: frame.drop(frame.index[0]))
)

In [None]:
# Import "Lieferanten"
# The first data row of the sheet is discarded right after reading.
# NOTE(review): presumably that row is a secondary header line - confirm against the file.
lieferanten = (
    pd.read_excel('./data/Lieferantenübersicht.xlsx')
    .pipe(lambda frame: frame.drop(frame.index[0]))
)

In [None]:
# Import "Verkäufe"
# The file is parsed with a decimal comma; the first data row is discarded right after reading.
# NOTE(review): presumably that row is a secondary header/units line - confirm against the file.
verkaeufe = (
    pd.read_csv('./data/Abverkauf_DE_2022_8410.csv', decimal=',')
    .pipe(lambda frame: frame.drop(frame.index[0]))
)

### Correct datatypes for each table (if necessary)

#### Lagerbestand

In [None]:
# Change date columns from string to date type
date_columns = [
    "Ltz. VK ges.",
    "Ltz. VK WEN",
    "Ltz. VK RGB",
    "Ltz. VK AMB",
    "Ltz. VK CHA",
    "Ltz. VK STR",
    "Ltz. VK PAS",
    "Ltz. VK LAN",
    "Ltz. VK MÜH",
    "Ltz. VK ROS",
]

# Values that do not match dd.mm.yyyy become NaT (errors='coerce')
lagerbestand = lagerbestand.assign(**{
    date_column: pd.to_datetime(lagerbestand[date_column], format='%d.%m.%Y', errors='coerce')
    for date_column in date_columns
})

In [None]:
# Change selected number columns from string to float type
numeric_columns = ['Gesamt', 'WEN', 'RGB', 'AMB', 'CHA', 'STR', 'PAS', 'LAN', 'MÜH', 'ROS']

for column in numeric_columns:
    # German number format: '.' groups thousands, ',' marks the decimals.
    # regex=False keeps both replacements literal on every pandas version; read as a
    # regular expression, '.' would match (and delete) every character.
    # Anything that still is not a number afterwards becomes NaN (errors='coerce').
    lagerbestand[column] = pd.to_numeric(
        lagerbestand[column]
        .str.replace('.', '', regex=False)
        .str.replace(',', '.', regex=False),
        errors='coerce',
    )

In [None]:
# Special treatment for columns 'Basispreis' and 'Basispr. Summe'
numeric_columns_basis = ['Basispreis', 'Basispr. Summe']

for column in numeric_columns_basis:
    # German number format: '.' groups thousands, ',' marks the decimals.
    # regex=False keeps both replacements literal on every pandas version.
    # No errors='coerce' here: a value that cannot be parsed raises instead of turning into NaN.
    lagerbestand[column] = pd.to_numeric(
        lagerbestand[column]
        .str.replace('.', '', regex=False)
        .str.replace(',', '.', regex=False)
    )

In [None]:
# Change 'Lfnr' and 'VPE' to Integer type, as these are categorical numbers
lagerbestand = lagerbestand.astype({'Lfnr': int, 'VPE': int})

#### Lieferanten

In [None]:
# Sync datatype in column 'Lfnr' to table 'Lagerbestand'
lieferanten = lieferanten.astype({'Lfnr': int})

#### Verkaeufe

In [None]:
# Change selected number columns from string to float type
numeric_columns = ["Gesamt", "WEN", "RGB", "AMB", "CHA", "STR", "PAS", "LAN", "MÜH", "ROS"]

for column in numeric_columns:
    # Strip the '.' thousands separator before parsing.
    # regex=False keeps the replacement literal on every pandas version; read as a
    # regular expression, '.' would match (and delete) every character.
    # Anything that still is not a number afterwards becomes NaN (errors='coerce').
    verkaeufe[column] = pd.to_numeric(
        verkaeufe[column].str.replace('.', '', regex=False),
        errors='coerce',
    )

In [None]:
# Sync datatype in column 'Lfr.' to table 'Lagerbestand'
verkaeufe = verkaeufe.astype({'Lfr.': int})

#### Check Dataframes before Export

In [None]:
# Preview the first rows, then print the column/dtype summary.
# DataFrame.info() prints its report itself and returns None, so it is called directly
# instead of being wrapped in display(), which would add a stray "None" to the output.
display(lagerbestand.head(2))
lagerbestand.info()

In [None]:
# Preview the first rows, then print the column/dtype summary.
# DataFrame.info() prints its report itself and returns None, so it is called directly
# instead of being wrapped in display(), which would add a stray "None" to the output.
display(lieferanten.head(2))
lieferanten.info()

In [None]:
# Preview the first rows, then print the column/dtype summary.
# DataFrame.info() prints its report itself and returns None, so it is called directly
# instead of being wrapped in display(), which would add a stray "None" to the output.
display(verkaeufe.head(2))
verkaeufe.info()

### Export to SQL

Note: The following code blocks are fully commented out because the dataframes have already been uploaded. Running them again would cause an error due to restrictions on our SQL server.

In [None]:
# # Import Lagerbestand data into SQL 
# table_name = 'lagerbestand'

# # Write records stored in a dataframe to SQL database
# if engine!=None:
#     try:
#         lagerbestand.to_sql(name=table_name, # Name of SQL table
#                         con=engine, # Engine or connection
#                         if_exists='replace', # Drop the table before inserting new values 
#                         schema=schema, # Use schema that was defined earlier
#                         index=False, # Do not write the DataFrame index as a column
#                         chunksize=5000, # Specify the number of rows in each batch to be written at a time
#                         method='multi') # Pass multiple values in a single INSERT clause
#         print(f"The {table_name} table was imported successfully.")
#     # Error handling
#     except (Exception, psycopg2.DatabaseError) as error:
#         print(error)
#         engine = None

In [None]:
# # Import 'Lieferanten' data into SQL 
# table_name = 'lieferanten'

# # Write records stored in a dataframe to SQL database
# if engine!=None:
#     try:
#         lieferanten.to_sql(name=table_name, # Name of SQL table
#                         con=engine, # Engine or connection
#                         if_exists='replace', # Drop the table before inserting new values 
#                         schema=schema, # Use schema that was defined earlier
#                         index=False, # Do not write the DataFrame index as a column
#                         chunksize=5000, # Specify the number of rows in each batch to be written at a time
#                         method='multi') # Pass multiple values in a single INSERT clause
#         print(f"The {table_name} table was imported successfully.")
#     # Error handling
#     except (Exception, psycopg2.DatabaseError) as error:
#         print(error)
#         engine = None

In [None]:
# # Import 'Verkaeufe' data into SQL 
# table_name = 'verkäufe'

# # Write records stored in a dataframe to SQL database
# if engine!=None:
#     try:
#         verkaeufe.to_sql(name=table_name, # Name of SQL table
#                         con=engine, # Engine or connection
#                         if_exists='replace', # Drop the table before inserting new values 
#                         schema=schema, # Use schema that was defined earlier
#                         index=False, # Do not write the DataFrame index as a column
#                         chunksize=5000, # Specify the number of rows in each batch to be written at a time
#                         method='multi') # Pass multiple values in a single INSERT clause
#         print(f"The {table_name} table was imported successfully.")
#     # Error handling
#     except (Exception, psycopg2.DatabaseError) as error:
#         print(error)
#         engine = None

# Data Cleaning

## Loading dataframes

In [None]:
# Lagerbestand
# Select every column and row of the 'lagerbestand' table in the project schema
sql_query = f'select * from {schema}.lagerbestand'
# Run the query through the project-local helper and keep the result for the cleaning steps below
df_lagerbestand = sf.get_dataframe(sql_query)

In [None]:
# Lieferanten
# Select every column and row of the 'lieferanten' table in the project schema
sql_query = f'select * from {schema}.lieferanten'
# Run the query through the project-local helper and keep the result for the cleaning steps below
df_lieferanten = sf.get_dataframe(sql_query)

In [None]:
# Verkaeufe
# Select every column and row of the 'verkäufe' table in the project schema
# (the table name contains an umlaut, unlike the dataframe variable)
sql_query = f'select * from {schema}.verkäufe'
# Run the query through the project-local helper and keep the result for the cleaning steps below
df_verkaeufe = sf.get_dataframe(sql_query)

## df_lagerbestand

In [None]:
# Adjust column names
# Lower-case every name, turn spaces into underscores and remove dots in a single pass
df_lagerbestand.columns = [
    name.replace(" ", "_").replace(".", "")
    for name in df_lagerbestand.columns.str.lower()
]

In [None]:
# Change names of selected columns for clearer identification
# Abbreviated attribute columns get their full name ...
new_columns = {
    'beschr': 'beschreibung',
    'bkz': 'bestellkennzeichen',
    'vpe': 'verp_einheit',
    'stgr': 'stat_gruppe',
}
# ... and the per-location quantity columns are marked as stock ("lager") figures
new_columns.update({
    'gesamt': 'gesamt_lager',
    'wen': 'wen_lager',
    'rgb': 'rgb_lager',
    'str': 'str_lager',
    'pas': 'pas_lager',
    'amb': 'amb_lager',
    'cha': 'cha_lager',
    'lan': 'lan_lager',
    'müh': 'müh_lager',
    'ros': 'ros_lager',
})

df_lagerbestand = df_lagerbestand.rename(columns=new_columns)

In [None]:
# Additional changing of datatypes for index column (Integer due to it being categorical)
df_lagerbestand = df_lagerbestand.astype({'index': int})

## df_lieferanten

In [None]:
# Adjust column names
# Lower-case every name, turn spaces into underscores and remove dots in a single pass
df_lieferanten.columns = [
    name.replace(" ", "_").replace(".", "")
    for name in df_lieferanten.columns.str.lower()
]

In [None]:
# Change names of selected columns for clearer identification
# 'beschreibung' becomes 'lieferant', so it does not share a name with the
# 'beschreibung' column of df_lagerbestand when the two tables are merged on 'lfnr'
df_lieferanten = df_lieferanten.rename(columns={'beschreibung':'lieferant'})

## df_verkaeufe

In [None]:
# Adjust column names
# Lower-case every name, turn spaces into underscores and remove dots in a single pass
df_verkaeufe.columns = [
    name.replace(" ", "_").replace(".", "")
    for name in df_verkaeufe.columns.str.lower()
]

In [None]:
# Change names of selected columns for clearer identification
# Key columns are renamed to the names used in the other tables ...
new_columns = {
    'lfr': 'lfnr',
    'ind': 'index',
}
# ... and the per-location quantity columns are marked as sales ("vk") figures
new_columns.update({
    'gesamt': 'gesamt_vk',
    'wen': 'wen_vk',
    'rgb': 'rgb_vk',
    'str': 'str_vk',
    'pas': 'pas_vk',
    'amb': 'amb_vk',
    'cha': 'cha_vk',
    'lan': 'lan_vk',
    'müh': 'müh_vk',
    'ros': 'ros_vk',
})

df_verkaeufe = df_verkaeufe.rename(columns=new_columns)

In [None]:
# Additional changing of datatypes for index column (Integer due to it being categorical)
df_verkaeufe = df_verkaeufe.astype({'index': int})

## Check dataframes

In [None]:
# Preview the first rows, then print the column/dtype summary.
# DataFrame.info() prints its report itself and returns None, so it is called directly
# instead of being wrapped in display(), which would add a stray "None" to the output.
display(df_lagerbestand.head(2))
df_lagerbestand.info()

In [None]:
# Preview the first rows, then print the column/dtype summary.
# DataFrame.info() prints its report itself and returns None, so it is called directly
# instead of being wrapped in display(), which would add a stray "None" to the output.
display(df_lieferanten.head(2))
df_lieferanten.info()

In [None]:
# Preview the first rows, then print the column/dtype summary.
# DataFrame.info() prints its report itself and returns None, so it is called directly
# instead of being wrapped in display(), which would add a stray "None" to the output.
display(df_verkaeufe.head(2))
df_verkaeufe.info()

# Merging tables to df_master

In [None]:
# Merging df_lagerbestand and df_lieferanten
# Left join: every stock row is kept, supplier columns are added where 'lfnr' matches
df_master = pd.merge(df_lagerbestand, df_lieferanten, how='left', on='lfnr')

In [None]:
# Merging df_verkaeufe on df_master
# Left join: articles without a matching sales row keep NaN in the '*_vk' columns.
# NOTE(review): the free-text 'beschreibung' is part of the join key - confirm that the
# descriptions are identical in both sources, otherwise sales rows are silently not matched.
df_master = pd.merge(
    df_master,
    df_verkaeufe,
    how='left',
    on=['lfnr', 'artnr', 'index', 'beschreibung'],
)

In [None]:
# Adjusting column positions
new_column_order = []

# Article identification and master data
new_column_order += [
    'lfnr', 'lieferant', 'artnr', 'beschreibung', 'index', 'bestellkennzeichen',
    'verp_einheit', 'stat_gruppe', 'ltz_vk_ges', 'basispreis', 'basispr_summe',
]

# Stock per location, each followed by its 'ltz_vk_*' date column
new_column_order += [
    'gesamt_lager',
    'wen_lager', 'ltz_vk_wen',
    'rgb_lager', 'ltz_vk_rgb',
    'amb_lager', 'ltz_vk_amb',
    'cha_lager', 'ltz_vk_cha',
    'str_lager', 'ltz_vk_str',
    'pas_lager', 'ltz_vk_pas',
    'lan_lager', 'ltz_vk_lan',
    'müh_lager', 'ltz_vk_müh',
    'ros_lager', 'ltz_vk_ros',
]

# Sales per location
new_column_order += [
    'gesamt_vk', 'wen_vk', 'rgb_vk', 'str_vk', 'pas_vk',
    'amb_vk', 'cha_vk', 'lan_vk', 'müh_vk', 'ros_vk',
]

# reindex keeps exactly these columns in this order; a name missing from df_master
# would show up as an all-NaN column rather than raise an error
df_master = df_master.reindex(columns=new_column_order)

## Check df_master

In [None]:
# Preview the first rows, then print the column/dtype summary.
# DataFrame.info() prints its report itself and returns None, so it is called directly
# instead of being wrapped in display(), which would add a stray "None" to the output.
display(df_master.head(2))
df_master.info()

# Creating visualizations

## Visualize quality stock

In [None]:
# Create a bar chart with subplots to display overall and subsidiary's number of articles in stock categorized into number of sales

locations = {'gesamt': 'Gesamt', 
             'wen': 'Weiden', 
             'rgb': 'Regensburg', 
             'amb': 'Amberg', 
             'cha': 'Cham', 
             'str': 'Straubing', 
             'pas': 'Passau', 
             'lan': 'Landshut', 
             'müh': 'Mühldorf', 
             'ros': 'Rosenheim'}

PE_categories = ['In stock, 4+ sales', 'In stock, 3 sales', 'In stock, 2 sales', 'In stock, 1 sale', 'In stock, 0 sales']
display_order_quality = PE_categories

# Classify every article per location by its sales count, considering only articles in stock
for x in locations.keys():
    in_stock = df_master[x+'_lager'] > 0
    sales = df_master[x+'_vk']
    PE_condition = [
        in_stock & (sales > 3),
        in_stock & (sales == 3),
        in_stock & (sales == 2),
        in_stock & (sales == 1),
        in_stock & (sales == 0)
    ]

    # Articles matching no condition get the string '0', which the plotting loop filters out.
    # The default is passed explicitly as a string because np.select's implicit integer 0
    # has no common dtype with the string categories on recent NumPy versions.
    df_master[x+'_quality'] = np.select(PE_condition, PE_categories, default='0')

fig, axes = plt.subplots(4, 3, figsize=(25,20))
fig.suptitle('Warehouse management quality stock', fontweight='bold', fontsize=30)
fig.tight_layout(pad=5.0)

# Grid layout: the overall chart sits in the middle of the top row, the nine subsidiaries
# fill rows 1-3. The two remaining top-row axes stay unused and are hidden.
grid_positions = [(0, 1)] + [(row, col) for row in range(1, 4) for col in range(3)]
axes[0, 0].axis('off')
axes[0, 2].axis('off')

# Every chart is drawn exactly once (drawing the same bars repeatedly only stacks
# identical artists on top of each other).
for (y, z), (key, value) in zip(grid_positions, locations.items()):
    location = df_master.query(key + '_quality != "0"')

    sub = sns.countplot(ax=axes[y, z], x=key + '_quality', data=location, order=display_order_quality)
    axes[y, z].set_title("Qualität " + value, fontsize=15.0)
    axes[y, z].set_xlabel('Qualität', fontsize=10.0)
    axes[y, z].set_ylabel('Anzahl', fontsize=10.0)
    # Total number of categorised articles for this location
    total = location[key + '_quality'].count()

    # Article count per category in display order; categories without articles count as 0
    freq_series = location[key + '_quality'].value_counts()
    freq_series = freq_series.reindex(display_order_quality, fill_value=0)

    # Write the percentage share above each bar
    rects = sub.patches
    labels = [f'{(count/total)*100:.1f}%' if total else '0.0%' for count in freq_series]
    for rect, label in zip(rects, labels):
        height = rect.get_height()
        axes[y, z].text(rect.get_x() + rect.get_width() / 2, height + 5, label,
                ha='center', va='bottom')


## Visualize quality sales

In [None]:
# Create a bar chart with subplots to display overall and subsidiary's number of articles categorized into number of sales and if in or out of stock 
locations = {'gesamt': 'Gesamt', 
             'wen': 'Weiden', 
             'rgb': 'Regensburg', 
             'amb': 'Amberg', 
             'cha': 'Cham', 
             'str': 'Straubing', 
             'pas': 'Passau', 
             'lan': 'Landshut', 
             'müh': 'Mühldorf', 
             'ros': 'Rosenheim'}

PE_categories = ['4+ sales, in stock', '4+ sales, no stock', '1-3 sales, in stock', '1-3 sales, no stock', '0 sales, in stock']
display_order_quality = PE_categories

# Classify every article per location by sales count and stock availability
for x in locations.keys():
    in_stock = df_master[x+'_lager'] > 0
    no_stock = df_master[x+'_lager'] == 0
    sales = df_master[x+'_vk']
    # "1-3 sales" includes exactly 3 sales; with '< 3' those articles matched no category at all
    low_sales = (sales > 0) & (sales <= 3)
    PE_condition = [
        in_stock & (sales > 3),
        no_stock & (sales > 3),
        in_stock & low_sales,
        no_stock & low_sales,
        in_stock & (sales == 0)
    ]

    # Articles matching no condition get the string '0', which the plotting loop filters out.
    # The default is passed explicitly as a string because np.select's implicit integer 0
    # has no common dtype with the string categories on recent NumPy versions.
    df_master[x+'_quality'] = np.select(PE_condition, PE_categories, default='0')

fig, axes = plt.subplots(4, 3, figsize=(25,20))
fig.suptitle('Warehouse management quality sales', fontweight='bold', fontsize=30)
fig.tight_layout(pad=5.0)

# Grid layout: the overall chart sits in the middle of the top row, the nine subsidiaries
# fill rows 1-3. The two remaining top-row axes stay unused and are hidden.
grid_positions = [(0, 1)] + [(row, col) for row in range(1, 4) for col in range(3)]
axes[0, 0].axis('off')
axes[0, 2].axis('off')

# Every chart is drawn exactly once (drawing the same bars repeatedly only stacks
# identical artists on top of each other).
for (y, z), (key, value) in zip(grid_positions, locations.items()):
    location = df_master.query(key + '_quality != "0"')

    sub = sns.countplot(ax=axes[y, z], x=key + '_quality', data=location, order=display_order_quality)
    axes[y, z].set_title("Qualität " + value, fontsize=15.0)
    axes[y, z].set_xlabel('Qualität', fontsize=10.0)
    axes[y, z].set_ylabel('Anzahl', fontsize=10.0)
    # Total number of categorised articles for this location
    total = location[key + '_quality'].count()

    # Article count per category in display order; categories without articles count as 0
    freq_series = location[key + '_quality'].value_counts()
    freq_series = freq_series.reindex(display_order_quality, fill_value=0)

    # Write the percentage share above each bar
    rects = sub.patches
    labels = [f'{(count/total)*100:.1f}%' if total else '0.0%' for count in freq_series]
    for rect, label in zip(rects, labels):
        height = rect.get_height()
        axes[y, z].text(rect.get_x() + rect.get_width() / 2, height + 5, label,
                ha='center', va='bottom')


# Preparing Distribution Method

In [None]:
locations = {'wen': 'Weiden', 
             'rgb': 'Regensburg', 
             'amb': 'Amberg', 
             'cha': 'Cham', 
             'str': 'Straubing', 
             'pas': 'Passau', 
             'lan': 'Landshut', 
             'müh': 'Mühldorf', 
             'ros': 'Rosenheim'}

PE_categories = ['4+ sales, in stock', '4+ sales, no stock', '1-3 sales, in stock', '1-3 sales, no stock', '0 sales, in stock']
display_order_quality = PE_categories

# Classify the overall figures and every subsidiary BEFORE filtering, so this cell does not
# depend on which earlier cell happened to write the '*_quality' columns last.
for x in ['gesamt', *locations.keys()]:
    in_stock = df_master[x+'_lager'] > 0
    no_stock = df_master[x+'_lager'] == 0
    sales = df_master[x+'_vk']
    # "1-3 sales" includes exactly 3 sales; with '< 3' those articles matched no category at all
    low_sales = (sales > 0) & (sales <= 3)
    PE_condition = [
        in_stock & (sales > 3),
        no_stock & (sales > 3),
        in_stock & low_sales,
        no_stock & low_sales,
        in_stock & (sales == 0)
    ]

    # Explicit string default: articles matching no condition are marked '0'
    # (np.select's implicit integer 0 has no common dtype with the string categories on recent NumPy).
    df_master[x+'_quality'] = np.select(PE_condition, PE_categories, default='0')

# Keep only articles with an overall category. The explicit copy makes the new
# 'take_from_*' columns land in an independent dataframe instead of a slice of df_master.
df_master_quality = df_master.query('gesamt_quality != "0"').copy()

# Per article: comma-separated codes of all subsidiaries that sell well (4+) but have no stock.
# This text is the same for every source subsidiary, so it is built once per row.
quality_columns = [k + '_quality' for k in locations]
receivers = pd.Series(
    [', '.join(k for k, quality in zip(locations, qualities) if quality == '4+ sales, no stock')
     for qualities in df_master_quality[quality_columns].itertuples(index=False, name=None)],
    index=df_master_quality.index,
    dtype=object,
)

# A subsidiary can give stock away when it holds the article but sells it at most 3 times;
# for every other subsidiary the column holds '-'.
for key, value in locations.items():
    has_surplus = df_master_quality[key + '_quality'].isin(['1-3 sales, in stock', '0 sales, in stock'])
    df_master_quality['take_from_' + key] = receivers.where(has_surplus, '-')

# Drop articles for which no subsidiary has anything to redistribute ('-' or empty everywhere)
df_master_quality_final = df_master_quality[~(df_master_quality.filter(like='take_from_').isin(['-', ''])).all(axis=1)]

# Code for Distribution Method


In [None]:
# Work on an independent copy: plain assignment would only create a second name for the same
# dataframe, so the column rewrites below would also change df_master_quality_final and this
# cell could not be run a second time.
df_master_quality_distribution = df_master_quality_final.copy()

def count_list_elements(x):
    """Return the number of entries in ``x``, counting the placeholder list ``['-']`` as 0."""
    return 0 if x == ['-'] else len(x)
    

def assigning(row):
    """Label every entry of ``row['list']`` with the quantity assigned to it.

    Each entry becomes ``"<entry> (<quantity>)"`` with the quantity formatted without
    decimals. The quantity is ``row['dividing']``; the entry equal to
    ``row['best_sales']`` additionally receives ``row['remainder']``.

    Returns:
        list[str]: one label per entry, in the order of ``row['list']``.
    """
    share = row['dividing']
    top_seller = row['best_sales']

    labelled = []
    for site in row['list']:
        if site == top_seller:
            # The best-selling entry also gets whatever is left over after the even split
            labelled.append(f"{site} ({share + row['remainder']:.0f})")
        else:
            labelled.append(f"{site} ({share:.0f})")
    return labelled

def best_sale(row):
    """Return the location code from ``row['list']`` with the highest sales figure.

    The sales figure of a code ``k`` is read from ``row[k + '_vk']``. Figures stored as
    strings are compared by their float value. On a tie the code that appears first
    in ``row['list']`` wins.

    Args:
        row: pandas Series holding a 'list' entry and the '<code>_vk' sales entries.

    Returns:
        str: the best-selling code, or '-' when the list is the placeholder ``['-']``
        or when none of the listed codes has a usable sales figure.
    """
    candidates = row['list']
    if candidates == ['-']:
        return '-'

    best_location = None
    best_value = None
    for location in candidates:
        column = location + '_vk'
        # Skip placeholders and codes without a sales column
        if location == '-' or column not in row.index:
            continue
        raw_value = row[column]
        # Missing figures (None / NaN) cannot be ranked
        if raw_value is None or pd.isna(raw_value):
            continue
        value = float(raw_value) if isinstance(raw_value, str) else raw_value
        # Strict '>' keeps the first code on a tie
        if best_value is None or value > best_value:
            best_location, best_value = location, value

    # No rankable candidate: report the placeholder instead of raising on an empty list
    return best_location if best_location is not None else '-'
            
def calculate_stock(row):
    """Sum all quantities written as ``(<digits>)`` in the row's 'take_from_*' entries.

    One 'take_from_<key>' entry is read per key of the module-level ``locations``
    mapping. Entries that are not strings or that equal '-' contribute nothing.

    Returns:
        int: the total of all parenthesised integers found.
    """
    quantity_pattern = r'\(\d+\)'
    total_stock = 0
    for key in locations.keys():
        entry = row['take_from_' + key]
        if not isinstance(entry, str) or entry == '-':
            continue
        # Each hit looks like "(12)"; strip the parentheses before converting
        total_stock += sum(int(hit[1:-1]) for hit in re.findall(quantity_pattern, entry))
    # Weighting by unit price is currently disabled: total_stock *= row['basispreis']
    return total_stock

def renaming(row, key):
    """Replace location codes in ``row['take_from_' + key]`` by their display names.

    Codes and names come from the module-level ``locations`` mapping; every occurrence
    of a code in the text is substituted by the mapped name.

    Returns:
        str: the text with all codes replaced.
    """
    code_pattern = '|'.join(locations.keys())

    def full_name(match):
        # Look up the display name of the code that was matched
        return locations[match.group(0)]

    return re.sub(code_pattern, full_name, row['take_from_' + key])

def formating(row, key):
    """Return ``row['take_from_' + key]`` with a line break inserted after every comma."""
    parts = row['take_from_' + key].split(',')
    return ',\n'.join(parts)

# Turn every 'take_from_<key>' column from a plain list of receiving locations into a
# formatted text with the quantity each receiver gets.
# NOTE: each pass overwrites 'take_from_<key>' with its formatted result, so this loop
# must run exactly once on freshly prepared 'take_from_*' columns.
for key in locations:
    # Receiving locations of this source location ('-' becomes ['-'])
    df_master_quality_distribution['list'] = df_master_quality_distribution['take_from_' + key].apply(lambda x: x.split(', '))
    # Number of receivers (0 for the '-' placeholder)
    df_master_quality_distribution['numbers'] = df_master_quality_distribution['list'].apply(count_list_elements)
    # Even split of the source stock; with 0 receivers the division yields nan/inf,
    # which is mapped back to '-' further down
    df_master_quality_distribution['dividing'] = (df_master_quality_distribution[key +'_lager']/df_master_quality_distribution['numbers']).apply(np.floor)
    df_master_quality_distribution['remainder'] = (df_master_quality_distribution[key + '_lager']%df_master_quality_distribution['numbers'])
    # The receiver with the highest sales additionally gets the remainder
    df_master_quality_distribution['best_sales'] = df_master_quality_distribution.apply(best_sale, axis=1)
    df_master_quality_distribution['locations'] = df_master_quality_distribution.apply(assigning, axis=1)
    df_master_quality_distribution['locations'] = [','.join(map(str, l)) for l in df_master_quality_distribution['locations']]
    # Rows without receivers end up as "- (nan)" / "- (inf)"; restore the plain placeholder
    df_master_quality_distribution['locations'] = df_master_quality_distribution['locations'].replace(["- (nan)", "- (inf)", "- (-inf)"], "-")
    df_master_quality_distribution['take_from_' + key] = df_master_quality_distribution['locations']
    # Replace location codes by display names, then put every receiver on its own line
    df_master_quality_distribution['take_from_' + key] = df_master_quality_distribution.apply(renaming, axis=1, key=key)
    df_master_quality_distribution['take_from_' + key] = df_master_quality_distribution.apply(formating, axis=1, key=key)

# Total redistributable quantity per article. It reads only the finished 'take_from_*'
# columns, so it is computed once after the loop instead of being overwritten on every pass.
df_master_quality_distribution['stock'] = df_master_quality_distribution.apply(calculate_stock, axis=1)

# Cleaning and Sorting

In [None]:
def keep_cols(DataFrame, keep_these):
    """Return a copy of ``DataFrame`` reduced to the columns named in ``keep_these``.

    Every other column is dropped; the remaining columns keep their original order.
    Names in ``keep_these`` that are not columns of the frame are ignored.
    """
    unwanted = set(DataFrame.columns).difference(keep_these)
    return DataFrame.drop(columns=list(unwanted))

# Columns of the final report: article identification plus one 'take_from_*' column per location
take_from = ['lieferant', 'artnr', 'beschreibung'] + [f'take_from_{key}' for key in locations.keys()]

# Largest redistributable quantity first; sorting and re-numbering happen in place
df_master_quality_distribution.sort_values(by='stock', ascending=False, inplace=True)
df_master_quality_distribution.reset_index(inplace=True, drop=True)
df_master_quality_output = keep_cols(df_master_quality_distribution, take_from)

# Printing result

In [None]:
# Render the report. 'white-space: pre-wrap' makes the line breaks contained in the
# 'take_from_*' texts visible; HTML tables otherwise collapse them into single spaces.
df_master_quality_output.style.set_properties(**{'white-space': 'pre-wrap'})