# Setup

In [1]:
# Import Modules

import pandas as pd
import sqlalchemy
import sql_functions as sf

In [2]:
# Connection settings shared by the queries in this notebook

# Credentials are loaded from a .env file by the helper and returned as a dictionary
sql_config = sf.get_sql_config()

# SQLAlchemy engine object. The loaded credentials are handed over as connect_args.
# NOTE(review): the URL looks like a placeholder; presumably the real connection
# parameters come from sql_config - confirm.
engine = sqlalchemy.create_engine(
    'postgresql://user:pass@host/database',
    connect_args=sql_config,
)

# Schema in our PostgreSQL database that is used to qualify the table names below
schema = 'capstone_kueblbeck'

In [3]:
# Display settings

# Render up to 30 columns when a DataFrame is shown
pd.set_option("display.max_columns", 30)

# Loading Dataframes

In [4]:
# Load every row of the `lagerbestand` table from the project schema and preview the first rows
sql_query = f'select * from {schema}.lagerbestand'
df_lagerbestand = sf.get_dataframe(sql_query)
df_lagerbestand.head()

Unnamed: 0,Lfnr,Artnr,Index,Beschr.,BKZ,VPE,St.gr.,Ltz. VK ges.,Basispreis,Basispr. Summe,Gesamt,WEN,Ltz. VK WEN,RGB,Ltz. VK RGB,AMB,Ltz. VK AMB,CHA,Ltz. VK CHA,STR,Ltz. VK STR,PAS,Ltz. VK PAS,LAN,Ltz. VK LAN,MÜH,Ltz. VK MÜH,ROS,Ltz. VK ROS
0,Summe,Summe,Summe,Summe,Summe,Summe,Summe,03.06.2023,"22.137.471,28","9.973.936,32",1.084.309,763.512,03.06.2023,61.066,02.06.2023,31.034,02.06.2023,24.471,02.06.2023,55.651,02.06.2023,39.311,02.06.2023,31.628,02.06.2023,37.164,02.06.2023,40.366,02.06.2023
1,430,08.607.83,000,"N CORSA D 1,0L 44KW BJ 2006",8,1,L20,07.11.2018,7536,000,0,0.0,-,0.0,07.11.2018,0.0,08.08.2017,0.0,29.04.2016,0.0,14.12.2017,0.0,-,0.0,31.08.2017,0.0,-,0.0,26.03.2014
2,430,08.607.81,000,"M CORSA D 1,4L 66KW BJ 2006",8,1,L20,04.08.2021,9363,000,0,0.0,04.08.2021,0.0,10.07.2017,0.0,26.04.2021,0.0,28.04.2021,0.0,13.09.2018,0.0,31.07.2019,0.0,24.06.2016,0.0,-,0.0,04.05.2015
3,430,08.607.85,000,"N CORSA D 1,4L 66KW BJ 2006",8,1,L20,14.07.2021,7695,000,0,0.0,14.07.2021,0.0,-,0.0,26.04.2021,0.0,-,0.0,29.09.2020,0.0,31.07.2019,0.0,07.11.2018,0.0,-,0.0,-
4,430,08.607.80,000,"M CORSA D 1,0L 44KW BJ 2006",8,1,L20,14.12.2017,8007,000,0,0.0,24.09.2013,0.0,-,0.0,15.05.2015,0.0,29.04.2016,0.0,14.12.2017,0.0,-,0.0,-,0.0,-,0.0,19.10.2015


In [5]:
# Load every row of the `lieferanten` table from the project schema and preview the first rows
sql_query = f'select * from {schema}.lieferanten'
df_lieferanten = sf.get_dataframe(sql_query)
df_lieferanten.head()

Unnamed: 0,Lfnr,Beschreibung
0,,
1,0.0,BOSCH
2,1.0,HELLA
3,2.0,BOSCH-TELECOM
4,3.0,BASF KÜHLERFROSTSCHUTZ


In [6]:
# Load every row of the `verkäufe` table from the project schema and preview the first rows
sql_query = f'select * from {schema}.verkäufe'
df_verkaeufe = sf.get_dataframe(sql_query)
df_verkaeufe.head()

Unnamed: 0,Lfr.,Art.nr.,Ind.,Beschreibung,Gesamt,WEN,RGB,STR,PAS,AMB,CHA,LAN,MÜH,ROS
0,,,,,1.735.669,1.031.753,91.482,140.461,88.082,94.059,40.096,91.864,83.465,74.405
1,0.0,1928498680.0,0.0,BUCHSENKONTAKT,38.400,38.400,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,52.0,50266496.0,0.0,"KABELBAND 300X4,8MM SCHWARZ",28.200,25.000,0.0,0.0,0.0,0.0,1.4,0.0,1.5,300.0
3,0.0,1928405459.0,0.0,BLINDKONTAKT,24.000,24.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,52.0,50266493.0,0.0,"KABELBAND 390X4,8MM SCHWARZ",27.722,22.900,0.0,0.0,0.0,210.0,702.0,1.0,1.6,1.31


# Data Cleaning

## df_lagerbestand

In [7]:
# Preview the first rows of the raw table
display(df_lagerbestand.head())

# info() prints its report itself and returns None, so it is called bare:
# wrapping it in display() would render a stray "None" below the report
df_lagerbestand.info()

Unnamed: 0,Lfnr,Artnr,Index,Beschr.,BKZ,VPE,St.gr.,Ltz. VK ges.,Basispreis,Basispr. Summe,Gesamt,WEN,Ltz. VK WEN,RGB,Ltz. VK RGB,AMB,Ltz. VK AMB,CHA,Ltz. VK CHA,STR,Ltz. VK STR,PAS,Ltz. VK PAS,LAN,Ltz. VK LAN,MÜH,Ltz. VK MÜH,ROS,Ltz. VK ROS
0,Summe,Summe,Summe,Summe,Summe,Summe,Summe,03.06.2023,"22.137.471,28","9.973.936,32",1.084.309,763.512,03.06.2023,61.066,02.06.2023,31.034,02.06.2023,24.471,02.06.2023,55.651,02.06.2023,39.311,02.06.2023,31.628,02.06.2023,37.164,02.06.2023,40.366,02.06.2023
1,430,08.607.83,000,"N CORSA D 1,0L 44KW BJ 2006",8,1,L20,07.11.2018,7536,000,0,0.0,-,0.0,07.11.2018,0.0,08.08.2017,0.0,29.04.2016,0.0,14.12.2017,0.0,-,0.0,31.08.2017,0.0,-,0.0,26.03.2014
2,430,08.607.81,000,"M CORSA D 1,4L 66KW BJ 2006",8,1,L20,04.08.2021,9363,000,0,0.0,04.08.2021,0.0,10.07.2017,0.0,26.04.2021,0.0,28.04.2021,0.0,13.09.2018,0.0,31.07.2019,0.0,24.06.2016,0.0,-,0.0,04.05.2015
3,430,08.607.85,000,"N CORSA D 1,4L 66KW BJ 2006",8,1,L20,14.07.2021,7695,000,0,0.0,14.07.2021,0.0,-,0.0,26.04.2021,0.0,-,0.0,29.09.2020,0.0,31.07.2019,0.0,07.11.2018,0.0,-,0.0,-
4,430,08.607.80,000,"M CORSA D 1,0L 44KW BJ 2006",8,1,L20,14.12.2017,8007,000,0,0.0,24.09.2013,0.0,-,0.0,15.05.2015,0.0,29.04.2016,0.0,14.12.2017,0.0,-,0.0,-,0.0,-,0.0,19.10.2015


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 357830 entries, 0 to 357829
Data columns (total 29 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Lfnr            357830 non-null  object 
 1   Artnr           357830 non-null  object 
 2   Index           357830 non-null  object 
 3   Beschr.         357830 non-null  object 
 4   BKZ             357830 non-null  object 
 5   VPE             357830 non-null  object 
 6   St.gr.          357830 non-null  object 
 7   Ltz. VK ges.    357830 non-null  object 
 8   Basispreis      357830 non-null  object 
 9   Basispr. Summe  357830 non-null  object 
 10  Gesamt          357830 non-null  object 
 11  WEN             357830 non-null  float64
 12  Ltz. VK WEN     357830 non-null  object 
 13  RGB             357830 non-null  float64
 14  Ltz. VK RGB     357830 non-null  object 
 15  AMB             357830 non-null  float64
 16  Ltz. VK AMB     357830 non-null  object 
 17  CHA       

None

In [8]:
# Normalise column names: lower-case, spaces become underscores, dots are removed

df_lagerbestand.columns = [
    name.lower().replace(" ", "_").replace(".", "")
    for name in df_lagerbestand.columns
]
df_lagerbestand.head()

Unnamed: 0,lfnr,artnr,index,beschr,bkz,vpe,stgr,ltz_vk_ges,basispreis,basispr_summe,gesamt,wen,ltz_vk_wen,rgb,ltz_vk_rgb,amb,ltz_vk_amb,cha,ltz_vk_cha,str,ltz_vk_str,pas,ltz_vk_pas,lan,ltz_vk_lan,müh,ltz_vk_müh,ros,ltz_vk_ros
0,Summe,Summe,Summe,Summe,Summe,Summe,Summe,03.06.2023,"22.137.471,28","9.973.936,32",1.084.309,763.512,03.06.2023,61.066,02.06.2023,31.034,02.06.2023,24.471,02.06.2023,55.651,02.06.2023,39.311,02.06.2023,31.628,02.06.2023,37.164,02.06.2023,40.366,02.06.2023
1,430,08.607.83,000,"N CORSA D 1,0L 44KW BJ 2006",8,1,L20,07.11.2018,7536,000,0,0.0,-,0.0,07.11.2018,0.0,08.08.2017,0.0,29.04.2016,0.0,14.12.2017,0.0,-,0.0,31.08.2017,0.0,-,0.0,26.03.2014
2,430,08.607.81,000,"M CORSA D 1,4L 66KW BJ 2006",8,1,L20,04.08.2021,9363,000,0,0.0,04.08.2021,0.0,10.07.2017,0.0,26.04.2021,0.0,28.04.2021,0.0,13.09.2018,0.0,31.07.2019,0.0,24.06.2016,0.0,-,0.0,04.05.2015
3,430,08.607.85,000,"N CORSA D 1,4L 66KW BJ 2006",8,1,L20,14.07.2021,7695,000,0,0.0,14.07.2021,0.0,-,0.0,26.04.2021,0.0,-,0.0,29.09.2020,0.0,31.07.2019,0.0,07.11.2018,0.0,-,0.0,-
4,430,08.607.80,000,"M CORSA D 1,0L 44KW BJ 2006",8,1,L20,14.12.2017,8007,000,0,0.0,24.09.2013,0.0,-,0.0,15.05.2015,0.0,29.04.2016,0.0,14.12.2017,0.0,-,0.0,-,0.0,-,0.0,19.10.2015


In [17]:
# Parse the "last sale" columns from day.month.year strings into datetime values
date_columns = [
    "ltz_vk_ges",
    "ltz_vk_wen",
    "ltz_vk_rgb",
    "ltz_vk_amb",
    "ltz_vk_cha",
    "ltz_vk_str",
    "ltz_vk_pas",
    "ltz_vk_lan",
    "ltz_vk_müh",
    "ltz_vk_ros",
]

for column in date_columns:
    # errors='coerce' turns anything that does not match the format (e.g. the "-" placeholder) into NaT
    df_lagerbestand[column] = pd.to_datetime(
        df_lagerbestand[column],
        errors='coerce',
        format='%d.%m.%Y',
    )

df_lagerbestand.head()


Unnamed: 0,lfnr,artnr,index,beschr,bkz,vpe,stgr,ltz_vk_ges,basispreis,basispr_summe,gesamt,wen,ltz_vk_wen,rgb,ltz_vk_rgb,amb,ltz_vk_amb,cha,ltz_vk_cha,str,ltz_vk_str,pas,ltz_vk_pas,lan,ltz_vk_lan,müh,ltz_vk_müh,ros,ltz_vk_ros
0,,Summe,Summe,Summe,Summe,,Summe,2023-06-03,22137471.28,9973936.32,1084309,763.512,2023-06-03,61.066,2023-06-02,31.034,2023-06-02,24.471,2023-06-02,55.651,2023-06-02,39.311,2023-06-02,31.628,2023-06-02,37.164,2023-06-02,40.366,2023-06-02
1,430.0,08.607.83,000,"N CORSA D 1,0L 44KW BJ 2006",8,1.0,L20,2018-11-07,75.36,0.0,0,0.0,NaT,0.0,2018-11-07,0.0,2017-08-08,0.0,2016-04-29,0.0,2017-12-14,0.0,NaT,0.0,2017-08-31,0.0,NaT,0.0,2014-03-26
2,430.0,08.607.81,000,"M CORSA D 1,4L 66KW BJ 2006",8,1.0,L20,2021-08-04,93.63,0.0,0,0.0,2021-08-04,0.0,2017-07-10,0.0,2021-04-26,0.0,2021-04-28,0.0,2018-09-13,0.0,2019-07-31,0.0,2016-06-24,0.0,NaT,0.0,2015-05-04
3,430.0,08.607.85,000,"N CORSA D 1,4L 66KW BJ 2006",8,1.0,L20,2021-07-14,76.95,0.0,0,0.0,2021-07-14,0.0,NaT,0.0,2021-04-26,0.0,NaT,0.0,2020-09-29,0.0,2019-07-31,0.0,2018-11-07,0.0,NaT,0.0,NaT
4,430.0,08.607.80,000,"M CORSA D 1,0L 44KW BJ 2006",8,1.0,L20,2017-12-14,80.07,0.0,0,0.0,2013-09-24,0.0,NaT,0.0,2015-05-15,0.0,2016-04-29,0.0,2017-12-14,0.0,NaT,0.0,NaT,0.0,NaT,0.0,2015-10-19


In [10]:
# Change selected number columns from string to float type
float_columns = ["lfnr", "vpe", "basispreis", "basispr_summe", "gesamt"]

for column in float_columns:
    # Already-converted columns have no .str accessor; skipping them keeps the
    # cell safe to re-run (e.g. after out-of-order execution)
    if pd.api.types.is_numeric_dtype(df_lagerbestand[column]):
        continue

    # German number format: "." groups thousands, "," is the decimal mark.
    # regex=False makes "." a literal dot on every pandas version and avoids the
    # FutureWarning about the changing regex default.
    # errors='coerce' turns non-numeric text (e.g. the "Summe" marker in the first row) into NaN.
    df_lagerbestand[column] = pd.to_numeric(
        df_lagerbestand[column]
        .str.replace('.', '', regex=False)
        .str.replace(',', '.', regex=False),
        errors='coerce',
    )

  df_lagerbestand[column] = pd.to_numeric(df_lagerbestand[column].str.replace('.', '').str.replace(',', '.'), errors='coerce')


In [11]:
# Summary statistics of the numeric columns.
# NOTE(review): the first row looks like a totals row ("Summe") and is included in
# these statistics - consider excluding it before interpreting mean/max; confirm.
df_lagerbestand.describe()

Unnamed: 0,lfnr,vpe,basispreis,basispr_summe,gesamt,wen,rgb,amb,cha,str,pas,lan,müh,ros
count,357829.0,357829.0,357830.0,357830.0,357830.0,357830.0,357830.0,357830.0,357830.0,357830.0,357830.0,357830.0,357830.0,357830.0
mean,130.693365,3.200694,123.7318,55.74679,25.04381,1.602232,0.169459,0.093541,0.06852,0.155905,0.113208,0.085663,0.104041,0.121416
std,126.750995,27.255047,37007.66,16676.9,1824.365,17.446878,4.223074,2.732699,1.636756,3.775675,3.138968,2.248246,3.94829,3.925887
min,0.0,0.0,0.0,-68931.0,-8399.0,-862.0,-855.0,-366.0,-50.0,-859.0,-514.0,-22.0,-965.0,-435.0
25%,25.0,0.0,8.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,110.0,1.0,26.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,182.0,1.0,64.0975,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,990.0,5000.0,22137470.0,9973936.0,1084309.0,999.0,780.0,632.0,288.0,775.0,602.0,587.0,756.0,638.0


## df_lieferanten

In [12]:
# Normalise column names: lower-case, spaces become underscores, dots are removed

df_lieferanten.columns = [
    name.lower().replace(" ", "_").replace(".", "")
    for name in df_lieferanten.columns
]
df_lieferanten.head()

Unnamed: 0,lfnr,beschreibung
0,,
1,0.0,BOSCH
2,1.0,HELLA
3,2.0,BOSCH-TELECOM
4,3.0,BASF KÜHLERFROSTSCHUTZ


In [13]:
# Check dtypes and non-null counts of the supplier table
df_lieferanten.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 379 entries, 0 to 378
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   lfnr          378 non-null    float64
 1   beschreibung  376 non-null    object 
dtypes: float64(1), object(1)
memory usage: 6.0+ KB


## df_verkaeufe

In [14]:
# Normalise column names: lower-case, spaces become underscores, dots are removed

df_verkaeufe.columns = [
    name.lower().replace(" ", "_").replace(".", "")
    for name in df_verkaeufe.columns
]
df_verkaeufe.head()

Unnamed: 0,lfr,artnr,ind,beschreibung,gesamt,wen,rgb,str,pas,amb,cha,lan,müh,ros
0,,,,,1.735.669,1.031.753,91.482,140.461,88.082,94.059,40.096,91.864,83.465,74.405
1,0.0,1928498680.0,0.0,BUCHSENKONTAKT,38.400,38.400,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,52.0,50266496.0,0.0,"KABELBAND 300X4,8MM SCHWARZ",28.200,25.000,0.0,0.0,0.0,0.0,1.4,0.0,1.5,300.0
3,0.0,1928405459.0,0.0,BLINDKONTAKT,24.000,24.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,52.0,50266493.0,0.0,"KABELBAND 390X4,8MM SCHWARZ",27.722,22.900,0.0,0.0,0.0,210.0,702.0,1.0,1.6,1.31


In [15]:
# Change selected number columns from string to numeric type
float_columns = ["gesamt", "wen"]

for column in float_columns:
    # Already-converted columns have no .str accessor; skipping them keeps the
    # cell safe to re-run (e.g. after out-of-order execution)
    if pd.api.types.is_numeric_dtype(df_verkaeufe[column]):
        continue

    # German number format: "." groups thousands, "," is the decimal mark.
    # regex=False makes "." a literal dot on every pandas version and avoids the
    # FutureWarning about the changing regex default.
    df_verkaeufe[column] = pd.to_numeric(
        df_verkaeufe[column]
        .str.replace('.', '', regex=False)
        .str.replace(',', '.', regex=False),
        errors='coerce',
    )

# NOTE(review): the remaining branch columns (rgb, str, pas, amb, cha, lan, müh, ros)
# arrive as float64 and are not converted here. Values such as 91.482 next to a
# "gesamt" of 1.735.669 suggest the thousands dot was read as a decimal point on
# import; that cannot be repaired from the floats - confirm against the source data.

  df_verkaeufe[column] = pd.to_numeric(df_verkaeufe[column].str.replace('.', '').str.replace(',', '.'), errors='coerce')


In [16]:
# Verify dtypes and non-null counts after the numeric conversion above
df_verkaeufe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66010 entries, 0 to 66009
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   lfr           66009 non-null  float64
 1   artnr         66009 non-null  object 
 2   ind           66009 non-null  float64
 3   beschreibung  66009 non-null  object 
 4   gesamt        66010 non-null  int64  
 5   wen           66010 non-null  int64  
 6   rgb           66010 non-null  float64
 7   str           66010 non-null  float64
 8   pas           66010 non-null  float64
 9   amb           66010 non-null  float64
 10  cha           66010 non-null  float64
 11  lan           66010 non-null  float64
 12  müh           66010 non-null  float64
 13  ros           66010 non-null  float64
dtypes: float64(10), int64(2), object(2)
memory usage: 7.1+ MB


In [20]:
# Preview the sales data after the conversion of "gesamt" and "wen"
df_verkaeufe.head()

Unnamed: 0,lfr,artnr,ind,beschreibung,gesamt,wen,rgb,str,pas,amb,cha,lan,müh,ros
0,,,,,1735669,1031753,91.482,140.461,88.082,94.059,40.096,91.864,83.465,74.405
1,0.0,1928498680.0,0.0,BUCHSENKONTAKT,38400,38400,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,52.0,50266496.0,0.0,"KABELBAND 300X4,8MM SCHWARZ",28200,25000,0.0,0.0,0.0,0.0,1.4,0.0,1.5,300.0
3,0.0,1928405459.0,0.0,BLINDKONTAKT,24000,24000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,52.0,50266493.0,0.0,"KABELBAND 390X4,8MM SCHWARZ",27722,22900,0.0,0.0,0.0,210.0,702.0,1.0,1.6,1.31
