# Setup

In [1]:
# Import libraries/modules

import pandas as pd
import sqlalchemy
import sql_functions as sf

In [2]:
# Define global variables

# Credentials dictionary supplied by the project helper
# (per the original author's note it is read from a .env file)
sql_config = sf.get_sql_config()

# SQLAlchemy engine. The URL looks like a pure placeholder; presumably the real
# connection parameters are the ones handed over via connect_args - verify.
engine = sqlalchemy.create_engine(
    'postgresql://user:pass@host/database',
    connect_args=sql_config,
)

# Schema in our PostgreSQL database that the queries below read from
schema = 'capstone_kueblbeck'

In [3]:
# Other settings

# Show up to 30 columns and render floats with thousands separators and two decimals
pd.set_option("display.max_columns", 30)
pd.set_option("display.float_format", "{:,.2f}".format)

# Loading Dataframes

In [4]:
# Load the complete `lagerbestand` table of the project schema into a DataFrame.
# sf.get_dataframe is a project helper; presumably it executes the query against the database - verify.
sql_query = f'select * from {schema}.lagerbestand'
df_lagerbestand = sf.get_dataframe(sql_query)
# Preview the first rows. NOTE(review): row 0 looks like a totals row ("Summe"), not an article.
df_lagerbestand.head()

Unnamed: 0,Lfnr,Artnr,Index,Beschr.,BKZ,VPE,St.gr.,Ltz. VK ges.,Basispreis,Basispr. Summe,Gesamt,WEN,Ltz. VK WEN,RGB,Ltz. VK RGB,AMB,Ltz. VK AMB,CHA,Ltz. VK CHA,STR,Ltz. VK STR,PAS,Ltz. VK PAS,LAN,Ltz. VK LAN,MÜH,Ltz. VK MÜH,ROS,Ltz. VK ROS
0,,Summe,Summe,Summe,Summe,,Summe,2023-06-03,,,1084309.0,763512.0,2023-06-03,61066.0,2023-06-02,31034.0,2023-06-02,24471.0,2023-06-02,55651.0,2023-06-02,39311.0,2023-06-02,31628.0,2023-06-02,37164.0,2023-06-02,40366.0,2023-06-02
1,430.0,08.607.83,000,"N CORSA D 1,0L 44KW BJ 2006",8,1.0,L20,2018-11-07,,,0.0,0.0,NaT,0.0,2018-11-07,0.0,2017-08-08,0.0,2016-04-29,0.0,2017-12-14,0.0,NaT,0.0,2017-08-31,0.0,NaT,0.0,2014-03-26
2,430.0,08.607.81,000,"M CORSA D 1,4L 66KW BJ 2006",8,1.0,L20,2021-08-04,,,0.0,0.0,2021-08-04,0.0,2017-07-10,0.0,2021-04-26,0.0,2021-04-28,0.0,2018-09-13,0.0,2019-07-31,0.0,2016-06-24,0.0,NaT,0.0,2015-05-04
3,430.0,08.607.85,000,"N CORSA D 1,4L 66KW BJ 2006",8,1.0,L20,2021-07-14,,,0.0,0.0,2021-07-14,0.0,NaT,0.0,2021-04-26,0.0,NaT,0.0,2020-09-29,0.0,2019-07-31,0.0,2018-11-07,0.0,NaT,0.0,NaT
4,430.0,08.607.80,000,"M CORSA D 1,0L 44KW BJ 2006",8,1.0,L20,2017-12-14,,,0.0,0.0,2013-09-24,0.0,NaT,0.0,2015-05-15,0.0,2016-04-29,0.0,2017-12-14,0.0,NaT,0.0,NaT,0.0,NaT,0.0,2015-10-19


In [5]:
# Load the complete `lieferanten` table of the project schema into a DataFrame.
# sf.get_dataframe is a project helper; presumably it executes the query against the database - verify.
sql_query = f'select * from {schema}.lieferanten'
df_lieferanten = sf.get_dataframe(sql_query)
# Preview the first rows. NOTE(review): row 0 is completely empty in the stored output.
df_lieferanten.head()

Unnamed: 0,Lfnr,Beschreibung
0,,
1,0.0,BOSCH
2,1.0,HELLA
3,2.0,BOSCH-TELECOM
4,3.0,BASF KÜHLERFROSTSCHUTZ


In [6]:
# Load the complete `verkäufe` table of the project schema into a DataFrame.
# sf.get_dataframe is a project helper; presumably it executes the query against the database - verify.
sql_query = f'select * from {schema}.verkäufe'
df_verkaeufe = sf.get_dataframe(sql_query)
# Preview the first rows. NOTE(review): row 0 has no article identifiers and appears to hold column totals.
df_verkaeufe.head()

Unnamed: 0,Lfr.,Art.nr.,Ind.,Beschreibung,Gesamt,WEN,RGB,STR,PAS,AMB,CHA,LAN,MÜH,ROS
0,,,,,1735669.0,1031753.0,91482.0,140461.0,88082.0,94059.0,40096.0,91864.0,83465.0,74405.0
1,0.0,1928498680.0,0.0,BUCHSENKONTAKT,38400.0,38400.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,52.0,50266496.0,0.0,"KABELBAND 300X4,8MM SCHWARZ",28200.0,25000.0,0.0,0.0,0.0,0.0,1400.0,0.0,1500.0,300.0
3,0.0,1928405459.0,0.0,BLINDKONTAKT,24000.0,24000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,52.0,50266493.0,0.0,"KABELBAND 390X4,8MM SCHWARZ",27722.0,22900.0,0.0,0.0,0.0,210.0,702.0,1000.0,1600.0,1310.0


# Data Cleaning

## df_lagerbestand

In [7]:
# Normalise column names: lower-case, spaces become underscores, dots are dropped

df_lagerbestand.columns = [
    name.lower().replace(" ", "_").replace(".", "")
    for name in df_lagerbestand.columns
]
df_lagerbestand.head()

Unnamed: 0,lfnr,artnr,index,beschr,bkz,vpe,stgr,ltz_vk_ges,basispreis,basispr_summe,gesamt,wen,ltz_vk_wen,rgb,ltz_vk_rgb,amb,ltz_vk_amb,cha,ltz_vk_cha,str,ltz_vk_str,pas,ltz_vk_pas,lan,ltz_vk_lan,müh,ltz_vk_müh,ros,ltz_vk_ros
0,,Summe,Summe,Summe,Summe,,Summe,2023-06-03,,,1084309.0,763512.0,2023-06-03,61066.0,2023-06-02,31034.0,2023-06-02,24471.0,2023-06-02,55651.0,2023-06-02,39311.0,2023-06-02,31628.0,2023-06-02,37164.0,2023-06-02,40366.0,2023-06-02
1,430.0,08.607.83,000,"N CORSA D 1,0L 44KW BJ 2006",8,1.0,L20,2018-11-07,,,0.0,0.0,NaT,0.0,2018-11-07,0.0,2017-08-08,0.0,2016-04-29,0.0,2017-12-14,0.0,NaT,0.0,2017-08-31,0.0,NaT,0.0,2014-03-26
2,430.0,08.607.81,000,"M CORSA D 1,4L 66KW BJ 2006",8,1.0,L20,2021-08-04,,,0.0,0.0,2021-08-04,0.0,2017-07-10,0.0,2021-04-26,0.0,2021-04-28,0.0,2018-09-13,0.0,2019-07-31,0.0,2016-06-24,0.0,NaT,0.0,2015-05-04
3,430.0,08.607.85,000,"N CORSA D 1,4L 66KW BJ 2006",8,1.0,L20,2021-07-14,,,0.0,0.0,2021-07-14,0.0,NaT,0.0,2021-04-26,0.0,NaT,0.0,2020-09-29,0.0,2019-07-31,0.0,2018-11-07,0.0,NaT,0.0,NaT
4,430.0,08.607.80,000,"M CORSA D 1,0L 44KW BJ 2006",8,1.0,L20,2017-12-14,,,0.0,0.0,2013-09-24,0.0,NaT,0.0,2015-05-15,0.0,2016-04-29,0.0,2017-12-14,0.0,NaT,0.0,NaT,0.0,NaT,0.0,2015-10-19


In [8]:
# Check dtypes and non-null counts after the column clean-up.
# NOTE(review): in the stored output basispreis and basispr_summe hold no values at all (0 non-null),
# and counts such as 32767 / 98304 sit suspiciously close to powers-of-two multiples - this may hint
# at values lost during import; confirm upstream.
df_lagerbestand.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 357830 entries, 0 to 357829
Data columns (total 29 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   lfnr           32767 non-null   float64       
 1   artnr          357830 non-null  object        
 2   index          357830 non-null  object        
 3   beschr         357830 non-null  object        
 4   bkz            357830 non-null  object        
 5   vpe            32767 non-null   float64       
 6   stgr           357830 non-null  object        
 7   ltz_vk_ges     338129 non-null  datetime64[ns]
 8   basispreis     0 non-null       object        
 9   basispr_summe  0 non-null       object        
 10  gesamt         325062 non-null  float64       
 11  wen            325062 non-null  float64       
 12  ltz_vk_wen     204993 non-null  datetime64[ns]
 13  rgb            98304 non-null   float64       
 14  ltz_vk_rgb     124823 non-null  datetime64[ns]
 15  

In [9]:
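# NOTE: The conversion below is currently disabled. According to the .info() output above, lfnr, vpe and
# gesamt are already float64, so the `.str` accessor used here would raise on them if re-enabled as is.
# # Change selected number columns from string to float type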
# float_columns = ["lfnr", "vpe", "basispreis", "basispr_summe", "gesamt"]

# for column in float_columns:
#     df_lagerbestand[column] = pd.to_numeric(df_lagerbestand[column].str.replace('.', '').str.replace(',', '.'), errors='coerce')

In [10]:
# Give abbreviated columns descriptive names and tag every stock-quantity
# column (total plus one per location code) with the suffix "_lager"
new_columns = {
    'beschr': 'beschreibung',
    'bkz': 'bestellkennzeichen',
    'vpe': 'verp_einheit',
    'stgr': 'stat_gruppe',
    **{
        col: f'{col}_lager'
        for col in ('gesamt', 'wen', 'rgb', 'str', 'pas', 'amb', 'cha', 'lan', 'müh', 'ros')
    },
}

df_lagerbestand = df_lagerbestand.rename(columns=new_columns)

In [11]:
# Preview the renamed frame, then print its structure.
# DataFrame.info() prints its report itself and returns None, so it is called directly
# instead of being wrapped in display(), which would additionally render a stray "None".
display(df_lagerbestand.head())
df_lagerbestand.info()

Unnamed: 0,lfnr,artnr,index,beschreibung,bestellkennzeichen,verp_einheit,stat_gruppe,ltz_vk_ges,basispreis,basispr_summe,gesamt_lager,wen_lager,ltz_vk_wen,rgb_lager,ltz_vk_rgb,amb_lager,ltz_vk_amb,cha_lager,ltz_vk_cha,str_lager,ltz_vk_str,pas_lager,ltz_vk_pas,lan_lager,ltz_vk_lan,müh_lager,ltz_vk_müh,ros_lager,ltz_vk_ros
0,,Summe,Summe,Summe,Summe,,Summe,2023-06-03,,,1084309.0,763512.0,2023-06-03,61066.0,2023-06-02,31034.0,2023-06-02,24471.0,2023-06-02,55651.0,2023-06-02,39311.0,2023-06-02,31628.0,2023-06-02,37164.0,2023-06-02,40366.0,2023-06-02
1,430.0,08.607.83,000,"N CORSA D 1,0L 44KW BJ 2006",8,1.0,L20,2018-11-07,,,0.0,0.0,NaT,0.0,2018-11-07,0.0,2017-08-08,0.0,2016-04-29,0.0,2017-12-14,0.0,NaT,0.0,2017-08-31,0.0,NaT,0.0,2014-03-26
2,430.0,08.607.81,000,"M CORSA D 1,4L 66KW BJ 2006",8,1.0,L20,2021-08-04,,,0.0,0.0,2021-08-04,0.0,2017-07-10,0.0,2021-04-26,0.0,2021-04-28,0.0,2018-09-13,0.0,2019-07-31,0.0,2016-06-24,0.0,NaT,0.0,2015-05-04
3,430.0,08.607.85,000,"N CORSA D 1,4L 66KW BJ 2006",8,1.0,L20,2021-07-14,,,0.0,0.0,2021-07-14,0.0,NaT,0.0,2021-04-26,0.0,NaT,0.0,2020-09-29,0.0,2019-07-31,0.0,2018-11-07,0.0,NaT,0.0,NaT
4,430.0,08.607.80,000,"M CORSA D 1,0L 44KW BJ 2006",8,1.0,L20,2017-12-14,,,0.0,0.0,2013-09-24,0.0,NaT,0.0,2015-05-15,0.0,2016-04-29,0.0,2017-12-14,0.0,NaT,0.0,NaT,0.0,NaT,0.0,2015-10-19


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 357830 entries, 0 to 357829
Data columns (total 29 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   lfnr                32767 non-null   float64       
 1   artnr               357830 non-null  object        
 2   index               357830 non-null  object        
 3   beschreibung        357830 non-null  object        
 4   bestellkennzeichen  357830 non-null  object        
 5   verp_einheit        32767 non-null   float64       
 6   stat_gruppe         357830 non-null  object        
 7   ltz_vk_ges          338129 non-null  datetime64[ns]
 8   basispreis          0 non-null       object        
 9   basispr_summe       0 non-null       object        
 10  gesamt_lager        325062 non-null  float64       
 11  wen_lager           325062 non-null  float64       
 12  ltz_vk_wen          204993 non-null  datetime64[ns]
 13  rgb_lager           98304 non

None

In [12]:
# Summary statistics of the numeric columns.
# NOTE(review): in the stored output the maxima equal the values of the "Summe" row (index 0), so that
# totals row is part of these statistics and inflates mean/std/max - consider excluding it before
# interpreting the numbers.
df_lagerbestand.describe()

Unnamed: 0,lfnr,verp_einheit,gesamt_lager,wen_lager,rgb_lager,amb_lager,cha_lager,str_lager,pas_lager,lan_lager,müh_lager,ros_lager
count,32767.0,32767.0,325062.0,325062.0,98304.0,98304.0,32768.0,95686.0,98304.0,62918.0,32768.0,98304.0
mean,231.4,8.36,6.59,4.65,0.72,0.38,0.77,0.8,0.48,0.63,1.17,0.48
std,147.74,25.56,1903.0,1340.31,194.94,99.19,135.18,180.02,125.75,126.21,205.3,129.34
min,1.0,0.0,-8399.0,-3453.0,-1305.0,-1328.0,-4.0,-1010.0,-2487.0,-15.0,-1.0,-2798.0
25%,134.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,137.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,419.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,900.0,100.0,1084309.0,763512.0,61066.0,31034.0,24471.0,55651.0,39311.0,31628.0,37164.0,40366.0


## df_lieferanten

In [13]:
# Normalise column names: lower-case, spaces become underscores, dots are dropped

df_lieferanten.columns = [
    name.lower().replace(" ", "_").replace(".", "")
    for name in df_lieferanten.columns
]
df_lieferanten.head()

Unnamed: 0,lfnr,beschreibung
0,,
1,0.0,BOSCH
2,1.0,HELLA
3,2.0,BOSCH-TELECOM
4,3.0,BASF KÜHLERFROSTSCHUTZ


In [14]:
# Check dtypes and non-null counts of the supplier table
# (the stored output shows a few rows without lfnr and/or beschreibung)
df_lieferanten.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 379 entries, 0 to 378
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   lfnr          378 non-null    float64
 1   beschreibung  376 non-null    object 
dtypes: float64(1), object(1)
memory usage: 6.0+ KB


## df_verkaeufe

In [15]:
# Normalise column names: lower-case, spaces become underscores, dots are dropped

df_verkaeufe.columns = [
    name.lower().replace(" ", "_").replace(".", "")
    for name in df_verkaeufe.columns
]
df_verkaeufe.head()

Unnamed: 0,lfr,artnr,ind,beschreibung,gesamt,wen,rgb,str,pas,amb,cha,lan,müh,ros
0,,,,,1735669.0,1031753.0,91482.0,140461.0,88082.0,94059.0,40096.0,91864.0,83465.0,74405.0
1,0.0,1928498680.0,0.0,BUCHSENKONTAKT,38400.0,38400.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,52.0,50266496.0,0.0,"KABELBAND 300X4,8MM SCHWARZ",28200.0,25000.0,0.0,0.0,0.0,0.0,1400.0,0.0,1500.0,300.0
3,0.0,1928405459.0,0.0,BLINDKONTAKT,24000.0,24000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,52.0,50266493.0,0.0,"KABELBAND 390X4,8MM SCHWARZ",27722.0,22900.0,0.0,0.0,0.0,210.0,702.0,1000.0,1600.0,1310.0


In [16]:
# Align the key columns with the naming used in df_lagerbestand (lfnr, index) and tag
# every sales-quantity column (total plus one per location code) with the suffix "_vk"
new_columns = {
    'lfr': 'lfnr',
    'ind': 'index',
    **{
        col: f'{col}_vk'
        for col in ('gesamt', 'wen', 'rgb', 'str', 'pas', 'amb', 'cha', 'lan', 'müh', 'ros')
    },
}

df_verkaeufe = df_verkaeufe.rename(columns=new_columns)

In [17]:
# Preview the renamed frame, then print its structure.
# DataFrame.info() prints its report itself and returns None, so it is called directly
# instead of being wrapped in display(), which would additionally render a stray "None".
display(df_verkaeufe.head())
df_verkaeufe.info()

Unnamed: 0,lfnr,artnr,index,beschreibung,gesamt_vk,wen_vk,rgb_vk,str_vk,pas_vk,amb_vk,cha_vk,lan_vk,müh_vk,ros_vk
0,,,,,1735669.0,1031753.0,91482.0,140461.0,88082.0,94059.0,40096.0,91864.0,83465.0,74405.0
1,0.0,1928498680.0,0.0,BUCHSENKONTAKT,38400.0,38400.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,52.0,50266496.0,0.0,"KABELBAND 300X4,8MM SCHWARZ",28200.0,25000.0,0.0,0.0,0.0,0.0,1400.0,0.0,1500.0,300.0
3,0.0,1928405459.0,0.0,BLINDKONTAKT,24000.0,24000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,52.0,50266493.0,0.0,"KABELBAND 390X4,8MM SCHWARZ",27722.0,22900.0,0.0,0.0,0.0,210.0,702.0,1000.0,1600.0,1310.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66010 entries, 0 to 66009
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   lfnr          66009 non-null  float64
 1   artnr         66009 non-null  object 
 2   index         66009 non-null  float64
 3   beschreibung  66009 non-null  object 
 4   gesamt_vk     65536 non-null  float64
 5   wen_vk        65536 non-null  float64
 6   rgb_vk        65536 non-null  float64
 7   str_vk        65536 non-null  float64
 8   pas_vk        65536 non-null  float64
 9   amb_vk        65536 non-null  float64
 10  cha_vk        65536 non-null  float64
 11  lan_vk        65536 non-null  float64
 12  müh_vk        65536 non-null  float64
 13  ros_vk        65536 non-null  float64
dtypes: float64(12), object(2)
memory usage: 7.1+ MB


None

# Merging tables into one master table (df_master)