# Setup

In [1]:
# Import libraries/modules

import pandas as pd
import sqlalchemy
import sql_functions as sf

In [2]:
# Global settings shared by every query in this notebook

# Schema in our PostgreSQL database that holds the project tables
schema = 'capstone_kueblbeck'

# Credentials dictionary (per the original author's note, read from a .env file)
sql_config = sf.get_sql_config()

# Connection object called `engine`.
# NOTE(review): the URL looks like a placeholder; presumably the real connection
# parameters are the ones handed over through `connect_args` - confirm.
engine = sqlalchemy.create_engine(
    'postgresql://user:pass@host/database',
    connect_args=sql_config,
)

In [3]:
# Notebook-wide pandas display options
pd.set_option("display.max_columns", 30)                 # show up to 30 columns before truncating
pd.set_option("display.float_format", "{:,.2f}".format)  # thousands separator, two decimals

# Loading Dataframes

In [4]:
# Query every column and row of the `lagerbestand` table in the project schema
# (sf.get_dataframe presumably runs the query and returns a DataFrame) and preview it
sql_query = f'select * from {schema}.lagerbestand'
df_lagerbestand = sf.get_dataframe(sql_query)
df_lagerbestand.head()

Unnamed: 0,Lfnr,Artnr,Index,Beschr.,BKZ,VPE,St.gr.,Ltz. VK ges.,Basispreis,Basispr. Summe,Gesamt,WEN,Ltz. VK WEN,RGB,Ltz. VK RGB,AMB,Ltz. VK AMB,CHA,Ltz. VK CHA,STR,Ltz. VK STR,PAS,Ltz. VK PAS,LAN,Ltz. VK LAN,MÜH,Ltz. VK MÜH,ROS,Ltz. VK ROS
0,430,08.607.83,0,"N CORSA D 1,0L 44KW BJ 2006",8,1,L20,2018-11-07,75.36,0.0,0.0,0.0,NaT,0.0,2018-11-07,0.0,2017-08-08,0.0,2016-04-29,0.0,2017-12-14,0.0,NaT,0.0,2017-08-31,0.0,NaT,0.0,2014-03-26
1,430,08.607.81,0,"M CORSA D 1,4L 66KW BJ 2006",8,1,L20,2021-08-04,93.63,0.0,0.0,0.0,2021-08-04,0.0,2017-07-10,0.0,2021-04-26,0.0,2021-04-28,0.0,2018-09-13,0.0,2019-07-31,0.0,2016-06-24,0.0,NaT,0.0,2015-05-04
2,430,08.607.85,0,"N CORSA D 1,4L 66KW BJ 2006",8,1,L20,2021-07-14,76.95,0.0,0.0,0.0,2021-07-14,0.0,NaT,0.0,2021-04-26,0.0,NaT,0.0,2020-09-29,0.0,2019-07-31,0.0,2018-11-07,0.0,NaT,0.0,NaT
3,430,08.607.80,0,"M CORSA D 1,0L 44KW BJ 2006",8,1,L20,2017-12-14,80.07,0.0,0.0,0.0,2013-09-24,0.0,NaT,0.0,2015-05-15,0.0,2016-04-29,0.0,2017-12-14,0.0,NaT,0.0,NaT,0.0,NaT,0.0,2015-10-19
4,430,08.607.87,0,"N CORSA D 1,3L CDTI 55KW BJ",8,1,L20,2021-01-08,49.98,0.0,0.0,0.0,2021-01-08,0.0,NaT,0.0,NaT,0.0,NaT,0.0,NaT,0.0,NaT,0.0,NaT,0.0,NaT,0.0,NaT


In [5]:
# Query every column and row of the `lieferanten` table in the project schema
# (sf.get_dataframe presumably runs the query and returns a DataFrame) and preview it
sql_query = f'select * from {schema}.lieferanten'
df_lieferanten = sf.get_dataframe(sql_query)
df_lieferanten.head()

Unnamed: 0,Lfnr,Beschreibung
0,0,BOSCH
1,1,HELLA
2,2,BOSCH-TELECOM
3,3,BASF KÜHLERFROSTSCHUTZ
4,4,ALIK


In [6]:
# Query every column and row of the `verkäufe` table in the project schema
# (sf.get_dataframe presumably runs the query and returns a DataFrame) and preview it.
# Note: the table name contains an umlaut, while the DataFrame variable uses the ASCII spelling.
sql_query = f'select * from {schema}.verkäufe'
df_verkaeufe = sf.get_dataframe(sql_query)
df_verkaeufe.head()

Unnamed: 0,Lfr.,Art.nr.,Ind.,Beschreibung,Gesamt,WEN,RGB,STR,PAS,AMB,CHA,LAN,MÜH,ROS
0,0,1928498680,0.0,BUCHSENKONTAKT,38400.0,38400.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,52,50266496,0.0,"KABELBAND 300X4,8MM SCHWARZ",28200.0,25000.0,0.0,0.0,0.0,0.0,1400.0,0.0,1500.0,300.0
2,0,1928405459,0.0,BLINDKONTAKT,24000.0,24000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,52,50266493,0.0,"KABELBAND 390X4,8MM SCHWARZ",27722.0,22900.0,0.0,0.0,0.0,210.0,702.0,1000.0,1600.0,1310.0
4,40,9999900000KH,0.0,FREMDARTIKELSAMMELNUMMER,19853.0,16089.0,84.0,568.0,23.0,215.0,24.0,2226.0,397.0,227.0


# Data Cleaning

## df_lagerbestand

In [7]:
# Normalise the column labels in one pass:
# lower-case them, turn blanks into underscores and drop dots
df_lagerbestand.columns = [
    label.lower().replace(" ", "_").replace(".", "")
    for label in df_lagerbestand.columns
]
df_lagerbestand.head()

Unnamed: 0,lfnr,artnr,index,beschr,bkz,vpe,stgr,ltz_vk_ges,basispreis,basispr_summe,gesamt,wen,ltz_vk_wen,rgb,ltz_vk_rgb,amb,ltz_vk_amb,cha,ltz_vk_cha,str,ltz_vk_str,pas,ltz_vk_pas,lan,ltz_vk_lan,müh,ltz_vk_müh,ros,ltz_vk_ros
0,430,08.607.83,0,"N CORSA D 1,0L 44KW BJ 2006",8,1,L20,2018-11-07,75.36,0.0,0.0,0.0,NaT,0.0,2018-11-07,0.0,2017-08-08,0.0,2016-04-29,0.0,2017-12-14,0.0,NaT,0.0,2017-08-31,0.0,NaT,0.0,2014-03-26
1,430,08.607.81,0,"M CORSA D 1,4L 66KW BJ 2006",8,1,L20,2021-08-04,93.63,0.0,0.0,0.0,2021-08-04,0.0,2017-07-10,0.0,2021-04-26,0.0,2021-04-28,0.0,2018-09-13,0.0,2019-07-31,0.0,2016-06-24,0.0,NaT,0.0,2015-05-04
2,430,08.607.85,0,"N CORSA D 1,4L 66KW BJ 2006",8,1,L20,2021-07-14,76.95,0.0,0.0,0.0,2021-07-14,0.0,NaT,0.0,2021-04-26,0.0,NaT,0.0,2020-09-29,0.0,2019-07-31,0.0,2018-11-07,0.0,NaT,0.0,NaT
3,430,08.607.80,0,"M CORSA D 1,0L 44KW BJ 2006",8,1,L20,2017-12-14,80.07,0.0,0.0,0.0,2013-09-24,0.0,NaT,0.0,2015-05-15,0.0,2016-04-29,0.0,2017-12-14,0.0,NaT,0.0,NaT,0.0,NaT,0.0,2015-10-19
4,430,08.607.87,0,"N CORSA D 1,3L CDTI 55KW BJ",8,1,L20,2021-01-08,49.98,0.0,0.0,0.0,2021-01-08,0.0,NaT,0.0,NaT,0.0,NaT,0.0,NaT,0.0,NaT,0.0,NaT,0.0,NaT,0.0,NaT


In [8]:
# Inspect dtypes and non-null counts after the column labels were normalised
df_lagerbestand.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 357829 entries, 0 to 357828
Data columns (total 29 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   lfnr           357829 non-null  int64         
 1   artnr          357829 non-null  object        
 2   index          357829 non-null  object        
 3   beschr         357829 non-null  object        
 4   bkz            357829 non-null  object        
 5   vpe            357829 non-null  int64         
 6   stgr           357829 non-null  object        
 7   ltz_vk_ges     338128 non-null  datetime64[ns]
 8   basispreis     357829 non-null  float64       
 9   basispr_summe  357829 non-null  float64       
 10  gesamt         325061 non-null  float64       
 11  wen            325061 non-null  float64       
 12  ltz_vk_wen     204992 non-null  datetime64[ns]
 13  rgb            98303 non-null   float64       
 14  ltz_vk_rgb     124822 non-null  datetime64[ns]
 15  

In [9]:
# NOTE(review): this cell contains only commented-out code and does nothing when run.
# The .info() output above already reports these columns as int64/float64, so the
# string-to-number conversion below is not needed for the data as loaded (and the
# `.str` accessor would fail on numeric columns). Consider deleting the cell.
# # Change selected number columns from string to float type
# float_columns = ["lfnr", "vpe", "basispreis", "basispr_summe", "gesamt"]

# for column in float_columns:
#     df_lagerbestand[column] = pd.to_numeric(df_lagerbestand[column].str.replace('.', '').str.replace(',', '.'), errors='coerce')

In [10]:
# Give the inventory columns descriptive names.
# Abbreviated attribute columns get their long form ...
attribute_names = {
    'beschr': 'beschreibung',
    'bkz': 'bestellkennzeichen',
    'vpe': 'verp_einheit',
    'stgr': 'stat_gruppe',
}
# ... and the total / per-site quantity columns get a `_lager` suffix
stock_names = {
    'gesamt': 'gesamt_lager',
    'wen': 'wen_lager',
    'rgb': 'rgb_lager',
    'str': 'str_lager',
    'pas': 'pas_lager',
    'amb': 'amb_lager',
    'cha': 'cha_lager',
    'lan': 'lan_lager',
    'müh': 'müh_lager',
    'ros': 'ros_lager',
}
new_columns = {**attribute_names, **stock_names}

df_lagerbestand = df_lagerbestand.rename(columns=new_columns)

In [65]:
# `index` is reported as object dtype by .info() above; cast it to integer
# (the later merge with df_verkaeufe uses `index` as one of its join keys)
df_lagerbestand['index'] = df_lagerbestand['index'].astype(int)

In [66]:
# Preview the renamed frame, then print its schema.
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" below the report.
display(df_lagerbestand.head())
df_lagerbestand.info()

Unnamed: 0,lfnr,artnr,index,beschreibung,bestellkennzeichen,verp_einheit,stat_gruppe,ltz_vk_ges,basispreis,basispr_summe,gesamt_lager,wen_lager,ltz_vk_wen,rgb_lager,ltz_vk_rgb,amb_lager,ltz_vk_amb,cha_lager,ltz_vk_cha,str_lager,ltz_vk_str,pas_lager,ltz_vk_pas,lan_lager,ltz_vk_lan,müh_lager,ltz_vk_müh,ros_lager,ltz_vk_ros
0,430,08.607.83,0,"N CORSA D 1,0L 44KW BJ 2006",8,1,L20,2018-11-07,75.36,0.0,0.0,0.0,NaT,0.0,2018-11-07,0.0,2017-08-08,0.0,2016-04-29,0.0,2017-12-14,0.0,NaT,0.0,2017-08-31,0.0,NaT,0.0,2014-03-26
1,430,08.607.81,0,"M CORSA D 1,4L 66KW BJ 2006",8,1,L20,2021-08-04,93.63,0.0,0.0,0.0,2021-08-04,0.0,2017-07-10,0.0,2021-04-26,0.0,2021-04-28,0.0,2018-09-13,0.0,2019-07-31,0.0,2016-06-24,0.0,NaT,0.0,2015-05-04
2,430,08.607.85,0,"N CORSA D 1,4L 66KW BJ 2006",8,1,L20,2021-07-14,76.95,0.0,0.0,0.0,2021-07-14,0.0,NaT,0.0,2021-04-26,0.0,NaT,0.0,2020-09-29,0.0,2019-07-31,0.0,2018-11-07,0.0,NaT,0.0,NaT
3,430,08.607.80,0,"M CORSA D 1,0L 44KW BJ 2006",8,1,L20,2017-12-14,80.07,0.0,0.0,0.0,2013-09-24,0.0,NaT,0.0,2015-05-15,0.0,2016-04-29,0.0,2017-12-14,0.0,NaT,0.0,NaT,0.0,NaT,0.0,2015-10-19
4,430,08.607.87,0,"N CORSA D 1,3L CDTI 55KW BJ",8,1,L20,2021-01-08,49.98,0.0,0.0,0.0,2021-01-08,0.0,NaT,0.0,NaT,0.0,NaT,0.0,NaT,0.0,NaT,0.0,NaT,0.0,NaT,0.0,NaT


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 357829 entries, 0 to 357828
Data columns (total 29 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   lfnr                357829 non-null  int64         
 1   artnr               357829 non-null  object        
 2   index               357829 non-null  int64         
 3   beschreibung        357829 non-null  object        
 4   bestellkennzeichen  357829 non-null  object        
 5   verp_einheit        357829 non-null  int64         
 6   stat_gruppe         357829 non-null  object        
 7   ltz_vk_ges          338128 non-null  datetime64[ns]
 8   basispreis          357829 non-null  float64       
 9   basispr_summe       357829 non-null  float64       
 10  gesamt_lager        325061 non-null  float64       
 11  wen_lager           325061 non-null  float64       
 12  ltz_vk_wen          204992 non-null  datetime64[ns]
 13  rgb_lager           98303 non

None

In [12]:
# Summary statistics of the numeric inventory columns
df_lagerbestand.describe()

Unnamed: 0,lfnr,verp_einheit,basispreis,basispr_summe,gesamt_lager,wen_lager,rgb_lager,amb_lager,cha_lager,str_lager,pas_lager,lan_lager,müh_lager,ros_lager
count,357829.0,357829.0,357829.0,357829.0,325061.0,325061.0,98303.0,98303.0,32767.0,95685.0,98303.0,62917.0,32767.0,98303.0
mean,130.69,3.2,61.87,27.87,3.26,2.3,0.1,0.06,0.03,0.21,0.09,0.13,0.04,0.07
std,126.75,27.26,141.54,336.41,67.21,55.61,8.24,6.41,0.28,6.47,9.68,5.54,0.43,12.37
min,0.0,0.0,0.0,-68931.0,-8399.0,-3453.0,-1305.0,-1328.0,-4.0,-1010.0,-2487.0,-15.0,-1.0,-2798.0
25%,25.0,0.0,8.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,110.0,1.0,26.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,182.0,1.0,64.09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,990.0,5000.0,14214.0,52098.03,15835.0,14741.0,1900.0,632.0,11.0,1011.0,1354.0,1030.0,35.0,1532.0


## df_lieferanten

In [13]:
# Normalise the column labels in one pass:
# lower-case them, turn blanks into underscores and drop dots
df_lieferanten.columns = [
    label.lower().replace(" ", "_").replace(".", "")
    for label in df_lieferanten.columns
]
df_lieferanten.head()

Unnamed: 0,lfnr,beschreibung
0,0,BOSCH
1,1,HELLA
2,2,BOSCH-TELECOM
3,3,BASF KÜHLERFROSTSCHUTZ
4,4,ALIK


In [20]:
# Rename the supplier description to `lieferant`; presumably so it does not clash with
# the article `beschreibung` column of the inventory table when both are merged on `lfnr`
df_lieferanten = df_lieferanten.rename(columns={'beschreibung':'lieferant'})

In [22]:
# Print the schema (info() writes directly to stdout), then preview the renamed frame
df_lieferanten.info()
df_lieferanten.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378 entries, 0 to 377
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   lfnr       378 non-null    int64 
 1   lieferant  376 non-null    object
dtypes: int64(1), object(1)
memory usage: 6.0+ KB


Unnamed: 0,lfnr,lieferant
0,0,BOSCH
1,1,HELLA
2,2,BOSCH-TELECOM
3,3,BASF KÜHLERFROSTSCHUTZ
4,4,ALIK


## df_verkaeufe

In [15]:
# Normalise the column labels in one pass:
# lower-case them, turn blanks into underscores and drop dots
df_verkaeufe.columns = [
    label.lower().replace(" ", "_").replace(".", "")
    for label in df_verkaeufe.columns
]
df_verkaeufe.head()

Unnamed: 0,lfr,artnr,ind,beschreibung,gesamt,wen,rgb,str,pas,amb,cha,lan,müh,ros
0,0,1928498680,0.0,BUCHSENKONTAKT,38400.0,38400.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,52,50266496,0.0,"KABELBAND 300X4,8MM SCHWARZ",28200.0,25000.0,0.0,0.0,0.0,0.0,1400.0,0.0,1500.0,300.0
2,0,1928405459,0.0,BLINDKONTAKT,24000.0,24000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,52,50266493,0.0,"KABELBAND 390X4,8MM SCHWARZ",27722.0,22900.0,0.0,0.0,0.0,210.0,702.0,1000.0,1600.0,1310.0
4,40,9999900000KH,0.0,FREMDARTIKELSAMMELNUMMER,19853.0,16089.0,84.0,568.0,23.0,215.0,24.0,2226.0,397.0,227.0


In [16]:
# Align the sales column names with the inventory table.
# Key columns take the same names as in df_lagerbestand ...
key_names = {
    'lfr': 'lfnr',
    'ind': 'index',
}
# ... and the total / per-site quantity columns get a `_vk` suffix
sales_names = {
    'gesamt': 'gesamt_vk',
    'wen': 'wen_vk',
    'rgb': 'rgb_vk',
    'str': 'str_vk',
    'pas': 'pas_vk',
    'amb': 'amb_vk',
    'cha': 'cha_vk',
    'lan': 'lan_vk',
    'müh': 'müh_vk',
    'ros': 'ros_vk',
}
new_columns = {**key_names, **sales_names}

df_verkaeufe = df_verkaeufe.rename(columns=new_columns)

In [67]:
# `index` shows up as a float (0.0) in the preview above; cast it to integer.
# Note: astype(int) raises if the column contains missing values.
df_verkaeufe['index'] = df_verkaeufe['index'].astype(int)

In [68]:
# Preview the renamed frame, then print its schema.
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" below the report.
display(df_verkaeufe.head())
df_verkaeufe.info()

Unnamed: 0,lfnr,artnr,index,beschreibung,gesamt_vk,wen_vk,rgb_vk,str_vk,pas_vk,amb_vk,cha_vk,lan_vk,müh_vk,ros_vk
0,0,1928498680,0,BUCHSENKONTAKT,38400.0,38400.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,52,50266496,0,"KABELBAND 300X4,8MM SCHWARZ",28200.0,25000.0,0.0,0.0,0.0,0.0,1400.0,0.0,1500.0,300.0
2,0,1928405459,0,BLINDKONTAKT,24000.0,24000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,52,50266493,0,"KABELBAND 390X4,8MM SCHWARZ",27722.0,22900.0,0.0,0.0,0.0,210.0,702.0,1000.0,1600.0,1310.0
4,40,9999900000KH,0,FREMDARTIKELSAMMELNUMMER,19853.0,16089.0,84.0,568.0,23.0,215.0,24.0,2226.0,397.0,227.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66009 entries, 0 to 66008
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   lfnr          66009 non-null  int64  
 1   artnr         66009 non-null  object 
 2   index         66009 non-null  int64  
 3   beschreibung  66009 non-null  object 
 4   gesamt_vk     65535 non-null  float64
 5   wen_vk        65535 non-null  float64
 6   rgb_vk        65535 non-null  float64
 7   str_vk        65535 non-null  float64
 8   pas_vk        65535 non-null  float64
 9   amb_vk        65535 non-null  float64
 10  cha_vk        65535 non-null  float64
 11  lan_vk        65535 non-null  float64
 12  müh_vk        65535 non-null  float64
 13  ros_vk        65535 non-null  float64
dtypes: float64(10), int64(2), object(2)
memory usage: 7.1+ MB


None

# Merging tables into one master table (df_master)

In [74]:
# Attach the supplier name to each inventory row via the supplier number `lfnr`
# (left join: every inventory row is kept, unmatched suppliers yield NaN)
df_master = pd.merge(df_lagerbestand, df_lieferanten, how='left', on='lfnr')
df_master.head()

Unnamed: 0,lfnr,artnr,index,beschreibung,bestellkennzeichen,verp_einheit,stat_gruppe,ltz_vk_ges,basispreis,basispr_summe,gesamt_lager,wen_lager,ltz_vk_wen,rgb_lager,ltz_vk_rgb,amb_lager,ltz_vk_amb,cha_lager,ltz_vk_cha,str_lager,ltz_vk_str,pas_lager,ltz_vk_pas,lan_lager,ltz_vk_lan,müh_lager,ltz_vk_müh,ros_lager,ltz_vk_ros,lieferant
0,430,08.607.83,0,"N CORSA D 1,0L 44KW BJ 2006",8,1,L20,2018-11-07,75.36,0.0,0.0,0.0,NaT,0.0,2018-11-07,0.0,2017-08-08,0.0,2016-04-29,0.0,2017-12-14,0.0,NaT,0.0,2017-08-31,0.0,NaT,0.0,2014-03-26,Eberspächer Abgas
1,430,08.607.81,0,"M CORSA D 1,4L 66KW BJ 2006",8,1,L20,2021-08-04,93.63,0.0,0.0,0.0,2021-08-04,0.0,2017-07-10,0.0,2021-04-26,0.0,2021-04-28,0.0,2018-09-13,0.0,2019-07-31,0.0,2016-06-24,0.0,NaT,0.0,2015-05-04,Eberspächer Abgas
2,430,08.607.85,0,"N CORSA D 1,4L 66KW BJ 2006",8,1,L20,2021-07-14,76.95,0.0,0.0,0.0,2021-07-14,0.0,NaT,0.0,2021-04-26,0.0,NaT,0.0,2020-09-29,0.0,2019-07-31,0.0,2018-11-07,0.0,NaT,0.0,NaT,Eberspächer Abgas
3,430,08.607.80,0,"M CORSA D 1,0L 44KW BJ 2006",8,1,L20,2017-12-14,80.07,0.0,0.0,0.0,2013-09-24,0.0,NaT,0.0,2015-05-15,0.0,2016-04-29,0.0,2017-12-14,0.0,NaT,0.0,NaT,0.0,NaT,0.0,2015-10-19,Eberspächer Abgas
4,430,08.607.87,0,"N CORSA D 1,3L CDTI 55KW BJ",8,1,L20,2021-01-08,49.98,0.0,0.0,0.0,2021-01-08,0.0,NaT,0.0,NaT,0.0,NaT,0.0,NaT,0.0,NaT,0.0,NaT,0.0,NaT,0.0,NaT,Eberspächer Abgas


In [75]:
# Check row count, dtypes and non-null counts of the merged table
df_master.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 357829 entries, 0 to 357828
Data columns (total 30 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   lfnr                357829 non-null  int64         
 1   artnr               357829 non-null  object        
 2   index               357829 non-null  int64         
 3   beschreibung        357829 non-null  object        
 4   bestellkennzeichen  357829 non-null  object        
 5   verp_einheit        357829 non-null  int64         
 6   stat_gruppe         357829 non-null  object        
 7   ltz_vk_ges          338128 non-null  datetime64[ns]
 8   basispreis          357829 non-null  float64       
 9   basispr_summe       357829 non-null  float64       
 10  gesamt_lager        325061 non-null  float64       
 11  wen_lager           325061 non-null  float64       
 12  ltz_vk_wen          204992 non-null  datetime64[ns]
 13  rgb_lager           98303 non

In [63]:
# Collect every row whose article number occurs more than once
# (keep=False flags all occurrences, not only the repeats)
has_repeated_artnr = df_master['artnr'].duplicated(keep=False)
duplicateRows = df_master.loc[has_repeated_artnr]

In [42]:
# Sort by article number so rows sharing the same `artnr` appear next to each other
duplicateRows.sort_values(by='artnr')

Unnamed: 0,lfnr,artnr,index,beschreibung,bestellkennzeichen,verp_einheit,stat_gruppe,ltz_vk_ges,basispreis,basispr_summe,gesamt_lager,wen_lager,ltz_vk_wen,rgb_lager,ltz_vk_rgb,amb_lager,ltz_vk_amb,cha_lager,ltz_vk_cha,str_lager,ltz_vk_str,pas_lager,ltz_vk_pas,lan_lager,ltz_vk_lan,müh_lager,ltz_vk_müh,ros_lager,ltz_vk_ros,lieferant
43971,120,000-094,0,STÜTZLASTSCHILDER (130KG),-,1,LCE,2000-04-01,0.15,0.00,0.00,0.00,2000-04-01,0.00,NaT,,2000-04-01,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT,BOSAL
43972,420,000-094,0,STÜTZLASTSCHILDER (130KG),8,1,LCE,NaT,0.30,5.10,17.00,17.00,NaT,0.00,NaT,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT,ACPS AFTERMARKET
43974,120,000-124,0,STÜTZLASTSCHILDER (50KG),-,1,LCD,2008-09-10,0.30,0.00,0.00,0.00,2007-09-19,0.00,2003-04-01,,2006-02-27,,NaT,,NaT,,NaT,,NaT,,2008-09-10,,NaT,BOSAL
43975,420,000-124,0,STÜTZLASTSCHILDER (50KG),AN,1,LCE,2021-08-20,0.33,0.99,3.00,3.00,NaT,0.00,NaT,,2021-08-20,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT,ACPS AFTERMARKET
43969,120,000-264,0,STÜTZLASTSCHILDER (75KG),-,1,LCE,2014-12-17,0.30,0.00,0.00,0.00,2014-12-17,0.00,2010-03-05,,2007-06-12,,NaT,,2007-12-05,,NaT,,2005-04-06,,NaT,,NaT,BOSAL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92747,240,WERBEPAKET2,0,WERBEPAK.AUTOGAS FLYER/POSTK/,8,1,ZGA,2008-08-19,119.00,0.00,0.00,0.00,NaT,,NaT,0.00,2008-08-19,,NaT,,NaT,0.00,NaT,,NaT,,NaT,0.00,NaT,VMA
54478,90,XDB100,90,Zündverteiler,8,1,LQSON,2012-03-21,88.88,0.00,0.00,0.00,NaT,0.00,2012-01-17,,NaT,,NaT,,2008-05-27,,NaT,,2012-03-21,,NaT,,NaT,QUINTON HAZELL
54479,90,XDB100,0,ZÜNDVERTEILER,8,0,LQSON,NaT,89.55,0.00,0.00,0.00,NaT,0.00,NaT,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT,,NaT,QUINTON HAZELL
54482,90,XDB102,90,Zündverteiler,8,1,LQSON,2011-09-21,44.55,0.00,0.00,0.00,NaT,0.00,2007-12-28,,NaT,,NaT,,NaT,,2011-09-21,,NaT,,NaT,,NaT,QUINTON HAZELL


In [76]:
# Add the sales figures to the master table (left join keeps every inventory row).
# A row only receives sales values when all four key columns match exactly.
merge_keys = ['lfnr', 'artnr', 'index', 'beschreibung']
df_master = pd.merge(df_master, df_verkaeufe, how='left', on=merge_keys)
df_master

Unnamed: 0,lfnr,artnr,index,beschreibung,bestellkennzeichen,verp_einheit,stat_gruppe,ltz_vk_ges,basispreis,basispr_summe,gesamt_lager,wen_lager,ltz_vk_wen,rgb_lager,ltz_vk_rgb,...,müh_lager,ltz_vk_müh,ros_lager,ltz_vk_ros,lieferant,gesamt_vk,wen_vk,rgb_vk,str_vk,pas_vk,amb_vk,cha_vk,lan_vk,müh_vk,ros_vk
0,430,08.607.83,0,"N CORSA D 1,0L 44KW BJ 2006",8,1,L20,2018-11-07,75.36,0.00,0.00,0.00,NaT,0.00,2018-11-07,...,0.00,NaT,0.00,2014-03-26,Eberspächer Abgas,,,,,,,,,,
1,430,08.607.81,0,"M CORSA D 1,4L 66KW BJ 2006",8,1,L20,2021-08-04,93.63,0.00,0.00,0.00,2021-08-04,0.00,2017-07-10,...,0.00,NaT,0.00,2015-05-04,Eberspächer Abgas,,,,,,,,,,
2,430,08.607.85,0,"N CORSA D 1,4L 66KW BJ 2006",8,1,L20,2021-07-14,76.95,0.00,0.00,0.00,2021-07-14,0.00,NaT,...,0.00,NaT,0.00,NaT,Eberspächer Abgas,,,,,,,,,,
3,430,08.607.80,0,"M CORSA D 1,0L 44KW BJ 2006",8,1,L20,2017-12-14,80.07,0.00,0.00,0.00,2013-09-24,0.00,NaT,...,0.00,NaT,0.00,2015-10-19,Eberspächer Abgas,,,,,,,,,,
4,430,08.607.87,0,"N CORSA D 1,3L CDTI 55KW BJ",8,1,L20,2021-01-08,49.98,0.00,0.00,0.00,2021-01-08,0.00,NaT,...,0.00,NaT,0.00,NaT,Eberspächer Abgas,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357824,0,0986479E79,0,BREMSSCHEIBE,VO,0,00RBABC,2023-05-12,78.93,1341.81,17.00,3.00,2023-03-15,,2022-01-05,...,,NaT,,NaT,BOSCH,8.00,4.00,0.00,0.00,4.00,0.00,0.00,0.00,0.00,0.00
357825,17,816E85,0,ZÜNDLEITUNG,0,5,L64,2010-06-09,5.82,0.00,0.00,0.00,2010-06-09,,2003-04-01,...,,NaT,,NaT,BREMICKER,,,,,,,,,,
357826,0,0986479E87,0,BREMSSCHEIBE,0,0,00RBABA,2022-05-04,37.76,0.00,0.00,0.00,NaT,,NaT,...,,NaT,,2022-05-04,BOSCH,,,,,,,,,,
357827,0,0986479E91,0,BREMSSCHEIBE,0,0,00RBABA,2023-04-19,26.46,105.84,4.00,2.00,NaT,,NaT,...,,NaT,,NaT,BOSCH,,,,,,,,,,


In [77]:
# Summary statistics of the numeric columns of the merged master table
df_master.describe()

Unnamed: 0,lfnr,index,verp_einheit,basispreis,basispr_summe,gesamt_lager,wen_lager,rgb_lager,amb_lager,cha_lager,str_lager,pas_lager,lan_lager,müh_lager,ros_lager,gesamt_vk,wen_vk,rgb_vk,str_vk,pas_vk,amb_vk,cha_vk,lan_vk,müh_vk,ros_vk
count,357829.0,357829.0,357829.0,357829.0,357829.0,325061.0,325061.0,98303.0,98303.0,32767.0,95685.0,98303.0,62917.0,32767.0,98303.0,65532.0,65532.0,65532.0,65532.0,65532.0,65532.0,65532.0,65532.0,65532.0,65532.0
mean,130.69,4.6,3.2,61.87,27.87,3.26,2.3,0.1,0.06,0.03,0.21,0.09,0.13,0.04,0.07,26.51,15.76,1.4,2.14,1.34,1.44,0.61,1.4,1.27,1.14
std,126.75,19.84,27.26,141.54,336.41,67.21,55.61,8.24,6.41,0.28,6.47,9.68,5.54,0.43,12.37,346.97,300.32,15.27,36.73,18.84,15.39,9.63,29.63,22.92,13.82
min,0.0,0.0,0.0,0.0,-68931.0,-8399.0,-3453.0,-1305.0,-1328.0,-4.0,-1010.0,-2487.0,-15.0,-1.0,-2798.0,-187.0,0.0,-99.0,-44.0,-10.0,-197.0,-6.0,-20.0,-200.0,-16.0
25%,25.0,0.0,0.0,8.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,110.0,0.0,1.0,26.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,182.0,0.0,1.0,64.09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,990.0,400.0,5000.0,14214.0,52098.03,15835.0,14741.0,1900.0,632.0,11.0,1011.0,1354.0,1030.0,35.0,1532.0,38400.0,38400.0,1289.0,6000.0,2300.0,1371.0,1400.0,4100.0,2400.0,1350.0


In [78]:
# Summary statistics of the numeric columns of the sales table
df_verkaeufe.describe()

Unnamed: 0,lfnr,index,gesamt_vk,wen_vk,rgb_vk,str_vk,pas_vk,amb_vk,cha_vk,lan_vk,müh_vk,ros_vk
count,66009.0,66009.0,65535.0,65535.0,65535.0,65535.0,65535.0,65535.0,65535.0,65535.0,65535.0,65535.0
mean,167.49,2.23,26.5,15.76,1.4,2.14,1.34,1.44,0.61,1.4,1.27,1.14
std,155.96,14.0,346.97,300.31,15.26,36.73,18.84,15.39,9.63,29.63,22.92,13.82
min,0.0,0.0,-187.0,0.0,-99.0,-44.0,-10.0,-197.0,-6.0,-20.0,-200.0,-16.0
25%,37.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,134.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,205.0,0.0,6.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,800.0,90.0,38400.0,38400.0,1289.0,6000.0,2300.0,1371.0,1400.0,4100.0,2400.0,1350.0


In [80]:
# Reorder the master table so that 'lieferant' sits directly after 'lfnr'.
# The columns are *selected* in the target order. Assigning this list to
# `df_master.columns` would only relabel the existing columns by position: after the
# merges 'lieferant' is the 30th column, so every label from 'lieferant' through
# 'ltz_vk_ros' would end up on the wrong data (e.g. 'lieferant' on the article numbers).
# Selecting by label also raises a KeyError if an expected column is missing.
column_order = ['lfnr','lieferant', 'artnr', 'index', 'beschreibung', 'bestellkennzeichen',
       'verp_einheit', 'stat_gruppe', 'ltz_vk_ges', 'basispreis',
       'basispr_summe', 'gesamt_lager', 'wen_lager', 'ltz_vk_wen', 'rgb_lager',
       'ltz_vk_rgb', 'amb_lager', 'ltz_vk_amb', 'cha_lager', 'ltz_vk_cha',
       'str_lager', 'ltz_vk_str', 'pas_lager', 'ltz_vk_pas', 'lan_lager',
       'ltz_vk_lan', 'müh_lager', 'ltz_vk_müh', 'ros_lager', 'ltz_vk_ros',
        'gesamt_vk', 'wen_vk', 'rgb_vk', 'str_vk', 'pas_vk',
       'amb_vk', 'cha_vk', 'lan_vk', 'müh_vk', 'ros_vk']
df_master = df_master[column_order]

In [85]:
# Show the current column labels of the master table, in order
df_master.columns

Index(['lfnr', 'lieferant', 'artnr', 'index', 'beschreibung',
       'bestellkennzeichen', 'verp_einheit', 'stat_gruppe', 'ltz_vk_ges',
       'basispreis', 'basispr_summe', 'gesamt_lager', 'wen_lager',
       'ltz_vk_wen', 'rgb_lager', 'ltz_vk_rgb', 'amb_lager', 'ltz_vk_amb',
       'cha_lager', 'ltz_vk_cha', 'str_lager', 'ltz_vk_str', 'pas_lager',
       'ltz_vk_pas', 'lan_lager', 'ltz_vk_lan', 'müh_lager', 'ltz_vk_müh',
       'ros_lager', 'ltz_vk_ros', 'gesamt_vk', 'wen_vk', 'rgb_vk', 'str_vk',
       'pas_vk', 'amb_vk', 'cha_vk', 'lan_vk', 'müh_vk', 'ros_vk'],
      dtype='object')