In [101]:
import pandas as pd

In [102]:
# function to read txt data files and convert them to proper csv files
# txtFile: input filename (including directory if applicable)
# csvFile: output filename (including directory if applicable)
# vtabchar: vertical tab character in the original file (to be replaced with newline command '\n')
# delim: delimiter character used in the original file (to be replaced with comma)
def txt2csv(txtFile, csvFile, vtabchar, delim):
    with open(txtFile, 'r') as file:
        data = file.read().replace(vtabchar, '\n').replace(delim, ',')
    with open(csvFile, 'w') as file:
        file.write(data)    
    return

Set the filename and location for each dataset

In [103]:
# original filename and directory for txt data files
K1DI2_txt = './Data/Komponente/Komponente_K1DI2.txt'
K2LE1_txt = './Data/Komponente/Komponente_K2LE1.txt' 
K2LE2_txt = './Data/Komponente/Komponente_K2LE2.txt'
K2ST1_txt = './Data/Komponente/Komponente_K2ST1.txt'
K3AG2_txt = './Data/Komponente/Komponente_K3AG2.txt'
K7_txt    = './Data/Komponente/Komponente_K7.txt'

# converted txt filename and directory
K1DI2_csv = './Data/Komponente/Komponente_K1DI2.csv'
K2LE1_csv = './Data/Komponente/Komponente_K2LE1.csv'
K2LE2_csv = './Data/Komponente/Komponente_K2LE2.csv'
K2ST1_csv = './Data/Komponente/Komponente_K2ST1.csv'
K3AG2_csv = './Data/Komponente/Komponente_K3AG2.csv'
K7_csv    = './Data/Komponente/Komponente_K7.csv'

# original filename and directory for csv data files
# component data files
K1BE1_csv = './Data/Komponente/Komponente_K1BE1.csv'
K1BE2_csv = './Data/Komponente/Komponente_K1BE2.csv'
K1DI1_csv = './Data/Komponente/Komponente_K1DI1.csv'
K2ST2_csv = './Data/Komponente/Komponente_K2ST2.csv'
K3AG1_csv = './Data/Komponente/Komponente_K3AG1.csv'
K3SG1_csv = './Data/Komponente/Komponente_K3SG1.csv'
K3SG2_csv = './Data/Komponente/Komponente_K3SG2.csv'
K4_csv    = './Data/Komponente/Komponente_K4.csv'
K5_csv    = './Data/Komponente/Komponente_K5.csv'
K6_csv    = './Data/Komponente/Komponente_K6.csv'

In [104]:
# read and convert all the txt data files to csv
txt2csv(K1DI2_txt, K1DI2_csv, '	', '\\')
txt2csv(K2LE1_txt, K2LE1_csv, '', 'II')
txt2csv(K2LE2_txt, K2LE2_csv, '', '\\')
txt2csv(K2ST1_txt, K2ST1_csv, '', '|')
txt2csv(K3AG2_txt, K3AG2_csv, '', '\\')
txt2csv(K7_txt   , K7_csv   , '', '	')

In [105]:
# separate the data arrangements into 4 types, namely A, B, C, and D
A = ['Fehlerhaft_Datum', 'origin']
B = ['Produktionsdatum.x', 'Fehlerhaft_Datum.x', 
     'Produktionsdatum.y', 'Fehlerhaft_Datum.y']
C = ['Produktionsdatum.x', 'Fehlerhaft_Datum.x', 
     'Produktionsdatum.y', 'Fehlerhaft_Datum.y', 
     'Produktionsdatum', 'Fehlerhaft_Datum']
D = ['Produktionsdatum', 'Fehlerhaft_Datum']

# read the converted csv files
K1DI2 = pd.read_csv(K1DI2_csv, parse_dates=A, low_memory=False)
K2LE1 = pd.read_csv(K2LE1_csv, parse_dates=B, low_memory=False)
K2LE2 = pd.read_csv(K2LE2_csv, parse_dates=A, low_memory=False)
K2ST1 = pd.read_csv(K2ST1_csv, parse_dates=D, low_memory=False)
K3AG2 = pd.read_csv(K3AG2_csv, parse_dates=A, low_memory=False)
K7    = pd.read_csv(K7_csv, parse_dates=A, low_memory=False)

# read the rest of the csv files
K1BE1 = pd.read_csv(K1BE1_csv, parse_dates=A, low_memory=False)
K1BE2 = pd.read_csv(K1BE2_csv, parse_dates=A, low_memory=False, sep=';')
K1DI1 = pd.read_csv(K1DI1_csv, parse_dates=C, low_memory=False)
K2ST2 = pd.read_csv(K2ST2_csv, parse_dates=A, low_memory=False, sep=';')
K3AG1 = pd.read_csv(K3AG1_csv, parse_dates=C, low_memory=False)
K3SG1 = pd.read_csv(K3SG1_csv, parse_dates=B, low_memory=False)
K3SG2 = pd.read_csv(K3SG2_csv, parse_dates=A, low_memory=False)
K4    = pd.read_csv(K4_csv, parse_dates=B, low_memory=False, sep=';')
K5    = pd.read_csv(K5_csv, parse_dates=B, low_memory=False)
K6    = pd.read_csv(K6_csv, parse_dates=A, low_memory=False, sep=';')

For datasets with data arrangements of type B and C, we need to consolidate the columns and eliminate the .x and .y suffixes. For type B, the tables are separated into 2, whereas for type C, the tables are separated into 3.

In [109]:
# column names to be renamed for type B
col_names_x = {'Produktionsdatum.x':'Produktionsdatum', 
               'Herstellernummer.x':'Herstellernummer',	
               'Werksnummer.x':'Werksnummer',
               'Fehlerhaft.x':'Fehlerhaft', 
               'Fehlerhaft_Datum.x':'Fehlerhaft_Datum',
               'Fehlerhaft_Fahrleistung.x':'Fehlerhaft_Fahrleistung'}
col_names_y = {'Produktionsdatum.y':'Produktionsdatum', 
               'Herstellernummer.y':'Herstellernummer',	
               'Werksnummer.y':'Werksnummer',
               'Fehlerhaft.y':'Fehlerhaft', 
               'Fehlerhaft_Datum.y':'Fehlerhaft_Datum',
               'Fehlerhaft_Fahrleistung.y':'Fehlerhaft_Fahrleistung'}

# separate type B tables based on the suffixes, rename the columns, and 
# concatenate vertically, and finally extract the columns that contain the data
K2LE1_x = K2LE1[K2LE1['ID_Sitze.x'].notna()].rename(columns=col_names_x)
K2LE1_y = K2LE1[K2LE1['ID_Sitze.y'].notna()].rename(columns=col_names_y)
K2LE1_x = K2LE1_x.rename(columns={'ID_Sitze.x':'ID_Sitze'})
K2LE1_y = K2LE1_y.rename(columns={'ID_Sitze.y':'ID_Sitze'})
K2LE1   = pd.concat([K2LE1_x, K2LE1_y], axis=0)
K2LE1   = K2LE1.loc[:,'X1':'Fehlerhaft_Fahrleistung']

K3SG1_x = K3SG1[K3SG1['ID_Schaltung.x'].notna()].rename(columns=col_names_x)
K3SG1_y = K3SG1[K3SG1['ID_Schaltung.y'].notna()].rename(columns=col_names_y)
K3SG1_x = K3SG1_x.rename(columns={'ID_Schaltung.x':'ID_Schaltung'})
K3SG1_y = K3SG1_y.rename(columns={'ID_Schaltung.y':'ID_Schaltung'})
K3SG1   = pd.concat([K3SG1_x, K3SG1_y], axis=0)
K3SG1   = K3SG1.loc[:, 'X1':'Fehlerhaft_Fahrleistung']

# separate type B tables based on the suffixes, rename the columns, and 
# concatenate vertically, and finally extract the columns that contain the data
K1DI1_x = K1DI1[K1DI1['ID_Motor.x'].notna()].rename(columns=col_names_x)
K1DI1_y = K1DI1[K1DI1['ID_Motor.y'].notna()].rename(columns=col_names_y)
K1DI1_x = K1DI1_x.rename(columns={'ID_Motor.x':'ID_Motor'})
K1DI1_y = K1DI1_y.rename(columns={'ID_Motor.y':'ID_Motor'})
K1DI1   = pd.concat([K1DI1_x, K1DI1_y], axis=0)
K1DI1   = K1DI1.loc[:, 'X1':'Fehlerhaft_Fahrleistung']

KeyError: 'ID_Sitze.x'

In [108]:
#K1DI1[K1DI1['ID_Motor.x'].notna()]


Unnamed: 0,X1,ID_Sitze,Produktionsdatum,Herstellernummer,Werksnummer,Fehlerhaft,Fehlerhaft_Datum,Fehlerhaft_Fahrleistung
1,1,K2LE1-109-1091-2,2008-11-12,109.0,1091.0,1,2010-10-18,37080.0
2,2,K2LE1-109-1091-1,2008-11-12,109.0,1091.0,0,NaT,0.0
3,3,K2LE1-109-1091-12,2008-11-13,109.0,1091.0,0,NaT,0.0
4,4,K2LE1-109-1091-5,2008-11-13,109.0,1091.0,0,NaT,0.0
5,5,K2LE1-109-1091-40,2008-11-13,109.0,1091.0,0,NaT,0.0
...,...,...,...,...,...,...,...,...
477048,477048,K2LE1-111-1111-286228,2016-11-13,111.0,1111.0,0,NaT,0.0
477049,477049,K2LE1-111-1111-286231,2016-11-14,111.0,1111.0,0,NaT,0.0
477050,477050,K2LE1-111-1111-286232,2016-11-14,111.0,1111.0,1,2017-06-15,8641.0
477051,477051,K2LE1-111-1111-286229,2016-11-14,111.0,1111.0,0,NaT,0.0
