## Weitere Funktionen zur Datenverarbeitung

* Autorin: Anna (i3-Versicherung)
* Webseite: [Data Science Training - Kapitel 1](https://data-science.training/kapitel-1/)
* Datum: 23.03.2023

In diesem zweiten Notebook beschäftigen wir uns mit Funktionen der Daten(vor)verarbeitung.<br>
Wir orientieren uns dabei am KNIME-Workflow und "bilden" die verwendeten Knoten in Python nach.
Die Namen der KNIME-Knoten sind angegeben.

(A) Datensätze (= Zeilen) filtern, aufteilen, sortieren

(B) Attribute (= Spalten) filtern, bearbeiten, sortieren

In [4]:
# Pandas Paket (Package) importieren
#  Datenstrukturen und Datenanalyse, I/O
#  https://pandas.pydata.org/pandas-docs/stable/
import pandas as pd

In [5]:
# Load the training data from the CSV file into a pandas data frame (df).
#  (KNIME: "CSV Reader")
# The path is relative to the notebook's working directory.
df = pd.read_csv('../../data/titanic/original/train.csv')

In [6]:
# Print the attribute names (column headers) of the data frame as an array
print(df.columns.to_numpy())

['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']


In [7]:
# Split the records into two data frames based on an attribute value
#  (KNIME: "Row Splitter")
#  here e.g. women vs. men
df1 = df[df['Sex'].eq('female')]
# Complement of df1: every row whose 'Sex' is not 'female'. This equals
# == 'male' only as long as 'Sex' holds no other or missing values.
df2 = df[df['Sex'].ne('female')]
# Show both tables (as a check)
display(df1, df2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C
...,...,...,...,...,...,...,...,...,...,...,...,...
880,881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25.0,0,1,230433,26.0000,,S
882,883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22.0,0,0,7552,10.5167,,S
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
883,884,0,2,"Banfield, Mr. Frederick James",male,28.0,0,0,C.A./SOTON 34068,10.5000,,S
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [8]:
# Filter records (rows) with the query() function
#  (KNIME: "Rule-based Row Filter")
# Keeps a row when Embarked is "C" or when Age lies in the closed range 20..30.
# The result replaces df1, so df1 is narrowed from here on.
df1 = df1.query('Embarked == "C" or (Age >= 20 and Age <= 30)')
# Show the table (as a check)
display(df1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
...,...,...,...,...,...,...,...,...,...,...,...,...
874,875,1,2,"Abelson, Mrs. Samuel (Hannah Wizosky)",female,28.0,1,0,P/PP 3381,24.0000,,C
875,876,1,3,"Najib, Miss. Adele Kiamie ""Jane""",female,15.0,0,0,2667,7.2250,,C
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C
880,881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25.0,0,1,230433,26.0000,,S


In [9]:
# Sort the rows
#  (KNIME: "Sorter")
#  e.g. by age in ascending order
#
# Ascending is the default direction; rows without an age end up last.
df1 = df1.sort_values(by='Age')
#
# Alternatives:
#df1 = df1.sort_values(by=['Age'], ascending=True) # ascending
#df1 = df1.sort_values(by=['Age'], ascending=False) # descending
#df1 = df1.sort_values(by=['Age', 'Fare'], ascending=True) # several sort keys
#
# Show the table (as a check)
display(df1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
644,645,1,3,"Baclini, Miss. Eugenie",female,0.75,2,1,2666,19.2583,,C
469,470,1,3,"Baclini, Miss. Helene Barbara",female,0.75,2,1,2666,19.2583,,C
381,382,1,3,"Nakid, Miss. Maria (""Mary"")",female,1.00,0,2,2653,15.7417,,C
43,44,1,2,"Laroche, Miss. Simonne Marie Anne Andree",female,3.00,1,2,SC/Paris 2123,41.5792,,C
691,692,1,3,"Karun, Miss. Manca",female,4.00,0,1,349256,13.4167,,C
...,...,...,...,...,...,...,...,...,...,...,...,...
367,368,1,3,"Moussa, Mrs. (Mantoura Boulos)",female,,0,0,2626,7.2292,,C
375,376,1,1,"Meyer, Mrs. Edgar Joseph (Leila Saks)",female,,1,0,PC 17604,82.1708,,C
533,534,1,3,"Peter, Mrs. Catherine (Catherine Rizk)",female,,0,2,2668,22.3583,,C
578,579,0,3,"Caram, Mrs. Joseph (Maria Elias)",female,,1,0,2689,14.4583,,C


In [10]:
# Create a new column from an arithmetic expression and add it to the
# data frame df2: 4 alternatives (a to d)
#  (KNIME: "Math Formula")
#
# Row-wise sum of 'SibSp' and 'Parch' plus 1
sibspparch1 = df2['SibSp'] + df2['Parch'] + 1
#
# (a) directly with the []-operator (may trigger a copy warning!)
#df2['SibSpParch1'] = sibspparch1
#
# (b) with insert(): modifies df2 in place and raises a ValueError when this
#     cell is run a second time, because the column then already exists
#df2.insert(loc=len(df2.columns), column='SibSpParch1', value=sibspparch1)
#
# (c) with assign(): returns a new data frame with the column appended at the
#     end (same position as insert at loc=len(df2.columns)); an existing
#     column of that name is overwritten, so the cell can be re-run safely
df2 = df2.assign(SibSpParch1=sibspparch1)
#
# (d) with .loc (may trigger a copy warning!)
#df2.loc[:,'SibSpParch1'] = sibspparch1
#
# Show the table (as a check)
display(df2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,SibSpParch1
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,1
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,1
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,1
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
883,884,0,2,"Banfield, Mr. Frederick James",male,28.0,0,0,C.A./SOTON 34068,10.5000,,S,1
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S,1
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,1
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,1


In [11]:
# Rename a column
#  (KNIME: "Column Renamer")
# The mapping {old name: new name} is given as a dictionary and applied
# to the column axis.
df2 = df2.rename({'SibSpParch1': 'FamilySize'}, axis='columns')
# Show the table (as a check)
display(df2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,1
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,1
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,1
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
883,884,0,2,"Banfield, Mr. Frederick James",male,28.0,0,0,C.A./SOTON 34068,10.5000,,S,1
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S,1
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,1
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,1


In [12]:
# Reorder the columns
#  (KNIME: "Column Resorter")
#  A hand-written list of column names is used for this.
#  The order of the names in the list determines the order of the columns
#  in the data frame.
#
# Print the current order of the column names
print(df2.columns.values)
# Define the new order.
# NOTE: 'Cabin' is not part of this list, so selecting by the list also
# drops that column from df2.
new_columns = [
    'PassengerId', 'Survived', 'Pclass',
    'Name', 'Sex', 'Age',
    'SibSp', 'Parch', 'FamilySize',
    'Ticket', 'Fare', 'Embarked',
]
# Apply the new order by selecting the columns in list order
df2 = df2[new_columns]
# Show the table (as a check)
display(df2)

['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked' 'FamilySize']


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,FamilySize,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,2,A/5 21171,7.2500,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,1,373450,8.0500,S
5,6,0,3,"Moran, Mr. James",male,,0,0,1,330877,8.4583,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,1,17463,51.8625,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,5,349909,21.0750,S
...,...,...,...,...,...,...,...,...,...,...,...,...
883,884,0,2,"Banfield, Mr. Frederick James",male,28.0,0,0,1,C.A./SOTON 34068,10.5000,S
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,1,SOTON/OQ 392076,7.0500,S
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,1,211536,13.0000,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,1,111369,30.0000,C
