In [2]:
import numpy as np
import pandas as pd

# Two yearly snapshots (2023 and 2024) of the same staff. Gaps are written both
# as None and as np.nan on purpose, to exercise the missing-value handling below.
data = {
    "Employee": [
        "Klaus Poppe", "Franz Bonaparta", np.nan, "Jakub Farobek",
        "Emil Šébe", "Helmuth Voss", "Josef Bäumler",
        "Klaus Poppe", "Franz Bonaparta", "Jakub Farobek",
        "Emil Šébe", "Helmuth Voss", "Josef Bäumler",
    ],
    "Salary": [
        1000, None, 320, 450, 420, np.nan, np.nan,
        1000, None, 450, 420, np.nan, np.nan,
    ],
    "Performance%": [
        98, None, 50, 10, 43, 2, 67,
        98, None, 10, np.nan, np.nan, 13,
    ],
    "Year": [2023] * 7 + [2024] * 6,
}

df = (
    pd.DataFrame(data)
    # Rows without an employee name are removed from the whole frame.
    # Assigning df["Employee"].dropna() back to the column does not work:
    # the assignment re-aligns on the index and the NaN reappears.
    .dropna(subset=["Employee"])
    # Missing salary / performance values become 0 so the casts below succeed.
    .fillna({"Salary": 0.0, "Performance%": 0.0})
    .astype({"Salary": "float32", "Performance%": "int32"})
)

df.head()

Unnamed: 0,Employee,Salary,Performance%,Year
0,Klaus Poppe,1000.0,98,2023
1,Franz Bonaparta,0.0,0,2023
3,Jakub Farobek,450.0,10,2023
4,Emil Šébe,420.0,43,2023
5,Helmuth Voss,0.0,2,2023


In [9]:
employee_series = df["Employee"]

# Preview how each full name breaks apart when split on a single space.
for full_name in employee_series:
    name_parts = str(full_name).split(" ")
    print(name_parts)

['Klaus', 'Poppe']
['Franz', 'Bonaparta']
['Jakub', 'Farobek']
['Emil', 'Šébe']
['Helmuth', 'Voss']
['Josef', 'Bäumler']
['Klaus', 'Poppe']
['Franz', 'Bonaparta']
['Jakub', 'Farobek']
['Emil', 'Šébe']
['Helmuth', 'Voss']
['Josef', 'Bäumler']


### V1 and V2

In [10]:
# V1 - plain Python lists.
employee_series = df["Employee"]
# Split every name once on a single space; each entry is [first, second].
name_parts = [str(item).split(" ") for item in employee_series]
first_names = [parts[0] for parts in name_parts]
second_names = [parts[1] for parts in name_parts]

print(first_names)
print(second_names)

# V2 - the same values held in pandas Series.
# Each Series is built in one call from a complete list. Growing an empty Series
# label by label with `s.loc[len(s)] = value` is not a vectorized operation: every
# assignment enlarges the Series, so it is slower than V1, not faster. A bare
# `pd.Series()` without a dtype also emits a warning on pandas versions before 2.0.
first_names = pd.Series(first_names, dtype=object)
second_names = pd.Series(second_names, dtype=object)

print(first_names.values)
print(second_names.values)

['Klaus', 'Franz', 'Jakub', 'Emil', 'Helmuth', 'Josef', 'Klaus', 'Franz', 'Jakub', 'Emil', 'Helmuth', 'Josef']
['Poppe', 'Bonaparta', 'Farobek', 'Šébe', 'Voss', 'Bäumler', 'Poppe', 'Bonaparta', 'Farobek', 'Šébe', 'Voss', 'Bäumler']
['Klaus' 'Franz' 'Jakub' 'Emil' 'Helmuth' 'Josef' 'Klaus' 'Franz' 'Jakub'
 'Emil' 'Helmuth' 'Josef']
['Poppe' 'Bonaparta' 'Farobek' 'Šébe' 'Voss' 'Bäumler' 'Poppe' 'Bonaparta'
 'Farobek' 'Šébe' 'Voss' 'Bäumler']


### V3

In [11]:
# V3 - one 2-D NumPy array whose first row is a ["Name", "Surname"] header.
employee_series = df["Employee"]

# Rows are collected in a Python list and converted to an array once.
# Calling np.concatenate inside the loop would allocate and copy the whole
# array again for every employee.
rows = [["Name", "Surname"]]
for parts in (str(item).split(" ") for item in employee_series):
    rows.append([parts[0], parts[1]])
name_surname = np.array(rows)

print(name_surname)

[['Name' 'Surname']
 ['Klaus' 'Poppe']
 ['Franz' 'Bonaparta']
 ['Jakub' 'Farobek']
 ['Emil' 'Šébe']
 ['Helmuth' 'Voss']
 ['Josef' 'Bäumler']
 ['Klaus' 'Poppe']
 ['Franz' 'Bonaparta']
 ['Jakub' 'Farobek']
 ['Emil' 'Šébe']
 ['Helmuth' 'Voss']
 ['Josef' 'Bäumler']]


### V4.1

In [3]:
employee_series = df["Employee"]
# expand=False keeps the result as a Series; each element is a list holding the
# pieces of one name.
# expand=True spreads the pieces over separate DataFrame columns instead, e.g.:
# employee_series.str.split(" ", expand=True).rename(columns={0: "Name", 1: "Surname"})
employee_series.str.split(pat=" ", expand=False)

0         [Klaus, Poppe]
1     [Franz, Bonaparta]
3       [Jakub, Farobek]
4           [Emil, Šébe]
5        [Helmuth, Voss]
6       [Josef, Bäumler]
7         [Klaus, Poppe]
8     [Franz, Bonaparta]
9       [Jakub, Farobek]
10          [Emil, Šébe]
11       [Helmuth, Voss]
12      [Josef, Bäumler]
Name: Employee, dtype: object

In [13]:
# V4.1 - split with pandas, then slice the columns out of a NumPy array.
employee_series = df["Employee"]

# expand=True yields one column per name part; np.array turns that frame
# into a 2-D array with one row per employee.
split_frame = employee_series.str.split(" ", expand=True)
names_array = np.array(split_frame)
first_names, last_names = names_array[:, 0], names_array[:, 1]

print(names_array)
print("-" * 30)
print(names_array.shape)
print(first_names)
print(last_names)

[['Klaus' 'Poppe']
 ['Franz' 'Bonaparta']
 ['Jakub' 'Farobek']
 ['Emil' 'Šébe']
 ['Helmuth' 'Voss']
 ['Josef' 'Bäumler']
 ['Klaus' 'Poppe']
 ['Franz' 'Bonaparta']
 ['Jakub' 'Farobek']
 ['Emil' 'Šébe']
 ['Helmuth' 'Voss']
 ['Josef' 'Bäumler']]
------------------------------
(12, 2)
['Klaus' 'Franz' 'Jakub' 'Emil' 'Helmuth' 'Josef' 'Klaus' 'Franz' 'Jakub'
 'Emil' 'Helmuth' 'Josef']
['Poppe' 'Bonaparta' 'Farobek' 'Šébe' 'Voss' 'Bäumler' 'Poppe' 'Bonaparta'
 'Farobek' 'Šébe' 'Voss' 'Bäumler']


### V4.2 - Splitting Columns 

In [14]:
# Work on a fresh copy so df keeps its original "Employee" column. Because the
# copy is new, there are no earlier "Name"/"Surname" columns to drop first.
df_copy = df.copy()

employee_series = df_copy["Employee"]
temp_df = employee_series.str.split(" ", expand=True)

# Put the two name parts in front of the other columns, then remove the
# combined column they were derived from.
name_column, surname_column = temp_df.iloc[:, 0], temp_df.iloc[:, 1]
df_copy.insert(loc=0, column="Name", value=name_column)
df_copy.insert(loc=1, column="Surname", value=surname_column)
df_copy = df_copy.drop(columns=["Employee"])
df_copy

Unnamed: 0,Name,Surname,Salary,Performance%,Year
0,Klaus,Poppe,1000.0,98,2023
1,Franz,Bonaparta,0.0,0,2023
3,Jakub,Farobek,450.0,10,2023
4,Emil,Šébe,420.0,43,2023
5,Helmuth,Voss,0.0,2,2023
6,Josef,Bäumler,0.0,67,2023
7,Klaus,Poppe,1000.0,98,2024
8,Franz,Bonaparta,0.0,0,2024
9,Jakub,Farobek,450.0,10,2024
10,Emil,Šébe,420.0,0,2024


In [17]:
# Deduplicating on "Name" alone keeps only the first row per name, so every
# 2024 row would be lost. The result is neither assigned nor displayed here,
# and df_copy itself is left unchanged.
df_copy.drop_duplicates(subset=["Name"], keep="first")
# Including "Year" in the key keeps one row per employee per year, so no data
# is lost. This is the result the cell displays.
df_copy.drop_duplicates(subset=["Name", "Year"], keep="first")

Unnamed: 0,Name,Surname,Salary,Performance%,Year
0,Klaus,Poppe,1000.0,98,2023
1,Franz,Bonaparta,0.0,0,2023
3,Jakub,Farobek,450.0,10,2023
4,Emil,Šébe,420.0,43,2023
5,Helmuth,Voss,0.0,2,2023
6,Josef,Bäumler,0.0,67,2023
7,Klaus,Poppe,1000.0,98,2024
8,Franz,Bonaparta,0.0,0,2024
9,Jakub,Farobek,450.0,10,2024
10,Emil,Šébe,420.0,0,2024
