## Concatenating dataframes side-by-side and below each other

This file is based on Aki Taanila's TilastoApu, https://nbviewer.org/github/taanila/data/blob/main/suodata.ipynb

When combining two dataframes next to each other (side-by-side), both dataframes need to have index based on what the rows are aligned.

When combining two dataframes on top of each other, columns with the same name get aligned.


In [1]:
import pandas as pd

pd.options.display.max_rows = None

<h2> Concatenating side-by-side</h2>

In [2]:
# Let's open the datafiles into the Notebook

# Data that we have already seen in previous Notebook and for which the new data will be added (The amount of days absent from work - sick)
df1 = pd.read_excel('http://staff.hamk.fi/~tlahti/PythonForDataScience/WorkPlaceSatisfactionSurveyData.xlsx')

# Data that we use to add more information into the original data (This includes the amount of days that the person has been absent from work - sick)
df2 = pd.read_excel('http://staff.hamk.fi/~tlahti/PythonForDataScience/DaysAbsent.xlsx')


df1

Unnamed: 0,number,gender,age,family,education,years_of_service,salary,sat_management,sat_colleques,sat_workingEnvironment,sat_salary,sat_tasks,healtcare,holidayCabin,gym,muscleCare
0,1,1,38,1,1.0,22.0,3587,3,3.0,3,3,3,,,,
1,2,1,29,2,2.0,10.0,2963,1,5.0,2,1,3,,,,
2,3,1,30,1,1.0,7.0,1989,3,4.0,1,1,3,1.0,,,
3,4,1,36,2,1.0,14.0,2144,3,3.0,3,3,3,1.0,,,
4,5,1,24,1,2.0,4.0,2183,2,3.0,2,1,2,1.0,,,
5,6,2,31,2,2.0,14.0,1910,4,4.0,5,2,4,1.0,1.0,,
6,7,1,49,1,2.0,16.0,2066,3,5.0,4,2,2,,,1.0,
7,8,1,55,1,1.0,0.0,2066,3,5.0,3,1,3,1.0,,,
8,9,1,40,2,1.0,23.0,2768,2,4.0,4,2,4,,1.0,,
9,10,1,33,1,1.0,16.0,2106,3,2.0,1,1,1,1.0,,,


In [3]:
# The amount of days person has been absent - sick
# the numbers in "number" column corresponds with each other. Hence number 6 in df1 is the same person as in df2
df2

Unnamed: 0,number,absent
0,6,20
1,7,1
2,8,7
3,45,5
4,46,4
5,47,2
6,48,0
7,49,5
8,50,0


In [4]:
# Let's move the "number" column into index
df1 = df1.set_index('number')
df2 = df2.set_index('number')

In [5]:
# Let's verify that the change was successful for df1
df1.head()

Unnamed: 0_level_0,gender,age,family,education,years_of_service,salary,sat_management,sat_colleques,sat_workingEnvironment,sat_salary,sat_tasks,healtcare,holidayCabin,gym,muscleCare
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,1,38,1,1.0,22.0,3587,3,3.0,3,3,3,,,,
2,1,29,2,2.0,10.0,2963,1,5.0,2,1,3,,,,
3,1,30,1,1.0,7.0,1989,3,4.0,1,1,3,1.0,,,
4,1,36,2,1.0,14.0,2144,3,3.0,3,3,3,1.0,,,
5,1,24,1,2.0,4.0,2183,2,3.0,2,1,2,1.0,,,


In [6]:
# Let's verify that the change was successful for df2
df2.head()

Unnamed: 0_level_0,absent
number,Unnamed: 1_level_1
6,20
7,1
8,7
45,5
46,4


In [7]:
# Next, we concatenate df1 and df2 side-by-side (axis=1 is used when we concatenate side-by-side
df3 = pd.concat([df1, df2], axis = 1)

In [8]:
# Checking for successful concatenation
df3

Unnamed: 0_level_0,gender,age,family,education,years_of_service,salary,sat_management,sat_colleques,sat_workingEnvironment,sat_salary,sat_tasks,healtcare,holidayCabin,gym,muscleCare,absent
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,1,38,1,1.0,22.0,3587,3,3.0,3,3,3,,,,,
2,1,29,2,2.0,10.0,2963,1,5.0,2,1,3,,,,,
3,1,30,1,1.0,7.0,1989,3,4.0,1,1,3,1.0,,,,
4,1,36,2,1.0,14.0,2144,3,3.0,3,3,3,1.0,,,,
5,1,24,1,2.0,4.0,2183,2,3.0,2,1,2,1.0,,,,
6,2,31,2,2.0,14.0,1910,4,4.0,5,2,4,1.0,1.0,,,20.0
7,1,49,1,2.0,16.0,2066,3,5.0,4,2,2,,,1.0,,1.0
8,1,55,1,1.0,0.0,2066,3,5.0,3,1,3,1.0,,,,7.0
9,1,40,2,1.0,23.0,2768,2,4.0,4,2,4,,1.0,,,
10,1,33,1,1.0,16.0,2106,3,2.0,1,1,1,1.0,,,,


<h2>Concatenating dataframes below each other</h2>

In [9]:
# WorkPlaceSatisfactionSurveyData_add.xlsx contains data that we want to put under df3
df4 = pd.read_excel('http://staff.hamk.fi/~tlahti/PythonForDataScience/WorkPlaceSatisfactionSurveyData_add.xlsx')

# Let's move the "number" column into index again
df4 = df4.set_index('number')
df4

Unnamed: 0_level_0,gender,age,family,education,years_of_service,salary,sat_management,sat_colleques,sat_workingEnvironment,palkkat,sat_tasks,healtCare,holidayCabin,gym,muscleCare
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
83,2,31,2,3,1,2800,4,4,4,3,3,1,,,1.0
84,1,26,1,2,0,2500,5,4,3,2,5,1,,,
85,1,32,2,2,0,2500,5,3,2,2,3,1,,1.0,
86,1,40,1,2,0,2500,3,4,3,2,3,1,,1.0,
87,1,31,1,4,1,3500,4,4,3,3,3,1,,1.0,
88,1,28,1,4,0,4600,4,3,2,4,2,1,,,
89,1,29,1,4,0,3900,4,3,3,4,3,1,,,
90,2,31,2,2,0,2400,4,3,2,2,3,1,,1.0,


In [10]:
# Now concatenating df3 and df4 below each other
df5 = pd.concat([df3, df4])
df5

Unnamed: 0_level_0,gender,age,family,education,years_of_service,salary,sat_management,sat_colleques,sat_workingEnvironment,sat_salary,sat_tasks,healtcare,holidayCabin,gym,muscleCare,absent,palkkat,healtCare
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,1,38,1,1.0,22.0,3587,3,3.0,3,3.0,3,,,,,,,
2,1,29,2,2.0,10.0,2963,1,5.0,2,1.0,3,,,,,,,
3,1,30,1,1.0,7.0,1989,3,4.0,1,1.0,3,1.0,,,,,,
4,1,36,2,1.0,14.0,2144,3,3.0,3,3.0,3,1.0,,,,,,
5,1,24,1,2.0,4.0,2183,2,3.0,2,1.0,2,1.0,,,,,,
6,2,31,2,2.0,14.0,1910,4,4.0,5,2.0,4,1.0,1.0,,,20.0,,
7,1,49,1,2.0,16.0,2066,3,5.0,4,2.0,2,,,1.0,,1.0,,
8,1,55,1,1.0,0.0,2066,3,5.0,3,1.0,3,1.0,,,,7.0,,
9,1,40,2,1.0,23.0,2768,2,4.0,4,2.0,4,,1.0,,,,,
10,1,33,1,1.0,16.0,2106,3,2.0,1,1.0,1,1.0,,,,,,


<h2>More info</h2>

- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html
    
In Finnish, Data-analytiikka Pythonilla: https://tilastoapu.wordpress.com/python/