# Joining DataFrames in Pandas

In [1]:
import pandas as pd

In [2]:
# Read the Titanic passenger records from the CSV in the local Data folder
# (the path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer='./Data/titanic.csv')

# Preview the first five rows as a quick sanity check on the load.
data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Summarise each column's dtype and non-null count to see which columns have missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
# Pull two columns out of the frame; selecting a single column label yields a Series.
titanic_name, titanic_age = data['Name'], data['Age']

# Confirm that both extracted objects are Series.
tuple(type(column) for column in (titanic_name, titanic_age))

(pandas.core.series.Series, pandas.core.series.Series)

## `pd.concat()`

In [5]:
# Stack the two Series end to end (row-wise, axis=0): the ages are appended
# below the names, giving one Series that is as long as both inputs combined.
pd.concat(objs=[titanic_name, titanic_age], axis='index')

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                                 27.0
887                                                 19.0
888                                                  NaN
889                                                 26.0
890                                                 32.0
Length: 1782, dtype: object

In [6]:
# Concatenate along the columns: the two Series are placed side by side,
# one column each, aligned on their shared row index.
name_age = pd.concat(objs=[titanic_name, titanic_age], axis='columns')

# Column-wise concatenation of Series yields a DataFrame; preview its first rows.
name_age.head(n=5)

Unnamed: 0,Name,Age
0,"Braund, Mr. Owen Harris",22.0
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0
2,"Heikkinen, Miss. Laina",26.0
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0
4,"Allen, Mr. William Henry",35.0


In [7]:
# A Series can be attached to an existing DataFrame the same way:
# here 'Sex' becomes a third column next to 'Name' and 'Age'.
pd.concat(objs=[name_age, data['Sex']], axis='columns')

Unnamed: 0,Name,Age,Sex
0,"Braund, Mr. Owen Harris",22.0,male
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,female
2,"Heikkinen, Miss. Laina",26.0,female
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,female
4,"Allen, Mr. William Henry",35.0,male
...,...,...,...
886,"Montvila, Rev. Juozas",27.0,male
887,"Graham, Miss. Margaret Edith",19.0,female
888,"Johnston, Miss. Catherine Helen ""Carrie""",,female
889,"Behr, Mr. Karl Howell",26.0,male


Similarly, I can also concatenate two dataframes into one. 

**NOTE**
- Pandas concatenation preserves indices, even if the result will have duplicate indices!
- To verify that the indices in the result of `pd.concat` do not overlap,
pass `verify_integrity=True`. With this flag set, the concatenation
raises a `ValueError` if there are duplicate indices.

In [None]:
# Row-wise concatenation of the first five names and the first five ages.
name_age_reduced = pd.concat(objs=[titanic_name.head(5), titanic_age.head(5)])

# Each input keeps its own row labels, so the labels 0-4 appear twice in the result.
name_age_reduced

0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
0                                                 22.0
1                                                 38.0
2                                                 26.0
3                                                 35.0
4                                                 35.0
dtype: object

- Ignoring the index during concatenation

In [8]:
# Two small per-store sales tables with identical columns, written one row per record.
store_a = pd.DataFrame(
    [
        {'Product': 'Pen', 'Sales': 100},
        {'Product': 'Pencil', 'Sales': 150},
    ]
)

store_b = pd.DataFrame(
    [
        {'Product': 'Notebook', 'Sales': 200},
        {'Product': 'Eraser', 'Sales': 80},
    ]
)

In [12]:
# Stack the two store tables row-wise and discard their original labels,
# so the result is renumbered 0..n-1 instead of repeating 0 and 1.
pd.concat(
    objs=[store_a, store_b],
    axis='index',
    ignore_index=True,
)

Unnamed: 0,Product,Sales
0,Pen,100
1,Pencil,150
2,Notebook,200
3,Eraser,80


- Using keys for Hierarchical Indexing

In [11]:
# Label each input with a key; the keys become the outer level of a
# hierarchical row index, so every row records which store it came from.
pd.concat(
    objs=[store_a, store_b],
    keys=['Store A', 'Store B'],
)

Unnamed: 0,Unnamed: 1,Product,Sales
Store A,0,Pen,100
Store A,1,Pencil,150
Store B,0,Notebook,200
Store B,1,Eraser,80


- Inner Join on Columns

In [13]:
# store_c shares only the 'Product' column with store_a.
store_c = pd.DataFrame([{'Product': 'Marker', 'Discount': 10}])

# join='inner' keeps only the columns present in every input,
# so 'Sales' and 'Discount' are dropped from the result.
pd.concat(
    objs=[store_a, store_c],
    join='inner',
    ignore_index=True,
)

Unnamed: 0,Product
0,Pen
1,Pencil
2,Marker


## `df.set_index()`

In [14]:
# Small score table used for the set_index examples, written one row per record.
df = pd.DataFrame(
    [
        {'Name': 'Alice', 'Score': 85},
        {'Name': 'Bob', 'Score': 90},
        {'Name': 'Charlie', 'Score': 95},
    ]
)

# Promote 'Name' from a regular column to the row index; the result is a new frame.
df.set_index(keys='Name')

Unnamed: 0_level_0,Score
Name,Unnamed: 1_level_1
Alice,85
Bob,90
Charlie,95


⚠️ Important: `set_index()` returns a new DataFrame; the original DataFrame is not changed unless you assign the result back to a variable!

In [15]:
# Use 'Name' as the row index while also keeping it as a regular column
# (drop=False stops set_index from removing the column).
df.set_index(keys='Name', drop=False)

Unnamed: 0_level_0,Name,Score
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Alice,Alice,85
Bob,Bob,90
Charlie,Charlie,95


In [16]:
# Build a MultiIndex: passing a list of columns to set_index turns them into
# the levels of a hierarchical row index (outer level 'Class', inner level 'Student').
df = pd.DataFrame(
    [
        {'Class': 'A', 'Student': 'Alice', 'Marks': 90},
        {'Class': 'A', 'Student': 'Bob', 'Marks': 85},
        {'Class': 'B', 'Student': 'Charlie', 'Marks': 88},
        {'Class': 'B', 'Student': 'David', 'Marks': 92},
    ]
)

df.set_index(keys=['Class', 'Student'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Marks
Class,Student,Unnamed: 2_level_1
A,Alice,90
A,Bob,85
B,Charlie,88
B,David,92


In [17]:
# Detect duplicate index values.
# verify_integrity=True makes set_index raise a ValueError when the new index
# would contain duplicates. ID 2 appears twice here, so the call fails.

df = pd.DataFrame({
    'ID': [1, 2, 2],
    'Value': ['A', 'B', 'C']
})

# The exception is the point of this cell, so catch it and show its message
# instead of letting it abort a "Restart & Run All" of the notebook.
try:
    df.set_index('ID', verify_integrity=True)
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

ValueError: Index has duplicate keys: Index([2], dtype='int64', name='ID')

## `df.join()`