## Objective: Get the column-name lists of two pandas DataFrames and find out which columns they share and which they do not.

------
Reference: https://note.nkmk.me/python-list-common/

In [9]:
# Import library
import pandas as pd
from IPython.display import display

# Three small demo frames that share FilmID / FilmName / LeadingActor and
# differ only in the name of the score column (Rating / Star / Point).

# Frame A: score column is "Rating".
dictionary_of_lists = dict(
    FilmID=[1, 2, 3, 4],
    FilmName=["Iron Man", "The Incredible Hulk", "Iron Man 2", "Thor"],
    LeadingActor=[
        "Robert Downey Jr.",
        "Edward Norton",
        "Robert Downey Jr.",
        "Chris Hemsworth",
    ],
    Rating=[7.9, 6.6, 6.9, 7],
)
df_a = pd.DataFrame(data=dictionary_of_lists)

# Frame B: same layout, but the score column is "Star".
dictionary_of_lists = dict(
    FilmID=[5, 6, 7, 8],
    FilmName=[
        "Captain America: The First Avenger",
        "The Avengers",
        "Iron Man 3",
        "Thor: The Dark World",
    ],
    LeadingActor=[
        "Chris Evans",
        "Robert Downey Jr.",
        "Robert Downey Jr.",
        "Chris Hemsworth",
    ],
    Star=[6.9, 8, 7.1, 6.8],
)
df_b = pd.DataFrame(data=dictionary_of_lists)

# Frame C: same layout, but the score column is "Point".
dictionary_of_lists = dict(
    FilmID=[9, 10, 11, 12],
    FilmName=[
        "Captain America: The Winter Soldier",
        "Guardians of the Galaxy",
        "Avengers: Age of Ultron",
        "Ant-Man",
    ],
    LeadingActor=["Chris Evans", "Chris Pratt", "Robert Downey Jr.", "Paul Rudd"],
    Point=[7.8, 8, 7.3, 7.3],
)
df_c = pd.DataFrame(data=dictionary_of_lists)

# display() accepts several objects and renders them one after another.
display(df_a, df_b, df_c)

Unnamed: 0,FilmID,FilmName,LeadingActor,Rating
0,1,Iron Man,Robert Downey Jr.,7.9
1,2,The Incredible Hulk,Edward Norton,6.6
2,3,Iron Man 2,Robert Downey Jr.,6.9
3,4,Thor,Chris Hemsworth,7.0


Unnamed: 0,FilmID,FilmName,LeadingActor,Star
0,5,Captain America: The First Avenger,Chris Evans,6.9
1,6,The Avengers,Robert Downey Jr.,8.0
2,7,Iron Man 3,Robert Downey Jr.,7.1
3,8,Thor: The Dark World,Chris Hemsworth,6.8


Unnamed: 0,FilmID,FilmName,LeadingActor,Point
0,9,Captain America: The Winter Soldier,Chris Evans,7.8
1,10,Guardians of the Galaxy,Chris Pratt,8.0
2,11,Avengers: Age of Ultron,Robert Downey Jr.,7.3
3,12,Ant-Man,Paul Rudd,7.3


In [11]:
# Column names as plain Python lists.
# "DataFrame.columns.tolist()" would work as well; the ".values" variant is
# said to be slightly faster (not measured in this notebook).
df_a_col, df_b_col, df_c_col = [
    frame.columns.values.tolist() for frame in (df_a, df_b, df_c)
]
# One list per line, in a/b/c order.
print(df_a_col, df_b_col, df_c_col, sep="\n")

['FilmID', 'FilmName', 'LeadingActor', 'Rating']
['FilmID', 'FilmName', 'LeadingActor', 'Star']
['FilmID', 'FilmName', 'LeadingActor', 'Point']


The `set` class returns the unique values of an iterable.
The `^` operator (symmetric difference) applied to two sets returns the values that appear in only one of the two lists.

In [15]:
# Show each column list as a set (duplicates removed, no guaranteed order),
# one set per line.
print(set(df_a_col), set(df_b_col), set(df_c_col), sep="\n")

{'Rating', 'FilmName', 'FilmID', 'LeadingActor'}
{'Star', 'FilmName', 'FilmID', 'LeadingActor'}
{'FilmName', 'Point', 'FilmID', 'LeadingActor'}


In [16]:
# Symmetric difference ("^"): the column names that occur in exactly one of
# the two lists, i.e. the columns df_a and df_b do NOT have in common.
# The right-hand operand must be df_b's columns; XOR-ing a set with itself
# always yields an empty set.
diff_of_ab = set(df_a_col) ^ set(df_b_col)
# Print the variable assigned above so the cell works on a fresh kernel.
print(diff_of_ab)

{'Star', 'Rating'}


If you want to return the values that are unique across multiple lists

In [22]:
# Alternative approach for any number of lists: pool every column name,
# then keep the names that occur exactly once in the pooled list.
all_column_of_abc = [*df_a_col, *df_b_col, *df_c_col]
print(all_column_of_abc)

print(set(all_column_of_abc))

# Iterate over the de-duplicated names so each candidate is tested once.
diff_of_abc = list(
    filter(
        lambda name: all_column_of_abc.count(name) == 1,
        set(all_column_of_abc),
    )
)
print(diff_of_abc)

['FilmID', 'FilmName', 'LeadingActor', 'Rating', 'FilmID', 'FilmName', 'LeadingActor', 'Star', 'FilmID', 'FilmName', 'LeadingActor', 'Point']
{'Star', 'FilmName', 'FilmID', 'Point', 'LeadingActor', 'Rating'}
['Star', 'Point', 'Rating']


### This also works when you want to investigate which columns were added to a previous DataFrame.

------

In [29]:
display(df_a)

# Copy of df_b without its "Star" column; df_b itself is left untouched.
df_b_new = df_b.drop(labels=["Star"], axis="columns")
display(df_b_new)

Unnamed: 0,FilmID,FilmName,LeadingActor,Rating
0,1,Iron Man,Robert Downey Jr.,7.9
1,2,The Incredible Hulk,Edward Norton,6.6
2,3,Iron Man 2,Robert Downey Jr.,6.9
3,4,Thor,Chris Hemsworth,7.0


Unnamed: 0,FilmID,FilmName,LeadingActor
0,5,Captain America: The First Avenger,Chris Evans
1,6,The Avengers,Robert Downey Jr.
2,7,Iron Man 3,Robert Downey Jr.
3,8,Thor: The Dark World,Chris Hemsworth


In [30]:
# Column names of df_a and of the reduced frame as plain Python lists.
# "DataFrame.columns.tolist()" would work as well; the ".values" variant is
# said to be slightly faster (not measured in this notebook).
df_a_col, df_b_new_col = [
    frame.columns.values.tolist() for frame in (df_a, df_b_new)
]
print(df_a_col, df_b_new_col, sep="\n")

['FilmID', 'FilmName', 'LeadingActor', 'Rating']
['FilmID', 'FilmName', 'LeadingActor']


In [31]:
# Column names present in exactly one of the two lists (symmetric difference).
diff_of_ab = set(df_a_col).symmetric_difference(set(df_b_new_col))
print(diff_of_ab)

{'Rating'}


### You can also find the common column names

------

In [32]:
# Render both frames, df_a first and then df_b.
display(df_a, df_b)

Unnamed: 0,FilmID,FilmName,LeadingActor,Rating
0,1,Iron Man,Robert Downey Jr.,7.9
1,2,The Incredible Hulk,Edward Norton,6.6
2,3,Iron Man 2,Robert Downey Jr.,6.9
3,4,Thor,Chris Hemsworth,7.0


Unnamed: 0,FilmID,FilmName,LeadingActor,Star
0,5,Captain America: The First Avenger,Chris Evans,6.9
1,6,The Avengers,Robert Downey Jr.,8.0
2,7,Iron Man 3,Robert Downey Jr.,7.1
3,8,Thor: The Dark World,Chris Hemsworth,6.8


In [34]:
# Column names that occur in both df_a and df_b (set intersection).
df_a_col, df_b_col = [frame.columns.values.tolist() for frame in (df_a, df_b)]
common = set(df_a_col).intersection(set(df_b_col))
display(common)

{'FilmID', 'FilmName', 'LeadingActor'}

### You can also compare the column names side by side

------
Reference: https://note.nkmk.me/python-pandas-dataframe-series-conversion/

In [35]:
# Render both frames, df_a first and then df_b.
display(df_a, df_b)

Unnamed: 0,FilmID,FilmName,LeadingActor,Rating
0,1,Iron Man,Robert Downey Jr.,7.9
1,2,The Incredible Hulk,Edward Norton,6.6
2,3,Iron Man 2,Robert Downey Jr.,6.9
3,4,Thor,Chris Hemsworth,7.0


Unnamed: 0,FilmID,FilmName,LeadingActor,Star
0,5,Captain America: The First Avenger,Chris Evans,6.9
1,6,The Avengers,Robert Downey Jr.,8.0
2,7,Iron Man 3,Robert Downey Jr.,7.1
3,8,Thor: The Dark World,Chris Hemsworth,6.8


In [36]:
# Put the two column-name lists next to each other: column "X" holds df_a's
# names, column "Y" holds df_b's names, aligned by position.
df_a_col, df_b_col = [frame.columns.values.tolist() for frame in (df_a, df_b)]
s1 = pd.Series(data=df_a_col, name="X")
s2 = pd.Series(data=df_b_col, name="Y")
compare_col = pd.concat(objs=[s1, s2], axis="columns")
display(compare_col)

Unnamed: 0,X,Y
0,FilmID,FilmID
1,FilmName,FilmName
2,LeadingActor,LeadingActor
3,Rating,Star
