In [None]:
import pandas as pd

In [None]:
# Read the two IMDb CSV exports from the working directory.
# low_memory=False makes pandas parse the movies file in one pass rather than
# in internal chunks, so each column gets a single inferred dtype.
df_movies = pd.read_csv(
    'IMDb movies.csv',
    low_memory=False,
)
df_ratings = pd.read_csv('IMDb ratings.csv')

In [None]:
# Narrow both frames to the columns used in the concat/merge examples.
# 'imdb_title_id' is kept in each because it is the shared join key.
df_movies = df_movies.loc[:, [
    'imdb_title_id', 'title', 'year',
    'genre', 'country', 'director', 'actors',
]]

df_ratings = df_ratings.loc[:, ['imdb_title_id', 'total_votes', 'mean_vote']]

# concat()

## Concatenate vertically

In [11]:
# Two small frames with identical columns, used to demonstrate row-wise stacking.
df1 = pd.DataFrame({
    'id': ['A', 'B', 'C', 'D'],
    'age': [30, 23, 25, 22],
})
df2 = pd.DataFrame({
    'id': ['E', 'F', 'G', 'F'],
    'age': [40, 21, 19, 24],
})

# Stack df2 under df1; ignore_index=True renumbers the rows 0..7 instead of
# repeating each frame's own 0..3 labels.
pd.concat(objs=[df1, df2], axis=0, ignore_index=True)

Unnamed: 0,id,age
0,A,30
1,B,23
2,C,25
3,D,22
4,E,40
5,F,21
6,G,19
7,F,24


In [14]:
# 50% sample
# random_state pins the draw so the same rows are selected on every
# Restart & Run All; without it the sample (and all outputs derived
# from it) changes on each execution.
df_sample = df_movies.sample(frac=0.5, random_state=42)
print(df_movies.shape)
print(df_sample.shape)

(85855, 7)
(42928, 7)


In [16]:
# Stack the sample on top of the full movies table. Row-wise (axis=0) is the
# default direction for pd.concat, and because ignore_index is not set the
# original row labels of both frames are carried over unchanged.
df_concat_vertically = pd.concat(objs=[df_sample, df_movies])
df_concat_vertically.head()

Unnamed: 0,imdb_title_id,title,year,genre,country,director,actors
70175,tt3173910,Hasee Toh Phasee,2014,"Comedy, Romance",India,Vinil Mathew,"Sidharth Malhotra, Parineeti Chopra, Adah Shar..."
28454,tt0111168,Sherlock: Undercover Dog,1994,"Adventure, Comedy, Crime",USA,Richard Harding Gardner,"Benjamin Eroen, Anthony Simmons, Cooper Camero..."
60290,tt1640202,Scene di un'estate,2010,Comedy,Turkey,Ozan Açiktan,"Gülsüm Alkan, Ibrahim Büyükak, Emre Canpolat, ..."
155,tt0010193,The Greatest Question,1919,Drama,USA,D.W. Griffith,"Lillian Gish, Robert Harron, Ralph Graves, Eug..."
42427,tt0332381,Hafið,2002,"Comedy, Drama","Iceland, France, Norway",Baltasar Kormákur,"Gunnar Eyjólfsson, Hilmir Snær Guðnason, Hélèn..."


## Concatenate horizontally

In [17]:
# Two small frames with the same default row labels but different columns,
# used to demonstrate column-wise concatenation.
df1 = pd.DataFrame({
    'id': ['A', 'B', 'C', 'D'],
    'age': [30, 23, 25, 22],
})
df2 = pd.DataFrame({
    'job': ['Doctor', 'Statistician', 'Accountant', 'Developer'],
})

# axis=1 places df2's columns to the right of df1's, aligning rows by index label.
pd.concat(objs=[df1, df2], axis=1)

Unnamed: 0,id,age,job
0,A,30,Doctor
1,B,23,Statistician
2,C,25,Accountant
3,D,22,Developer


In [21]:
# Column-wise concatenation of movies and ratings, matched on 'imdb_title_id'.
# pd.concat aligns rows by index label, so the key column is first promoted
# to the index of each frame.
movies_by_id = df_movies.set_index('imdb_title_id')
ratings_by_id = df_ratings.set_index('imdb_title_id')
df_concat_horizontally = pd.concat([movies_by_id, ratings_by_id], axis=1)

df_concat_horizontally.head()


Unnamed: 0_level_0,title,year,genre,country,director,actors,total_votes,mean_vote
imdb_title_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
tt0000009,Miss Jerry,1894,Romance,USA,Alexander Black,"Blanche Bayliss, William Courtenay, Chauncey D...",154,5.9
tt0000574,The Story of the Kelly Gang,1906,"Biography, Crime, Drama",Australia,Charles Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be...",589,6.3
tt0001892,Den sorte drøm,1911,Drama,"Germany, Denmark",Urban Gad,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",188,6.0
tt0002101,Cleopatra,1912,"Drama, History",USA,Charles L. Gaskill,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",446,5.3
tt0002130,L'Inferno,1911,"Adventure, Drama, Fantasy",Italy,"Francesco Bertolini, Adolfo Padovan","Salvatore Papa, Arturo Pirovano, Giuseppe de L...",2237,6.9


# merge()

In [23]:
# Demo frames for the join examples: ids 'C' and 'D' appear in both,
# 'A'/'B' only in df1 and 'E'/'F' only in df2.
df1 = pd.DataFrame({
    'id': ['A', 'B', 'C', 'D'],
    'age': [30, 23, 25, 22],
})
df2 = pd.DataFrame({
    'id': ['C', 'D', 'E', 'F'],
    'job': ['Doctor', 'Statistician', 'Accountant', 'Developer'],
})

## Inner Join

In [24]:
# Inner join: keep only the ids present in both df1 and df2.
pd.merge(df1, df2, how='inner', on='id')

Unnamed: 0,id,age,job
0,C,25,Doctor
1,D,22,Statistician


In [28]:
# Inner join of movies and ratings on the shared title id
# (how='inner' is merge's default, spelled out here for clarity).
pd.merge(df_movies, df_ratings, how='inner', on='imdb_title_id')

Unnamed: 0,imdb_title_id,title,year,genre,country,director,actors,total_votes,mean_vote
0,tt0000009,Miss Jerry,1894,Romance,USA,Alexander Black,"Blanche Bayliss, William Courtenay, Chauncey D...",154,5.9
1,tt0000574,The Story of the Kelly Gang,1906,"Biography, Crime, Drama",Australia,Charles Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be...",589,6.3
2,tt0001892,Den sorte drøm,1911,Drama,"Germany, Denmark",Urban Gad,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",188,6.0
3,tt0002101,Cleopatra,1912,"Drama, History",USA,Charles L. Gaskill,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",446,5.3
4,tt0002130,L'Inferno,1911,"Adventure, Drama, Fantasy",Italy,"Francesco Bertolini, Adolfo Padovan","Salvatore Papa, Arturo Pirovano, Giuseppe de L...",2237,6.9
...,...,...,...,...,...,...,...,...,...
85850,tt9908390,Le lion,2020,Comedy,"France, Belgium",Ludovic Colbeau-Justin,"Dany Boon, Philippe Katerine, Anne Serra, Samu...",398,5.5
85851,tt9911196,De Beentjes van Sint-Hildegard,2020,"Comedy, Drama",Netherlands,Johan Nijenhuis,"Herman Finkers, Johanna ter Steege, Leonie ter...",724,7.9
85852,tt9911774,Padmavyuhathile Abhimanyu,2019,Drama,India,Vineesh Aaradya,"Anoop Chandran, Indrans, Sona Nair, Simon Brit...",265,7.8
85853,tt9914286,Sokagin Çocuklari,2019,"Drama, Family",Turkey,Ahmet Faik Akinci,"Ahmet Faik Akinci, Belma Mamati, Metin Keçeci,...",194,9.4


## Full Join

In [29]:
# Full (outer) join: keep every id from either frame; columns from the
# side that has no match are filled with NaN.
pd.merge(df1, df2, how='outer', on='id')

Unnamed: 0,id,age,job
0,A,30.0,
1,B,23.0,
2,C,25.0,Doctor
3,D,22.0,Statistician
4,E,,Accountant
5,F,,Developer


In [30]:
# Full (outer) join of movies and ratings on the shared title id.
pd.merge(df_movies, df_ratings, how='outer', on='imdb_title_id')

Unnamed: 0,imdb_title_id,title,year,genre,country,director,actors,total_votes,mean_vote
0,tt0000009,Miss Jerry,1894,Romance,USA,Alexander Black,"Blanche Bayliss, William Courtenay, Chauncey D...",154,5.9
1,tt0000574,The Story of the Kelly Gang,1906,"Biography, Crime, Drama",Australia,Charles Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be...",589,6.3
2,tt0001892,Den sorte drøm,1911,Drama,"Germany, Denmark",Urban Gad,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",188,6.0
3,tt0002101,Cleopatra,1912,"Drama, History",USA,Charles L. Gaskill,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",446,5.3
4,tt0002130,L'Inferno,1911,"Adventure, Drama, Fantasy",Italy,"Francesco Bertolini, Adolfo Padovan","Salvatore Papa, Arturo Pirovano, Giuseppe de L...",2237,6.9
...,...,...,...,...,...,...,...,...,...
85850,tt9908390,Le lion,2020,Comedy,"France, Belgium",Ludovic Colbeau-Justin,"Dany Boon, Philippe Katerine, Anne Serra, Samu...",398,5.5
85851,tt9911196,De Beentjes van Sint-Hildegard,2020,"Comedy, Drama",Netherlands,Johan Nijenhuis,"Herman Finkers, Johanna ter Steege, Leonie ter...",724,7.9
85852,tt9911774,Padmavyuhathile Abhimanyu,2019,Drama,India,Vineesh Aaradya,"Anoop Chandran, Indrans, Sona Nair, Simon Brit...",265,7.8
85853,tt9914286,Sokagin Çocuklari,2019,"Drama, Family",Turkey,Ahmet Faik Akinci,"Ahmet Faik Akinci, Belma Mamati, Metin Keçeci,...",194,9.4


## Exclusive Outer Join

In [31]:

# indicator=True appends a '_merge' column recording where each row came
# from: 'left_only', 'right_only' or 'both'.
pd.merge(df1, df2, how='outer', on='id', indicator=True)

Unnamed: 0,id,age,job,_merge
0,A,30.0,,left_only
1,B,23.0,,left_only
2,C,25.0,Doctor,both
3,D,22.0,Statistician,both
4,E,,Accountant,right_only
5,F,,Developer,right_only


In [32]:
# Exclusive outer join: run a flagged outer join, then use query() to keep
# only the rows that were found on exactly one side.
(
    pd.merge(df1, df2, how='outer', on='id', indicator=True)
    .query("_merge=='left_only' or _merge=='right_only'")
)

Unnamed: 0,id,age,job,_merge
0,A,30.0,,left_only
1,B,23.0,,left_only
4,E,,Accountant,right_only
5,F,,Developer,right_only


In [33]:
# Exclusive outer join of movies and ratings: rows whose title id exists in
# only one of the two tables.
(
    pd.merge(df_movies, df_ratings, how='outer', on='imdb_title_id', indicator=True)
    .query("_merge=='left_only' or _merge=='right_only'")
)

Unnamed: 0,imdb_title_id,title,year,genre,country,director,actors,total_votes,mean_vote,_merge


## Left Join

In [34]:
# Left join: keep every row of df1 and attach df2's columns where the id matches.
pd.merge(df1, df2, how='left', on='id')

Unnamed: 0,id,age,job
0,A,30,
1,B,23,
2,C,25,Doctor
3,D,22,Statistician


In [35]:
# Exclusive left join: rows of df1 whose id has no match in df2.
(
    pd.merge(df1, df2, how='left', on='id', indicator=True)
    .query("_merge=='left_only'")
)

Unnamed: 0,id,age,job,_merge
0,A,30,,left_only
1,B,23,,left_only


In [37]:
# Work on a copy so the original df_movies is left untouched.
df_movies_2 = df_movies.copy()

# Overwrite 'imdb_title_id' with the placeholder 'tt1234567890' for every row
# whose index label is below 1000 (the first 1000 rows when the frame has the
# default 0..n-1 index). A single boolean-mask assignment replaces the
# original Python loop that visited every index label and wrote one cell at
# a time; the rows affected are the same.
df_movies_2.loc[df_movies_2.index < 1000, 'imdb_title_id'] = 'tt1234567890'

# Exclusive left join of df_movies_2 and df_ratings: do a flagged outer join,
# then keep only the rows that exist on the left side alone, i.e. the rows
# whose id was replaced by the placeholder and no longer matches a rating.
df_exclusive_left = df_movies_2.merge(df_ratings,
                                      on='imdb_title_id',
                                      how='outer',
                                      indicator=True).query("_merge=='left_only'")

df_exclusive_left

Unnamed: 0,imdb_title_id,title,year,genre,country,director,actors,total_votes,mean_vote,_merge
55817,tt1234567890,Miss Jerry,1894,Romance,USA,Alexander Black,"Blanche Bayliss, William Courtenay, Chauncey D...",,,left_only
55818,tt1234567890,The Story of the Kelly Gang,1906,"Biography, Crime, Drama",Australia,Charles Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be...",,,left_only
55819,tt1234567890,Den sorte drøm,1911,Drama,"Germany, Denmark",Urban Gad,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",,,left_only
55820,tt1234567890,Cleopatra,1912,"Drama, History",USA,Charles L. Gaskill,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",,,left_only
55821,tt1234567890,L'Inferno,1911,"Adventure, Drama, Fantasy",Italy,"Francesco Bertolini, Adolfo Padovan","Salvatore Papa, Arturo Pirovano, Giuseppe de L...",,,left_only
...,...,...,...,...,...,...,...,...,...,...
56812,tt1234567890,Agente segreto Z1,1930,Drama,USA,Roy Del Ruth,"Constance Bennett, Erich von Stroheim, Anthony...",,,left_only
56813,tt1234567890,Tom Sawyer,1930,"Adventure, Comedy, Drama",USA,John Cromwell,"Jackie Coogan, Junior Durkin, Mitzi Green, Luc...",,,left_only
56814,tt1234567890,Tonka Sibenice,1930,Drama,"Czechoslovakia, Germany",Karl Anton,"Ita Rina, Vera Baranovskaya, Josef Rovenský, A...",,,left_only
56815,tt1234567890,Top Speed,1930,"Comedy, Musical, Romance",USA,Mervyn LeRoy,"Joe E. Brown, Bernice Claire, Jack Whiting, Fr...",,,left_only


## Right Join

In [38]:
# Right join: keep every row of df2 and attach df1's columns where the id matches.
pd.merge(df1, df2, how='right', on='id')

Unnamed: 0,id,age,job
0,C,25.0,Doctor
1,D,22.0,Statistician
2,E,,Accountant
3,F,,Developer


In [39]:
# Exclusive right join: rows of df2 whose id has no match in df1.
(
    pd.merge(df1, df2, how='right', on='id', indicator=True)
    .query("_merge=='right_only'")
)

Unnamed: 0,id,age,job,_merge
2,E,,Accountant,right_only
3,F,,Developer,right_only
