In [1]:
# importing Libraries
import numpy as np
import pandas as pd
from io import StringIO
import requests
import warnings

In [2]:
# Function to directly download dataset
def dataset(link):
    warnings.filterwarnings('ignore', message='Unverified HTTPS request')
    response = requests.get(link, verify=False)
    if response.status_code == 200:
        return StringIO(response.text)

In [3]:
movies = pd.read_csv(dataset('https://drive.google.com/uc?export=download&id=1kar9nuLrEThYW_0gGGu36ie8jw-BtbwW'))


In [4]:
movies.head()

Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Director,Star1,No_of_Votes,Gross,Metascore
0,The Shawshank Redemption,1994,142,Drama,9.3,Frank Darabont,Tim Robbins,2343110,28341469.0,80.0
1,The Godfather,1972,175,Crime,9.2,Francis Ford Coppola,Marlon Brando,1620367,134966411.0,100.0
2,The Dark Knight,2008,152,Action,9.0,Christopher Nolan,Christian Bale,2303232,534858444.0,84.0
3,The Godfather: Part II,1974,202,Crime,9.0,Francis Ford Coppola,Al Pacino,1129952,57300000.0,90.0
4,12 Angry Men,1957,96,Crime,9.0,Sidney Lumet,Henry Fonda,689845,4360000.0,96.0


### Groupby is used to group the data based on a column in dataframe. groupby is always applied on Categorical Data.

In [6]:
genres = movies.groupby('Genre')

In [9]:
# on this groupby object we have multiple function(aggregation)
genres.min()
# The data below depicts for each genre the min series title,released_year etc.
# Similarly we have multiple function eg. max,std,var,mean,mode,median etc.

Unnamed: 0_level_0,Series_Title,Released_Year,Runtime,IMDB_Rating,Director,Star1,No_of_Votes,Gross,Metascore
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Action,300,1924,45,7.6,Abhishek Chaubey,Aamir Khan,25312,3296.0,33.0
Adventure,2001: A Space Odyssey,1925,88,7.6,Akira Kurosawa,Aamir Khan,29999,61001.0,41.0
Animation,Akira,1940,71,7.6,Adam Elliot,Adrian Molina,25229,128985.0,61.0
Biography,12 Years a Slave,1928,93,7.6,Adam McKay,Adrien Brody,27254,21877.0,48.0
Comedy,(500) Days of Summer,1921,68,7.6,Alejandro G. Iñárritu,Aamir Khan,26337,1305.0,45.0
Crime,12 Angry Men,1931,80,7.6,Akira Kurosawa,Ajay Devgn,27712,6013.0,47.0
Drama,1917,1925,64,7.6,Aamir Khan,Abhay Deol,25088,3600.0,28.0
Family,E.T. the Extra-Terrestrial,1971,100,7.8,Mel Stuart,Gene Wilder,178731,4000000.0,67.0
Fantasy,Das Cabinet des Dr. Caligari,1920,76,7.9,F.W. Murnau,Max Schreck,57428,337574718.0,
Film-Noir,Shadow of a Doubt,1941,100,7.8,Alfred Hitchcock,Humphrey Bogart,59556,449191.0,94.0


In [13]:
# Find the top 3 genre by total_earning
movies.groupby('Genre')['Gross'].sum().sort_values(ascending = False).head(3)

Genre
Drama     3.540997e+10
Action    3.263226e+10
Comedy    1.566387e+10
Name: Gross, dtype: float64

In [16]:
# Find the genre with highest avg IMDB Rating
movies.groupby('Genre')['IMDB_Rating'].mean().sort_values(ascending = False).head(1)

Genre
Western    8.35
Name: IMDB_Rating, dtype: float64

In [18]:
# Find Director with most popularity
# Assuming no_of_votes as a parameter for popularity
movies.groupby('Director')['No_of_Votes'].sum().sort_values(ascending=False).head(1)

Director
Christopher Nolan    11578345
Name: No_of_Votes, dtype: int64

In [19]:
# Find the highest rated movie of each genre


In [23]:
# Find number of movies done by each actor
movies.groupby('Star1')['Series_Title'].count()

Star1
Aamir Khan              7
Aaron Taylor-Johnson    1
Abhay Deol              1
Abraham Attah           1
Adam Driver             1
                       ..
Zbigniew Zamachowski    1
Zooey Deschanel         1
Çetin Tekindor          1
Éric Toledano           1
Ömer Faruk Sorak        1
Name: Series_Title, Length: 660, dtype: int64

### GroupBy Attributes and Methods
* find total number of groups -> len
* find items in each group -> size
* first()/last() -> nth item
* get_group -> vs filtering
* groups
* describe
* sample
* nunique

In [24]:
len(movies.groupby('Genre'))

14

In [27]:
movies.groupby('Genre').size()
# Every group has how many rows.

Genre
Action       172
Adventure     72
Animation     82
Biography     88
Comedy       155
Crime        107
Drama        289
Family         2
Fantasy        2
Film-Noir      3
Horror        11
Mystery       12
Thriller       1
Western        4
dtype: int64

In [33]:
# genres.first()
# movies.groupby('Genre').nth(6)
genres.last()

Unnamed: 0_level_0,Series_Title,Released_Year,Runtime,IMDB_Rating,Director,Star1,No_of_Votes,Gross,Metascore
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Action,Escape from Alcatraz,1979,112,7.6,Don Siegel,Clint Eastwood,121731,43000000.0,76.0
Adventure,Kelly's Heroes,1970,144,7.6,Brian G. Hutton,Clint Eastwood,45338,1378435.0,50.0
Animation,The Jungle Book,1967,78,7.6,Wolfgang Reitherman,Phil Harris,166409,141843612.0,65.0
Biography,Midnight Express,1978,121,7.6,Alan Parker,Brad Davis,73662,35000000.0,59.0
Comedy,Breakfast at Tiffany's,1961,115,7.6,Blake Edwards,Audrey Hepburn,166544,679874270.0,76.0
Crime,The 39 Steps,1935,86,7.6,Alfred Hitchcock,Robert Donat,51853,302787539.0,93.0
Drama,Lifeboat,1944,97,7.6,Alfred Hitchcock,Tallulah Bankhead,26471,852142728.0,78.0
Family,Willy Wonka & the Chocolate Factory,1971,100,7.8,Mel Stuart,Gene Wilder,178731,4000000.0,67.0
Fantasy,Nosferatu,1922,94,7.9,F.W. Murnau,Max Schreck,88794,445151978.0,
Film-Noir,Shadow of a Doubt,1943,108,7.8,Alfred Hitchcock,Teresa Wright,59556,123353292.0,94.0


In [36]:
genres.get_group('Western')

Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Director,Star1,No_of_Votes,Gross,Metascore
12,"Il buono, il brutto, il cattivo",1966,161,Western,8.8,Sergio Leone,Clint Eastwood,688390,6100000.0,90.0
48,Once Upon a Time in the West,1968,165,Western,8.5,Sergio Leone,Henry Fonda,302844,5321508.0,80.0
115,Per qualche dollaro in più,1965,132,Western,8.3,Sergio Leone,Clint Eastwood,232772,15000000.0,74.0
691,The Outlaw Josey Wales,1976,135,Western,7.8,Clint Eastwood,Clint Eastwood,65659,31800000.0,69.0


In [38]:
# describe
genres.describe()

Unnamed: 0_level_0,Runtime,Runtime,Runtime,Runtime,Runtime,Runtime,Runtime,Runtime,IMDB_Rating,IMDB_Rating,...,Gross,Gross,Metascore,Metascore,Metascore,Metascore,Metascore,Metascore,Metascore,Metascore
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Genre,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Action,172.0,129.046512,28.500706,45.0,110.75,127.5,143.25,321.0,172.0,7.949419,...,267443700.0,936662225.0,143.0,73.41958,12.421252,33.0,65.0,74.0,82.0,98.0
Adventure,72.0,134.111111,33.31732,88.0,109.0,127.0,149.0,228.0,72.0,7.9375,...,199807000.0,874211619.0,64.0,78.4375,12.345393,41.0,69.75,80.5,87.25,100.0
Animation,82.0,99.585366,14.530471,71.0,90.0,99.5,106.75,137.0,82.0,7.930488,...,252061200.0,873839108.0,75.0,81.093333,8.813646,61.0,75.0,82.0,87.5,96.0
Biography,88.0,136.022727,25.514466,93.0,120.0,129.0,146.25,209.0,88.0,7.938636,...,98299240.0,753585104.0,79.0,76.240506,11.028187,48.0,70.5,76.0,84.5,97.0
Comedy,155.0,112.129032,22.946213,68.0,96.0,106.0,124.5,188.0,155.0,7.90129,...,81078090.0,886752933.0,125.0,78.72,11.82916,45.0,72.0,79.0,88.0,99.0
Crime,107.0,126.392523,27.689231,80.0,106.5,122.0,141.5,229.0,107.0,8.016822,...,71021630.0,790482117.0,87.0,77.08046,13.099102,47.0,69.5,77.0,87.0,100.0
Drama,289.0,124.737024,27.74049,64.0,105.0,121.0,137.0,242.0,289.0,7.957439,...,116446100.0,924558264.0,241.0,79.701245,12.744687,28.0,72.0,82.0,89.0,100.0
Family,2.0,107.5,10.606602,100.0,103.75,107.5,111.25,115.0,2.0,7.8,...,327332900.0,435110554.0,2.0,79.0,16.970563,67.0,73.0,79.0,85.0,91.0
Fantasy,2.0,85.0,12.727922,76.0,80.5,85.0,89.5,94.0,2.0,8.0,...,418257700.0,445151978.0,0.0,,,,,,,
Film-Noir,3.0,104.0,4.0,100.0,102.0,104.0,106.0,108.0,3.0,7.966667,...,62730680.0,123353292.0,3.0,95.666667,1.527525,94.0,95.0,96.0,96.5,97.0


In [39]:
genres.sample()

Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Director,Star1,No_of_Votes,Gross,Metascore
155,Batman Begins,2005,140,Action,8.2,Christopher Nolan,Christian Bale,1308302,206852432.0,70.0
513,Children of Men,2006,109,Adventure,7.9,Alfonso Cuarón,Julianne Moore,465113,35552383.0,84.0
694,La planète sauvage,1973,72,Animation,7.8,René Laloux,Barry Bostwick,25229,193817.0,73.0
949,Blow,2001,124,Biography,7.6,Ted Demme,Johnny Depp,240714,52990775.0,52.0
701,Guess Who's Coming to Dinner,1967,108,Comedy,7.8,Stanley Kramer,Spencer Tracy,39642,56700000.0,63.0
639,Lilja 4-ever,2002,109,Crime,7.8,Lukas Moodysson,Oksana Akinshina,42673,181655.0,82.0
203,Room,2015,118,Drama,8.1,Lenny Abrahamson,Brie Larson,371538,14677674.0,86.0
698,Willy Wonka & the Chocolate Factory,1971,100,Family,7.8,Mel Stuart,Gene Wilder,178731,4000000.0,67.0
321,Das Cabinet des Dr. Caligari,1920,76,Fantasy,8.1,Robert Wiene,Werner Krauss,57428,337574718.0,
456,The Maltese Falcon,1941,100,Film-Noir,8.0,John Huston,Humphrey Bogart,148928,2108060.0,96.0


In [40]:
genres.nunique()

Unnamed: 0_level_0,Series_Title,Released_Year,Runtime,IMDB_Rating,Director,Star1,No_of_Votes,Gross,Metascore
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Action,172,61,78,15,123,121,172,172,50
Adventure,72,49,58,10,59,59,72,72,33
Animation,82,35,41,11,51,77,82,82,29
Biography,88,44,56,13,76,72,88,88,40
Comedy,155,72,70,11,113,133,155,155,44
Crime,106,56,65,14,86,85,107,107,39
Drama,289,83,95,14,211,250,288,287,52
Family,2,2,2,1,2,2,2,2,2
Fantasy,2,2,2,2,2,2,2,2,0
Film-Noir,3,3,3,3,3,3,3,3,3


In [42]:
# agg method -> dict as parameter
# to perform/apply multiple aggregation functions on multiple columns in one go
genres.agg(
    {
        'Runtime':'mean',
        'IMDB_Rating':'mean',
        'No_of_Votes':'sum',
        'Gross':'sum',
        'Metascore':'min'
    }
)

Unnamed: 0_level_0,Runtime,IMDB_Rating,No_of_Votes,Gross,Metascore
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Action,129.046512,7.949419,72282412,32632260000.0,33.0
Adventure,134.111111,7.9375,22576163,9496922000.0,41.0
Animation,99.585366,7.930488,21978630,14631470000.0,61.0
Biography,136.022727,7.938636,24006844,8276358000.0,48.0
Comedy,112.129032,7.90129,27620327,15663870000.0,45.0
Crime,126.392523,8.016822,33533615,8452632000.0,47.0
Drama,124.737024,7.957439,61367304,35409970000.0,28.0
Family,107.5,7.8,551221,439110600.0,67.0
Fantasy,85.0,8.0,146222,782726700.0,
Film-Noir,104.0,7.966667,367215,125910500.0,94.0


In [45]:
genres.agg(['min','max','sum'])

Unnamed: 0_level_0,Series_Title,Series_Title,Series_Title,Released_Year,Released_Year,Released_Year,Runtime,Runtime,Runtime,IMDB_Rating,...,Star1,No_of_Votes,No_of_Votes,No_of_Votes,Gross,Gross,Gross,Metascore,Metascore,Metascore
Unnamed: 0_level_1,min,max,sum,min,max,sum,min,max,sum,min,...,sum,min,max,sum,min,max,sum,min,max,sum
Genre,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Action,300,Yôjinbô,The Dark KnightThe Lord of the Rings: The Retu...,1924,2019,2008200320102001200219991980197719621954200019...,45,321,22196,7.6,...,Christian BaleElijah WoodLeonardo DiCaprioElij...,25312,2303232,72282412,3296.0,936662225.0,32632260000.0,33.0,98.0,10499.0
Adventure,2001: A Space Odyssey,Zombieland,InterstellarBack to the FutureInglourious Bast...,1925,PG,2014198520091981196819621959201319751963194819...,88,228,9656,7.6,...,Matthew McConaugheyMichael J. FoxBrad PittJürg...,29999,1512360,22576163,61001.0,874211619.0,9496922000.0,41.0,100.0,5020.0
Animation,Akira,Ôkami kodomo no Ame to Yuki,Sen to Chihiro no kamikakushiThe Lion KingHota...,1940,2020,2001199419882016201820172008199719952019200920...,71,137,8166,7.6,...,Daveigh ChaseRob MinkoffTsutomu TatsumiRyûnosu...,25229,999790,21978630,128985.0,873839108.0,14631470000.0,61.0,96.0,6082.0
Biography,12 Years a Slave,Zerkalo,Schindler's ListGoodfellasHamiltonThe Intoucha...,1928,2020,1993199020202011200220171995198420182013201320...,93,209,11970,7.6,...,Liam NeesonRobert De NiroLin-Manuel MirandaÉri...,27254,1213505,24006844,21877.0,753585104.0,8276358000.0,48.0,97.0,6023.0
Comedy,(500) Days of Summer,Zindagi Na Milegi Dobara,GisaengchungLa vita è bellaModern TimesCity Li...,1921,2020,2019199719361931200919641940200120001973196019...,68,188,17380,7.6,...,Kang-ho SongRoberto BenigniCharles ChaplinChar...,26337,939631,27620327,1305.0,886752933.0,15663870000.0,45.0,99.0,9840.0
Crime,12 Angry Men,À bout de souffle,The GodfatherThe Godfather: Part II12 Angry Me...,1931,2019,1972197419571994200219991995199120192006199519...,80,229,13524,7.6,...,Marlon BrandoAl PacinoHenry FondaJohn Travolta...,27712,1826188,33533615,6013.0,790482117.0,8452632000.0,47.0,100.0,6706.0
Drama,1917,Zwartboek,The Shawshank RedemptionFight ClubForrest Gump...,1925,2020,1994199919941975202019981946201420061998198819...,64,242,36049,7.6,...,Tim RobbinsBrad PittTom HanksJack NicholsonSur...,25088,2343110,61367304,3600.0,924558264.0,35409970000.0,28.0,100.0,19208.0
Family,E.T. the Extra-Terrestrial,Willy Wonka & the Chocolate Factory,E.T. the Extra-TerrestrialWilly Wonka & the Ch...,1971,1982,19821971,100,115,215,7.8,...,Henry ThomasGene Wilder,178731,372490,551221,4000000.0,435110554.0,439110600.0,67.0,91.0,158.0
Fantasy,Das Cabinet des Dr. Caligari,Nosferatu,Das Cabinet des Dr. CaligariNosferatu,1920,1922,19201922,76,94,170,7.9,...,Werner KraussMax Schreck,57428,88794,146222,337574718.0,445151978.0,782726700.0,,,0.0
Film-Noir,Shadow of a Doubt,The Third Man,The Third ManThe Maltese FalconShadow of a Doubt,1941,1949,194919411943,100,108,312,7.8,...,Orson WellesHumphrey BogartTeresa Wright,59556,158731,367215,449191.0,123353292.0,125910500.0,94.0,97.0,287.0


In [46]:
genres.agg(
    {
        'Runtime':['min','mean'],
        'IMDB_Rating':'mean',
        'No_of_Votes':['sum','max'],
        'Gross':'sum',
        'Metascore':'min'
    }
)

Unnamed: 0_level_0,Runtime,Runtime,IMDB_Rating,No_of_Votes,No_of_Votes,Gross,Metascore
Unnamed: 0_level_1,min,mean,mean,sum,max,sum,min
Genre,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Action,45,129.046512,7.949419,72282412,2303232,32632260000.0,33.0
Adventure,88,134.111111,7.9375,22576163,1512360,9496922000.0,41.0
Animation,71,99.585366,7.930488,21978630,999790,14631470000.0,61.0
Biography,93,136.022727,7.938636,24006844,1213505,8276358000.0,48.0
Comedy,68,112.129032,7.90129,27620327,939631,15663870000.0,45.0
Crime,80,126.392523,8.016822,33533615,1826188,8452632000.0,47.0
Drama,64,124.737024,7.957439,61367304,2343110,35409970000.0,28.0
Family,100,107.5,7.8,551221,372490,439110600.0,67.0
Fantasy,76,85.0,8.0,146222,88794,782726700.0,
Film-Noir,100,104.0,7.966667,367215,158731,125910500.0,94.0


### Looping on Groups

In [51]:
for group,data in genres:
    print(data[data['IMDB_Rating'] == data['IMDB_Rating'].max()])
# Find the highest rated movie of each genre


      Series_Title Released_Year  Runtime   Genre  IMDB_Rating  \
2  The Dark Knight          2008      152  Action          9.0   

            Director           Star1  No_of_Votes        Gross  Metascore  
2  Christopher Nolan  Christian Bale      2303232  534858444.0       84.0  
    Series_Title Released_Year  Runtime      Genre  IMDB_Rating  \
21  Interstellar          2014      169  Adventure          8.6   

             Director                Star1  No_of_Votes        Gross  \
21  Christopher Nolan  Matthew McConaughey      1512360  188020017.0   

    Metascore  
21       74.0  
                     Series_Title Released_Year  Runtime      Genre  \
23  Sen to Chihiro no kamikakushi          2001      125  Animation   

    IMDB_Rating        Director          Star1  No_of_Votes       Gross  \
23          8.6  Hayao Miyazaki  Daveigh Chase       651376  10055859.0   

    Metascore  
23       96.0  
       Series_Title Released_Year  Runtime      Genre  IMDB_Rating  \
7  Schi

In [53]:
# Apply function
# split - apply -> combine

In [55]:
# Question: find number of movies starting with A for each group
def gc(group):
    return group['Series_Title'].str.startswith('A').sum()
genres.apply(gc)

  genres.apply(gc)


Genre
Action       10
Adventure     2
Animation     2
Biography     9
Comedy       14
Crime         4
Drama        21
Family        0
Fantasy       0
Film-Noir     0
Horror        1
Mystery       0
Thriller      0
Western       0
dtype: int64

In [57]:
# find ranking of each movie in the group according to IMDB score
def rank_movie(group):
    group['Genre_rank'] = group['IMDB_Rating'].rank(ascending = False)
    return group
genres.apply(rank_movie)

  genres.apply(rank_movie)


Unnamed: 0_level_0,Unnamed: 1_level_0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Director,Star1,No_of_Votes,Gross,Metascore,Genre_rank
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Action,2,The Dark Knight,2008,152,Action,9.0,Christopher Nolan,Christian Bale,2303232,534858444.0,84.0,1.0
Action,5,The Lord of the Rings: The Return of the King,2003,201,Action,8.9,Peter Jackson,Elijah Wood,1642758,377845905.0,94.0,2.0
Action,8,Inception,2010,148,Action,8.8,Christopher Nolan,Leonardo DiCaprio,2067042,292576195.0,74.0,3.5
Action,10,The Lord of the Rings: The Fellowship of the Ring,2001,178,Action,8.8,Peter Jackson,Elijah Wood,1661481,315544750.0,92.0,3.5
Action,13,The Lord of the Rings: The Two Towers,2002,179,Action,8.7,Peter Jackson,Elijah Wood,1485555,342551365.0,87.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...
Thriller,700,Wait Until Dark,1967,108,Thriller,7.8,Terence Young,Audrey Hepburn,27733,17550741.0,81.0,1.0
Western,12,"Il buono, il brutto, il cattivo",1966,161,Western,8.8,Sergio Leone,Clint Eastwood,688390,6100000.0,90.0,1.0
Western,48,Once Upon a Time in the West,1968,165,Western,8.5,Sergio Leone,Henry Fonda,302844,5321508.0,80.0,2.0
Western,115,Per qualche dollaro in più,1965,132,Western,8.3,Sergio Leone,Clint Eastwood,232772,15000000.0,74.0,3.0


### Group by on multiple Columns

In [59]:
movies.groupby(['Director','Star1']).size()

Director             Star1         
Aamir Khan           Amole Gupte       1
Aaron Sorkin         Eddie Redmayne    1
Abdellatif Kechiche  Léa Seydoux       1
Abhishek Chaubey     Shahid Kapoor     1
Abhishek Kapoor      Amit Sadh         1
                                      ..
Zaza Urushadze       Lembit Ulfsak     1
Zoya Akhtar          Hrithik Roshan    1
                     Vijay Varma       1
Çagan Irmak          Çetin Tekindor    1
Ömer Faruk Sorak     Cem Yilmaz        1
Length: 898, dtype: int64

In [63]:
# find the best(in-terms of metascore(avg)) actor->genre combo
movies.groupby(['Star1','Genre'])['Metascore'].mean().reset_index().sort_values('Metascore',ascending = False).head(1)

Unnamed: 0,Star1,Genre,Metascore
230,Ellar Coltrane,Drama,100.0


### Exercise / Practice

In [66]:
ipl = pd.read_csv(dataset('https://drive.google.com/uc?export=download&id=1xzluAOVoDwpcFg0YiqF0EOUAh__SK2b4'))
ipl.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,...,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,player_dismissed,dismissal_kind,fielder
0,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,1,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,0,0,,,
1,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,2,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,0,0,,,
2,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,3,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,4,0,4,,,
3,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,4,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,0,0,,,
4,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,5,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,2,2,,,


In [68]:
ipl.shape

(179078, 21)

In [71]:
# find the top 10 batsman in terms of runs
ipl.groupby('batsman')['batsman_runs'].sum().sort_values(ascending=False).head(10)

batsman
V Kohli           5434
SK Raina          5415
RG Sharma         4914
DA Warner         4741
S Dhawan          4632
CH Gayle          4560
MS Dhoni          4477
RV Uthappa        4446
AB de Villiers    4428
G Gambhir         4223
Name: batsman_runs, dtype: int64

In [78]:
# find the batsman with max no of sixes
sixes = ipl[ipl['batsman_runs'] == 6]
sixes.groupby('batsman')['batsman'].count().sort_values(ascending=False).head(1).index[0]

'CH Gayle'

In [81]:
# find batsman with most number of 4's and 6's in last 5 overs
ipl[((ipl['batsman_runs'] == 4) | (ipl['batsman_runs'] == 6)) & (ipl['over']>15)].groupby('batsman')['batsman'].count().sort_values(ascending=False).head(1).index[0]


'MS Dhoni'

In [82]:
# find V Kohli's record against all teams
temp_df = ipl[ipl['batsman'] == 'V Kohli']
temp_df.groupby('bowling_team')['batsman_runs'].sum().reset_index()

Unnamed: 0,bowling_team,batsman_runs
0,Chennai Super Kings,749
1,Deccan Chargers,306
2,Delhi Capitals,66
3,Delhi Daredevils,763
4,Gujarat Lions,283
5,Kings XI Punjab,636
6,Kochi Tuskers Kerala,50
7,Kolkata Knight Riders,675
8,Mumbai Indians,628
9,Pune Warriors,128
