## Group by Objects

In [37]:
import pandas as pd
import numpy as np

In [38]:
movies = pd.read_csv('datasets/imdb-top-1000.csv')

In [39]:
movies

Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Director,Star1,No_of_Votes,Gross,Metascore
0,The Shawshank Redemption,1994,142,Drama,9.3,Frank Darabont,Tim Robbins,2343110,28341469.0,80.0
1,The Godfather,1972,175,Crime,9.2,Francis Ford Coppola,Marlon Brando,1620367,134966411.0,100.0
2,The Dark Knight,2008,152,Action,9.0,Christopher Nolan,Christian Bale,2303232,534858444.0,84.0
3,The Godfather: Part II,1974,202,Crime,9.0,Francis Ford Coppola,Al Pacino,1129952,57300000.0,90.0
4,12 Angry Men,1957,96,Crime,9.0,Sidney Lumet,Henry Fonda,689845,4360000.0,96.0
...,...,...,...,...,...,...,...,...,...,...
995,Breakfast at Tiffany's,1961,115,Comedy,7.6,Blake Edwards,Audrey Hepburn,166544,679874270.0,76.0
996,Giant,1956,201,Drama,7.6,George Stevens,Elizabeth Taylor,34075,195217415.0,84.0
997,From Here to Eternity,1953,118,Drama,7.6,Fred Zinnemann,Burt Lancaster,43374,30500000.0,85.0
998,Lifeboat,1944,97,Drama,7.6,Alfred Hitchcock,Tallulah Bankhead,26471,852142728.0,78.0


In [40]:
genres = movies.groupby('Genre')

In [41]:
# applyig bult-in aggregation(sum,mean,max,etc) functions on groupby objects
genres.mean(numeric_only=True)
genres.max()


Unnamed: 0_level_0,Series_Title,Released_Year,Runtime,IMDB_Rating,Director,Star1,No_of_Votes,Gross,Metascore
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Action,Yôjinbô,2019,321,9.0,Zack Snyder,Yun-Fat Chow,2303232,936662225.0,98.0
Adventure,Zombieland,PG,228,8.6,Ömer Faruk Sorak,Yves Montand,1512360,874211619.0,100.0
Animation,Ôkami kodomo no Ame to Yuki,2020,137,8.6,Yoshifumi Kondô,Yôji Matsuda,999790,873839108.0,96.0
Biography,Zerkalo,2020,209,8.9,Tom McCarthy,Éric Toledano,1213505,753585104.0,97.0
Comedy,Zindagi Na Milegi Dobara,2020,188,8.6,Zoya Akhtar,Ömer Faruk Sorak,939631,886752933.0,99.0
Crime,À bout de souffle,2019,229,9.2,Yavuz Turgul,Vincent Cassel,1826188,790482117.0,100.0
Drama,Zwartboek,2020,242,9.3,Çagan Irmak,Çetin Tekindor,2343110,924558264.0,100.0
Family,Willy Wonka & the Chocolate Factory,1982,115,7.8,Steven Spielberg,Henry Thomas,372490,435110554.0,91.0
Fantasy,Nosferatu,1922,94,8.1,Robert Wiene,Werner Krauss,88794,445151978.0,
Film-Noir,The Third Man,1949,108,8.1,John Huston,Teresa Wright,158731,123353292.0,97.0


In [42]:
# find the top 3 genres by total earning
movies.groupby('Genre').sum()['Gross'].sort_values(ascending=False).head(3) 

Genre
Drama     3.540997e+10
Action    3.263226e+10
Comedy    1.566387e+10
Name: Gross, dtype: float64

In [43]:
# more acurate and fast
movies.groupby('Genre')['Gross'].sum().sort_values(ascending=False).head(3)

Genre
Drama     3.540997e+10
Action    3.263226e+10
Comedy    1.566387e+10
Name: Gross, dtype: float64

In [44]:
# find the genre with highest avg IMDB rating
movies.groupby('Genre')['IMDB_Rating'].mean(numeric_only=True).sort_values(ascending=False).head(1)

Genre
Western    8.35
Name: IMDB_Rating, dtype: float64

In [45]:
#find director with most popularity
movies.groupby('Director')['No_of_Votes'].sum().sort_values(ascending=False).head(1)

Director
Christopher Nolan    11578345
Name: No_of_Votes, dtype: int64

In [46]:
# find the highest rated movie of each genre
movies.groupby('Genre')['IMDB_Rating'].max()

Genre
Action       9.0
Adventure    8.6
Animation    8.6
Biography    8.9
Comedy       8.6
Crime        9.2
Drama        9.3
Family       7.8
Fantasy      8.1
Film-Noir    8.1
Horror       8.5
Mystery      8.4
Thriller     7.8
Western      8.8
Name: IMDB_Rating, dtype: float64

In [47]:
# find number of movies done by each actor
movies.groupby('Star1')['Series_Title'].count().sort_values(ascending=False)


Star1
Tom Hanks               12
Robert De Niro          11
Clint Eastwood          10
Al Pacino               10
Humphrey Bogart          9
                        ..
Zbigniew Zamachowski     1
Zooey Deschanel          1
Çetin Tekindor           1
Éric Toledano            1
Aaron Taylor-Johnson     1
Name: Series_Title, Length: 660, dtype: int64

### GroupBy Attributes and Methods

In [48]:
# find total number of groups -> len
len(movies.groupby('Genre'))

14

In [49]:
movies['Genre'].nunique()

14

In [50]:
# rows in group
movies.groupby('Genre').size()

Genre
Action       172
Adventure     72
Animation     82
Biography     88
Comedy       155
Crime        107
Drama        289
Family         2
Fantasy        2
Film-Noir      3
Horror        11
Mystery       12
Thriller       1
Western        4
dtype: int64

In [51]:
movies['Genre'].value_counts()

Genre
Drama        289
Action       172
Comedy       155
Crime        107
Biography     88
Animation     82
Adventure     72
Mystery       12
Horror        11
Western        4
Film-Noir      3
Fantasy        2
Family         2
Thriller       1
Name: count, dtype: int64

In [52]:
# first()/ last() -> nth item
genres = movies.groupby('Genre')
genres.first() # give first movies of every genre in data
genres.last()
genres.nth(5)

Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Director,Star1,No_of_Votes,Gross,Metascore
14,The Matrix,1999,136,Action,8.7,Lana Wachowski,Lilly Wachowski,1676426,171479930.0,73.0
24,Saving Private Ryan,1998,169,Drama,8.6,Steven Spielberg,Tom Hanks,1235804,216540909.0,91.0
25,The Green Mile,1999,189,Crime,8.6,Frank Darabont,Tom Hanks,1147794,136801374.0,61.0
54,Ayla: The Daughter of War,2017,125,Biography,8.4,Can Ulkay,Erdem Can,34112,679278040.0,
61,Coco,2017,105,Animation,8.4,Lee Unkrich,Adrian Molina,384171,209726015.0,81.0
78,Dr. Strangelove or: How I Learned to Stop Worr...,1964,95,Comedy,8.4,Stanley Kubrick,Peter Sellers,450474,275902.0,97.0
116,Lawrence of Arabia,1962,228,Adventure,8.3,David Lean,Peter O'Toole,268085,44824144.0,100.0
393,Twelve Monkeys,1995,129,Mystery,8.0,Terry Gilliam,Bruce Willis,578443,57141459.0,74.0
707,The Innocents,1961,100,Horror,7.8,Jack Clayton,Deborah Kerr,27007,2616000.0,88.0


In [53]:
# get_group -> vs  filtering
genres.get_group('Fantasy') # list all movies in particular group

Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Director,Star1,No_of_Votes,Gross,Metascore
321,Das Cabinet des Dr. Caligari,1920,76,Fantasy,8.1,Robert Wiene,Werner Krauss,57428,337574718.0,
568,Nosferatu,1922,94,Fantasy,7.9,F.W. Murnau,Max Schreck,88794,445151978.0,


In [54]:
movies[movies['Genre']== 'Fantasy']

Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Director,Star1,No_of_Votes,Gross,Metascore
321,Das Cabinet des Dr. Caligari,1920,76,Fantasy,8.1,Robert Wiene,Werner Krauss,57428,337574718.0,
568,Nosferatu,1922,94,Fantasy,7.9,F.W. Murnau,Max Schreck,88794,445151978.0,


In [55]:
genres.groups # give values of movies in group

{'Action': [2, 5, 8, 10, 13, 14, 16, 29, 30, 31, 39, 42, 44, 55, 57, 59, 60, 63, 68, 72, 106, 109, 129, 130, 134, 140, 142, 144, 152, 155, 160, 161, 166, 168, 171, 172, 177, 181, 194, 201, 202, 216, 217, 223, 224, 236, 241, 262, 275, 294, 308, 320, 325, 326, 331, 337, 339, 340, 343, 345, 348, 351, 353, 356, 357, 362, 368, 369, 375, 376, 390, 410, 431, 436, 473, 477, 479, 482, 488, 493, 496, 502, 507, 511, 532, 535, 540, 543, 564, 569, 570, 573, 577, 582, 583, 602, 605, 608, 615, 623, ...], 'Adventure': [21, 47, 93, 110, 114, 116, 118, 137, 178, 179, 191, 193, 209, 226, 231, 247, 267, 273, 281, 300, 301, 304, 306, 323, 329, 361, 366, 377, 402, 406, 415, 426, 458, 470, 497, 498, 506, 513, 514, 537, 549, 552, 553, 566, 576, 604, 609, 618, 638, 647, 675, 681, 686, 692, 711, 713, 739, 755, 781, 797, 798, 851, 873, 884, 912, 919, 947, 957, 964, 966, 984, 991], 'Animation': [23, 43, 46, 56, 58, 61, 66, 70, 101, 135, 146, 151, 158, 170, 197, 205, 211, 213, 219, 229, 230, 242, 245, 246, 270, 33

In [56]:
# descibe
genres.describe()

Unnamed: 0_level_0,Runtime,Runtime,Runtime,Runtime,Runtime,Runtime,Runtime,Runtime,IMDB_Rating,IMDB_Rating,...,Gross,Gross,Metascore,Metascore,Metascore,Metascore,Metascore,Metascore,Metascore,Metascore
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Genre,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Action,172.0,129.046512,28.500706,45.0,110.75,127.5,143.25,321.0,172.0,7.949419,...,267443700.0,936662225.0,143.0,73.41958,12.421252,33.0,65.0,74.0,82.0,98.0
Adventure,72.0,134.111111,33.31732,88.0,109.0,127.0,149.0,228.0,72.0,7.9375,...,199807000.0,874211619.0,64.0,78.4375,12.345393,41.0,69.75,80.5,87.25,100.0
Animation,82.0,99.585366,14.530471,71.0,90.0,99.5,106.75,137.0,82.0,7.930488,...,252061200.0,873839108.0,75.0,81.093333,8.813646,61.0,75.0,82.0,87.5,96.0
Biography,88.0,136.022727,25.514466,93.0,120.0,129.0,146.25,209.0,88.0,7.938636,...,98299240.0,753585104.0,79.0,76.240506,11.028187,48.0,70.5,76.0,84.5,97.0
Comedy,155.0,112.129032,22.946213,68.0,96.0,106.0,124.5,188.0,155.0,7.90129,...,81078090.0,886752933.0,125.0,78.72,11.82916,45.0,72.0,79.0,88.0,99.0
Crime,107.0,126.392523,27.689231,80.0,106.5,122.0,141.5,229.0,107.0,8.016822,...,71021630.0,790482117.0,87.0,77.08046,13.099102,47.0,69.5,77.0,87.0,100.0
Drama,289.0,124.737024,27.74049,64.0,105.0,121.0,137.0,242.0,289.0,7.957439,...,116446100.0,924558264.0,241.0,79.701245,12.744687,28.0,72.0,82.0,89.0,100.0
Family,2.0,107.5,10.606602,100.0,103.75,107.5,111.25,115.0,2.0,7.8,...,327332900.0,435110554.0,2.0,79.0,16.970563,67.0,73.0,79.0,85.0,91.0
Fantasy,2.0,85.0,12.727922,76.0,80.5,85.0,89.5,94.0,2.0,8.0,...,418257700.0,445151978.0,0.0,,,,,,,
Film-Noir,3.0,104.0,4.0,100.0,102.0,104.0,106.0,108.0,3.0,7.966667,...,62730680.0,123353292.0,3.0,95.666667,1.527525,94.0,95.0,96.0,96.5,97.0


In [57]:
# sample 
genres.sample(2,replace=True)

Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Director,Star1,No_of_Votes,Gross,Metascore
134,Baahubali 2: The Conclusion,2017,167,Action,8.2,S.S. Rajamouli,Prabhas,75348,20186659.0,
223,Mad Max: Fury Road,2015,120,Action,8.1,George Miller,Tom Hardy,882316,154058340.0,90.0
361,Blood Diamond,2006,143,Adventure,8.0,Edward Zwick,Leonardo DiCaprio,499439,57366262.0,64.0
114,2001: A Space Odyssey,1968,149,Adventure,8.3,Stanley Kubrick,Keir Dullea,603517,56954992.0,84.0
756,Gake no ue no Ponyo,2008,101,Animation,7.7,Hayao Miyazaki,Cate Blanchett,125317,15090400.0,86.0
796,Vampire Hunter D: Bloodlust,2000,103,Animation,7.7,Yoshiaki Kawajiri,Andrew Philpot,29210,151086.0,62.0
411,Gandhi,1982,191,Biography,8.0,Richard Attenborough,Ben Kingsley,217664,52767889.0,79.0
15,Goodfellas,1990,146,Biography,8.7,Martin Scorsese,Robert De Niro,1020727,46836394.0,90.0
547,Charade,1963,113,Comedy,7.9,Stanley Donen,Cary Grant,68689,13474588.0,83.0
563,His Girl Friday,1940,92,Comedy,7.9,Howard Hawks,Cary Grant,53667,296000.0,


In [58]:
# nunique
genres.nunique()

Unnamed: 0_level_0,Series_Title,Released_Year,Runtime,IMDB_Rating,Director,Star1,No_of_Votes,Gross,Metascore
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Action,172,61,78,15,123,121,172,172,50
Adventure,72,49,58,10,59,59,72,72,33
Animation,82,35,41,11,51,77,82,82,29
Biography,88,44,56,13,76,72,88,88,40
Comedy,155,72,70,11,113,133,155,155,44
Crime,106,56,65,14,86,85,107,107,39
Drama,289,83,95,14,211,250,288,287,52
Family,2,2,2,1,2,2,2,2,2
Fantasy,2,2,2,2,2,2,2,2,0
Film-Noir,3,3,3,3,3,3,3,3,3


In [59]:
# agg method (passing dict)
genres.sum() # not best method

genres.agg(
    {
        'Runtime':'mean',
        'IMDB_Rating':'mean',
        'No_of_Votes':'sum'
    }
)

Unnamed: 0_level_0,Runtime,IMDB_Rating,No_of_Votes
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Action,129.046512,7.949419,72282412
Adventure,134.111111,7.9375,22576163
Animation,99.585366,7.930488,21978630
Biography,136.022727,7.938636,24006844
Comedy,112.129032,7.90129,27620327
Crime,126.392523,8.016822,33533615
Drama,124.737024,7.957439,61367304
Family,107.5,7.8,551221
Fantasy,85.0,8.0,146222
Film-Noir,104.0,7.966667,367215


In [None]:
# passing list
genres.agg(['min','max','sum']) # q why mean is showing error here

Unnamed: 0_level_0,Series_Title,Series_Title,Series_Title,Released_Year,Released_Year,Released_Year,Runtime,Runtime,Runtime,IMDB_Rating,...,Star1,No_of_Votes,No_of_Votes,No_of_Votes,Gross,Gross,Gross,Metascore,Metascore,Metascore
Unnamed: 0_level_1,min,max,sum,min,max,sum,min,max,sum,min,...,sum,min,max,sum,min,max,sum,min,max,sum
Genre,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Action,300,Yôjinbô,The Dark KnightThe Lord of the Rings: The Retu...,1924,2019,2008200320102001200219991980197719621954200019...,45,321,22196,7.6,...,Christian BaleElijah WoodLeonardo DiCaprioElij...,25312,2303232,72282412,3296.0,936662225.0,32632260000.0,33.0,98.0,10499.0
Adventure,2001: A Space Odyssey,Zombieland,InterstellarBack to the FutureInglourious Bast...,1925,PG,2014198520091981196819621959201319751963194819...,88,228,9656,7.6,...,Matthew McConaugheyMichael J. FoxBrad PittJürg...,29999,1512360,22576163,61001.0,874211619.0,9496922000.0,41.0,100.0,5020.0
Animation,Akira,Ôkami kodomo no Ame to Yuki,Sen to Chihiro no kamikakushiThe Lion KingHota...,1940,2020,2001199419882016201820172008199719952019200920...,71,137,8166,7.6,...,Daveigh ChaseRob MinkoffTsutomu TatsumiRyûnosu...,25229,999790,21978630,128985.0,873839108.0,14631470000.0,61.0,96.0,6082.0
Biography,12 Years a Slave,Zerkalo,Schindler's ListGoodfellasHamiltonThe Intoucha...,1928,2020,1993199020202011200220171995198420182013201320...,93,209,11970,7.6,...,Liam NeesonRobert De NiroLin-Manuel MirandaÉri...,27254,1213505,24006844,21877.0,753585104.0,8276358000.0,48.0,97.0,6023.0
Comedy,(500) Days of Summer,Zindagi Na Milegi Dobara,GisaengchungLa vita è bellaModern TimesCity Li...,1921,2020,2019199719361931200919641940200120001973196019...,68,188,17380,7.6,...,Kang-ho SongRoberto BenigniCharles ChaplinChar...,26337,939631,27620327,1305.0,886752933.0,15663870000.0,45.0,99.0,9840.0
Crime,12 Angry Men,À bout de souffle,The GodfatherThe Godfather: Part II12 Angry Me...,1931,2019,1972197419571994200219991995199120192006199519...,80,229,13524,7.6,...,Marlon BrandoAl PacinoHenry FondaJohn Travolta...,27712,1826188,33533615,6013.0,790482117.0,8452632000.0,47.0,100.0,6706.0
Drama,1917,Zwartboek,The Shawshank RedemptionFight ClubForrest Gump...,1925,2020,1994199919941975202019981946201420061998198819...,64,242,36049,7.6,...,Tim RobbinsBrad PittTom HanksJack NicholsonSur...,25088,2343110,61367304,3600.0,924558264.0,35409970000.0,28.0,100.0,19208.0
Family,E.T. the Extra-Terrestrial,Willy Wonka & the Chocolate Factory,E.T. the Extra-TerrestrialWilly Wonka & the Ch...,1971,1982,19821971,100,115,215,7.8,...,Henry ThomasGene Wilder,178731,372490,551221,4000000.0,435110554.0,439110600.0,67.0,91.0,158.0
Fantasy,Das Cabinet des Dr. Caligari,Nosferatu,Das Cabinet des Dr. CaligariNosferatu,1920,1922,19201922,76,94,170,7.9,...,Werner KraussMax Schreck,57428,88794,146222,337574718.0,445151978.0,782726700.0,,,0.0
Film-Noir,Shadow of a Doubt,The Third Man,The Third ManThe Maltese FalconShadow of a Doubt,1941,1949,194919411943,100,108,312,7.8,...,Orson WellesHumphrey BogartTeresa Wright,59556,158731,367215,449191.0,123353292.0,125910500.0,94.0,97.0,287.0


In [64]:
# adding both syntax

genres.agg(
    {
        'Runtime':['mean','max'],
        'IMDB_Rating':'mean',
        'No_of_Votes':'sum'
    }
)

Unnamed: 0_level_0,Runtime,Runtime,IMDB_Rating,No_of_Votes
Unnamed: 0_level_1,mean,max,mean,sum
Genre,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Action,129.046512,321,7.949419,72282412
Adventure,134.111111,228,7.9375,22576163
Animation,99.585366,137,7.930488,21978630
Biography,136.022727,209,7.938636,24006844
Comedy,112.129032,188,7.90129,27620327
Crime,126.392523,229,8.016822,33533615
Drama,124.737024,242,7.957439,61367304
Family,107.5,115,7.8,551221
Fantasy,85.0,94,8.0,146222
Film-Noir,104.0,108,7.966667,367215
