In [11]:
import pandas as pd
import numpy as np

pd.set_option('display.float_format', '{:.2f}'.format)

## Groupby Using Split-Apply-Combine

### Step 1: Splitting the Data
### Step 2: Applying a Function
### Step 3: Combining the Results

In [2]:
df = pd.read_csv('imdb.csv')
df.head()

Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Director,Star1,No_of_Votes,Gross,Metascore
0,The Shawshank Redemption,1994,142,Drama,9.3,Frank Darabont,Tim Robbins,2343110,28341469.0,80.0
1,The Godfather,1972,175,Crime,9.2,Francis Ford Coppola,Marlon Brando,1620367,134966411.0,100.0
2,The Dark Knight,2008,152,Action,9.0,Christopher Nolan,Christian Bale,2303232,534858444.0,84.0
3,The Godfather: Part II,1974,202,Crime,9.0,Francis Ford Coppola,Al Pacino,1129952,57300000.0,90.0
4,12 Angry Men,1957,96,Crime,9.0,Sidney Lumet,Henry Fonda,689845,4360000.0,96.0


In [3]:
grouped_by_genre = df.groupby('Genre')
grouped_by_genre

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001D7E38502F0>

In [4]:
for genre, group in grouped_by_genre:
    print(f'Genre: {genre}')
    print(group[['Series_Title', 'IMDB_Rating']].head())
    print()

Genre: Action
                                         Series_Title  IMDB_Rating
2                                     The Dark Knight          9.0
5       The Lord of the Rings: The Return of the King          8.9
8                                           Inception          8.8
10  The Lord of the Rings: The Fellowship of the Ring          8.8
13              The Lord of the Rings: The Two Towers          8.7

Genre: Adventure
              Series_Title  IMDB_Rating
21            Interstellar          8.6
47      Back to the Future          8.5
93    Inglourious Basterds          8.3
110               Das Boot          8.3
114  2001: A Space Odyssey          8.3

Genre: Animation
                         Series_Title  IMDB_Rating
23      Sen to Chihiro no kamikakushi          8.6
43                      The Lion King          8.5
46                     Hotaru no haka          8.5
56                     Kimi no na wa.          8.4
58  Spider-Man: Into the Spider-Verse          8.4

G

In [5]:
grouped_by_director = df.groupby('Director')

type(grouped_by_director)

pandas.core.groupby.generic.DataFrameGroupBy

In [6]:
movie_count_by_director = grouped_by_director.size()
movie_count_by_director

Director
Aamir Khan             1
Aaron Sorkin           1
Abdellatif Kechiche    1
Abhishek Chaubey       1
Abhishek Kapoor        1
                      ..
Zack Snyder            2
Zaza Urushadze         1
Zoya Akhtar            2
Çagan Irmak            1
Ömer Faruk Sorak       1
Length: 548, dtype: int64

we employ the size() method on the resulting GroupBy object, which computes
the size (count) of each group. This translates into the number of movies directed by
each individual.

## Built-in Aggregation Methods

### Statistical Calculations

In [7]:
grouped_data = df.groupby('Genre')

average_rating_by_genre = grouped_data['IMDB_Rating'].mean()
average_rating_by_genre

Genre
Action       7.949419
Adventure    7.937500
Animation    7.930488
Biography    7.938636
Comedy       7.901290
Crime        8.016822
Drama        7.957439
Family       7.800000
Fantasy      8.000000
Film-Noir    7.966667
Horror       7.909091
Mystery      7.975000
Thriller     7.800000
Western      8.350000
Name: IMDB_Rating, dtype: float64

In [12]:
total_gross_by_genre = grouped_data['Gross'].sum()
total_gross_by_genre

Genre
Action      32632261314.00
Adventure    9496922464.00
Animation   14631473048.00
Biography    8276357606.00
Comedy      15663868165.00
Crime        8452631908.00
Drama       35409974041.00
Family        439110554.00
Fantasy       782726696.00
Film-Noir     125910543.00
Horror       1034649238.00
Mystery      1256417015.00
Thriller       17550741.00
Western        58221508.00
Name: Gross, dtype: float64

In [13]:
lowest_rating_by_genre = grouped_data['IMDB_Rating'].min()
lowest_rating_by_genre

Genre
Action      7.60
Adventure   7.60
Animation   7.60
Biography   7.60
Comedy      7.60
Crime       7.60
Drama       7.60
Family      7.80
Fantasy     7.90
Film-Noir   7.80
Horror      7.60
Mystery     7.60
Thriller    7.80
Western     7.80
Name: IMDB_Rating, dtype: float64

In [14]:
highest_votes_by_genre = grouped_data['No_of_Votes'].max()
highest_votes_by_genre

Genre
Action       2303232
Adventure    1512360
Animation     999790
Biography    1213505
Comedy        939631
Crime        1826188
Drama        2343110
Family        372490
Fantasy        88794
Film-Noir     158731
Horror        787806
Mystery      1129894
Thriller       27733
Western       688390
Name: No_of_Votes, dtype: int64

In [15]:
valid_ratings_count_by_genre = grouped_data['IMDB_Rating'].count()
valid_ratings_count_by_genre

Genre
Action       172
Adventure     72
Animation     82
Biography     88
Comedy       155
Crime        107
Drama        289
Family         2
Fantasy        2
Film-Noir      3
Horror        11
Mystery       12
Thriller       1
Western        4
Name: IMDB_Rating, dtype: int64

In [16]:
grouped_data = df.groupby('Released_Year')

In [17]:
median_rating_by_year = grouped_data['IMDB_Rating'].median()
median_rating_by_year

Released_Year
1920   8.10
1921   8.30
1922   7.90
1924   8.20
1925   8.10
       ... 
2017   7.80
2018   8.00
2019   7.90
2020   8.00
PG     7.60
Name: IMDB_Rating, Length: 100, dtype: float64

In [18]:
grouped_data = df.groupby('Genre')

In [19]:
std_deviation_by_genre = grouped_data['IMDB_Rating'].std()
std_deviation_by_genre

Genre
Action      0.30
Adventure   0.23
Animation   0.25
Biography   0.27
Comedy      0.23
Crime       0.34
Drama       0.27
Family      0.00
Fantasy     0.14
Film-Noir   0.15
Horror      0.31
Mystery     0.31
Thriller     NaN
Western     0.42
Name: IMDB_Rating, dtype: float64

In [21]:
type(grouped_data)

pandas.core.groupby.generic.DataFrameGroupBy

In [22]:
grouped_data.get_group('Thriller')

Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Director,Star1,No_of_Votes,Gross,Metascore
700,Wait Until Dark,1967,108,Thriller,7.8,Terence Young,Audrey Hepburn,27733,17550741.0,81.0


In [23]:
var_by_genre = grouped_data['IMDB_Rating'].var()
var_by_genre

Genre
Action      0.09
Adventure   0.05
Animation   0.06
Biography   0.07
Comedy      0.05
Crime       0.11
Drama       0.07
Family      0.00
Fantasy     0.02
Film-Noir   0.02
Horror      0.10
Mystery     0.10
Thriller     NaN
Western     0.18
Name: IMDB_Rating, dtype: float64

## Applying Different Functions to DataFrame Columns

In [25]:
result = df.agg({
    'IMDB_Rating': 'mean',
    'No_of_Votes': 'sum'
})

result

IMDB_Rating           7.95
No_of_Votes   273692911.00
dtype: float64

In [26]:
result = df.agg({'Gross': 'max', 'Metascore': 'min'})
result

Gross       936662225.00
Metascore          28.00
dtype: float64

## Grouping DataFrame with Index Levels and Columns

In [27]:
grouped = df.groupby(['Genre', 'Released_Year']).agg({'IMDB_Rating': 'mean', 'No_of_Votes': 'sum'})
grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,IMDB_Rating,No_of_Votes
Genre,Released_Year,Unnamed: 2_level_1,Unnamed: 3_level_1
Action,1924,8.20,41985
Action,1926,8.10,81156
Action,1932,7.80,25312
Action,1938,7.90,47175
Action,1948,7.80,65162
...,...,...,...
Thriller,1967,7.80,27733
Western,1965,8.30,232772
Western,1966,8.80,688390
Western,1968,8.50,302844


## Aggregation with User-defined Functions

In [31]:
df.head()

Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Director,Star1,No_of_Votes,Gross,Metascore
0,The Shawshank Redemption,1994,142,Drama,9.3,Frank Darabont,Tim Robbins,2343110,28341469.0,80.0
1,The Godfather,1972,175,Crime,9.2,Francis Ford Coppola,Marlon Brando,1620367,134966411.0,100.0
2,The Dark Knight,2008,152,Action,9.0,Christopher Nolan,Christian Bale,2303232,534858444.0,84.0
3,The Godfather: Part II,1974,202,Crime,9.0,Francis Ford Coppola,Al Pacino,1129952,57300000.0,90.0
4,12 Angry Men,1957,96,Crime,9.0,Sidney Lumet,Henry Fonda,689845,4360000.0,96.0


In [28]:
def custom_aggregation(group):
    high_rated = sum(group['IMDB_Rating'] > 8.0)
    low_rated = sum(group['IMDB_Rating'] <= 8.0)
    return pd.Series({'High_Rated': high_rated, 'Low_Rated': low_rated})

In [29]:
grouped_data = df.groupby('Genre').apply(custom_aggregation)
grouped_data

  grouped_data = df.groupby('Genre').apply(custom_aggregation)


Unnamed: 0_level_0,High_Rated,Low_Rated
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1
Action,52,120
Adventure,23,49
Animation,25,57
Biography,27,61
Comedy,39,116
Crime,42,65
Drama,101,188
Family,0,2
Fantasy,1,1
Film-Noir,1,2


In [32]:
grouped_data = df.groupby('Genre', include_groups=False).apply(custom_aggregation)
grouped_data

TypeError: DataFrame.groupby() got an unexpected keyword argument 'include_groups'

In [33]:
grouped_data = (
    df[['Genre', 'IMDB_Rating']]
        .groupby('Genre')
        .apply(custom_aggregation)
)

grouped_data

  .apply(custom_aggregation)


Unnamed: 0_level_0,High_Rated,Low_Rated
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1
Action,52,120
Adventure,23,49
Animation,25,57
Biography,27,61
Comedy,39,116
Crime,42,65
Drama,101,188
Family,0,2
Fantasy,1,1
Film-Noir,1,2


In [35]:
def custom_aggregation(series):
    high_rated = (series > 8.0).sum()
    low_rated = (series <= 8.0).sum()
    return pd.Series({'High_Rated': high_rated, 'Low_Rated': low_rated})

grouped_data = df.groupby('Genre')['IMDB_Rating'].apply(custom_aggregation)

grouped_data

Genre                
Action     High_Rated     52
           Low_Rated     120
Adventure  High_Rated     23
           Low_Rated      49
Animation  High_Rated     25
           Low_Rated      57
Biography  High_Rated     27
           Low_Rated      61
Comedy     High_Rated     39
           Low_Rated     116
Crime      High_Rated     42
           Low_Rated      65
Drama      High_Rated    101
           Low_Rated     188
Family     High_Rated      0
           Low_Rated       2
Fantasy    High_Rated      1
           Low_Rated       1
Film-Noir  High_Rated      1
           Low_Rated       2
Horror     High_Rated      3
           Low_Rated       8
Mystery    High_Rated      5
           Low_Rated       7
Thriller   High_Rated      0
           Low_Rated       1
Western    High_Rated      3
           Low_Rated       1
Name: IMDB_Rating, dtype: int64

In [36]:
pd.__version__

'2.3.1'

In [38]:
grouped_data = (
    df.groupby('Genre')
      .apply(custom_aggregation, include_groups=False)
)

grouped_data

TypeError: '>' not supported between instances of 'str' and 'float'

In [39]:
def custom_aggregation(group):
    high_rated = (group['IMDB_Rating'] > 8.0).sum()
    low_rated = (group['IMDB_Rating'] <= 8.0).sum()
    return pd.Series({'High_Rated': high_rated, 'Low_Rated': low_rated})

grouped_data = (
    df.groupby('Genre')
      .apply(custom_aggregation, include_groups=False)
)


In [40]:
grouped_data

Unnamed: 0_level_0,High_Rated,Low_Rated
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1
Action,52,120
Adventure,23,49
Animation,25,57
Biography,27,61
Comedy,39,116
Crime,42,65
Drama,101,188
Family,0,2
Fantasy,1,1
Film-Noir,1,2


In [41]:
def most_prolific_director(group):
    director_count = group.shape[0]  # Count of movies directed by the director
    return pd.Series({'Total_Movies': director_count})

In [42]:
# Grouping data by Director and applying the UDF
grouped_by_director = df.groupby('Director')
prolific_director_stats = grouped_by_director.apply(most_prolific_director)

  prolific_director_stats = grouped_by_director.apply(most_prolific_director)


In [43]:
# Grouping data by Director and applying the UDF
grouped_by_director = df.groupby('Director')
prolific_director_stats = grouped_by_director.apply(most_prolific_director, include_groups=False)

In [44]:
# Find the director with the most movies
most_prolific = prolific_director_stats['Total_Movies'].idxmax()
print(f"The most prolific director is '{most_prolific}' with {prolific_director_stats['Total_Movies'].max()} movies.")

The most prolific director is 'Alfred Hitchcock' with 14 movies.


## Iterating Through Groups

In [45]:
def group_by_decade(year):
    return f"{(int(year) // 10) * 10}s"

In [46]:
grouped_by_decade = df.groupby(df['Released_Year'].apply(group_by_decade))

ValueError: invalid literal for int() with base 10: 'PG'

In [50]:
df['Released_Year'].unique()

array(['1994', '1972', '2008', '1974', '1957', '2003', '1993', '2010',
       '1999', '2001', '1966', '2002', '1990', '1980', '1975', '2020',
       '2019', '2014', '1998', '1997', '1995', '1991', '1977', '1962',
       '1954', '1946', '2011', '2006', '2000', '1988', '1985', '1968',
       '1960', '1942', '1936', '1931', '2018', '2017', '2016', '2012',
       '2009', '2007', '1984', '1981', '1979', '1971', '1963', '1964',
       '1950', '1940', '2013', '2005', '2004', '1992', '1987', '1986',
       '1983', '1976', '1973', '1965', '1959', '1958', '1952', '1948',
       '1944', '1941', '1927', '1921', '2015', '1996', '1989', '1978',
       '1961', '1955', '1953', '1925', '1924', '1982', '1967', '1951',
       '1949', '1939', '1937', '1934', '1928', '1926', '1920', '1970',
       '1969', '1956', '1947', '1945', '1930', '1938', '1935', '1933',
       '1932', '1922', '1943', 'PG'], dtype=object)

In [51]:
delete_row_index = df[df['Released_Year'] == 'PG'].index
df.drop(delete_row_index, inplace=True)

In [52]:
df['Released_Year'].unique()

array(['1994', '1972', '2008', '1974', '1957', '2003', '1993', '2010',
       '1999', '2001', '1966', '2002', '1990', '1980', '1975', '2020',
       '2019', '2014', '1998', '1997', '1995', '1991', '1977', '1962',
       '1954', '1946', '2011', '2006', '2000', '1988', '1985', '1968',
       '1960', '1942', '1936', '1931', '2018', '2017', '2016', '2012',
       '2009', '2007', '1984', '1981', '1979', '1971', '1963', '1964',
       '1950', '1940', '2013', '2005', '2004', '1992', '1987', '1986',
       '1983', '1976', '1973', '1965', '1959', '1958', '1952', '1948',
       '1944', '1941', '1927', '1921', '2015', '1996', '1989', '1978',
       '1961', '1955', '1953', '1925', '1924', '1982', '1967', '1951',
       '1949', '1939', '1937', '1934', '1928', '1926', '1920', '1970',
       '1969', '1956', '1947', '1945', '1930', '1938', '1935', '1933',
       '1932', '1922', '1943'], dtype=object)

In [53]:
grouped_by_decade = df.groupby(df['Released_Year'].apply(group_by_decade))

In [54]:
# Calculate average rating for each decade
average_rating_by_decade = {}
for decade, group in grouped_by_decade:
    average_rating = group['IMDB_Rating'].mean()
    average_rating_by_decade[decade] = average_rating

for decade, avg_rating in average_rating_by_decade.items():
    print(f'Decade: {decade}, Average Rating: {avg_rating:.2f}')

Decade: 1920s, Average Rating: 8.13
Decade: 1930s, Average Rating: 7.97
Decade: 1940s, Average Rating: 8.03
Decade: 1950s, Average Rating: 8.06
Decade: 1960s, Average Rating: 7.97
Decade: 1970s, Average Rating: 7.97
Decade: 1980s, Average Rating: 7.95
Decade: 1990s, Average Rating: 7.96
Decade: 2000s, Average Rating: 7.90
Decade: 2010s, Average Rating: 7.92
Decade: 2020s, Average Rating: 8.13


In [55]:
grouped_by_genre = df.groupby('Genre')

total_votes_by_genre = {}
for genre, group in grouped_by_genre:
    total_votes = group['No_of_Votes'].sum()
    total_votes_by_genre[genre] = total_votes

for genre, total_votes in total_votes_by_genre.items():
    print(f'Genre: {genre}, Total Votes: {total_votes}')

Genre: Action, Total Votes: 72282412
Genre: Adventure, Total Votes: 22306966
Genre: Animation, Total Votes: 21978630
Genre: Biography, Total Votes: 24006844
Genre: Comedy, Total Votes: 27620327
Genre: Crime, Total Votes: 33533615
Genre: Drama, Total Votes: 61367304
Genre: Family, Total Votes: 551221
Genre: Fantasy, Total Votes: 146222
Genre: Film-Noir, Total Votes: 367215
Genre: Horror, Total Votes: 3742556
Genre: Mystery, Total Votes: 4203004
Genre: Thriller, Total Votes: 27733
Genre: Western, Total Votes: 1289665


## Discretization and Binning

Syntax

pd.cut(data_series, bins, labels=labels, right=False)

In [56]:
rating_bins = [0, 6.5, 7.5, 8.5, 10]
rating_labels = ['Low', 'Average', 'Good', 'Excellent']

In [57]:
# Discretize IMDB ratings
df['Rating_Category'] = pd.cut(df['IMDB_Rating'], bins=rating_bins, labels=rating_labels, right=False)

df[['Series_Title', 'IMDB_Rating', 'Rating_Category']]

Unnamed: 0,Series_Title,IMDB_Rating,Rating_Category
0,The Shawshank Redemption,9.30,Excellent
1,The Godfather,9.20,Excellent
2,The Dark Knight,9.00,Excellent
3,The Godfather: Part II,9.00,Excellent
4,12 Angry Men,9.00,Excellent
...,...,...,...
995,Breakfast at Tiffany's,7.60,Good
996,Giant,7.60,Good
997,From Here to Eternity,7.60,Good
998,Lifeboat,7.60,Good
