# Beginning Data Analysis 

In [1]:
import numpy as np
import pandas as pd
from pandas.errors import IntCastingNaNError

# Keep rendered frames compact: at most 8 columns, 10 rows and 40 characters per cell.
pd.set_option("display.max_columns", 8)
pd.set_option("display.max_rows", 10)
pd.set_option("display.max_colwidth", 40)

## Introduction

## Developing a data analysis routine

### How to do it...

In [2]:
# Load the college dataset and peek at one row; the fixed seed makes the
# sampled row reproducible.
college = pd.read_csv("../data/college.csv")
college.sample(random_state=42)

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,...,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3649,Career Point College,San Antonio,TX,0.0,...,0.9172,0.697,20700,14977


In [3]:
# (number of rows, number of columns) of the loaded frame
college.shape

(7535, 27)

In [4]:
# Dtype and non-null count per column; counts below the row total reveal
# which columns contain missing values.
college.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7535 entries, 0 to 7534
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   INSTNM              7535 non-null   object 
 1   CITY                7535 non-null   object 
 2   STABBR              7535 non-null   object 
 3   HBCU                7164 non-null   float64
 4   MENONLY             7164 non-null   float64
 5   WOMENONLY           7164 non-null   float64
 6   RELAFFIL            7535 non-null   int64  
 7   SATVRMID            1185 non-null   float64
 8   SATMTMID            1196 non-null   float64
 9   DISTANCEONLY        7164 non-null   float64
 10  UGDS                6874 non-null   float64
 11  UGDS_WHITE          6874 non-null   float64
 12  UGDS_BLACK          6874 non-null   float64
 13  UGDS_HISP           6874 non-null   float64
 14  UGDS_ASIAN          6874 non-null   float64
 15  UGDS_AIAN           6874 non-null   float64
 16  UGDS_N

In [5]:
# Summary statistics of the numeric columns, transposed so that each
# column of `college` becomes one row.
college.describe(include=[np.number]).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
HBCU,7164.0,0.014238,0.118478,0.0,0.0000,0.00000,0.000000,1.0
MENONLY,7164.0,0.009213,0.095546,0.0,0.0000,0.00000,0.000000,1.0
WOMENONLY,7164.0,0.005304,0.072642,0.0,0.0000,0.00000,0.000000,1.0
RELAFFIL,7535.0,0.190975,0.393096,0.0,0.0000,0.00000,0.000000,1.0
SATVRMID,1185.0,522.819409,68.578862,290.0,475.0000,510.00000,555.000000,765.0
...,...,...,...,...,...,...,...,...
PPTUG_EF,6853.0,0.226639,0.246470,0.0,0.0000,0.15040,0.376900,1.0
CURROPER,7535.0,0.923291,0.266146,0.0,1.0000,1.00000,1.000000,1.0
PCTPELL,6849.0,0.530643,0.225544,0.0,0.3578,0.52150,0.712900,1.0
PCTFLOAN,6849.0,0.522211,0.283616,0.0,0.3329,0.58330,0.745000,1.0


In [6]:
# Summary of the non-numeric columns (count, unique, most frequent value).
# `np.object_` is the explicit spelling of the 'O' shorthand, i.e. the same
# selection could be written with include=['O', pd.Categorical].
college.describe(include=[np.object_, pd.Categorical]).T

Unnamed: 0,count,unique,top,freq
INSTNM,7535,7535,Alabama A & M University,1
CITY,7535,2514,New York,87
STABBR,7535,59,CA,773
MD_EARN_WNE_P10,6413,598,PrivacySuppressed,822
GRAD_DEBT_MDN_SUPP,7503,2038,PrivacySuppressed,1510


### How it works...

### There's more...

In [7]:
# Same numeric summary, with extra percentiles to look at the tails of
# each distribution.
college.describe(
    include=[np.number],
    percentiles=[0.01, 0.05, 0.10, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99],
).T

Unnamed: 0,count,mean,std,min,...,90%,95%,99%,max
HBCU,7164.0,0.014238,0.118478,0.0,...,0.00000,0.00000,1.000000,1.0
MENONLY,7164.0,0.009213,0.095546,0.0,...,0.00000,0.00000,0.000000,1.0
WOMENONLY,7164.0,0.005304,0.072642,0.0,...,0.00000,0.00000,0.000000,1.0
RELAFFIL,7535.0,0.190975,0.393096,0.0,...,1.00000,1.00000,1.000000,1.0
SATVRMID,1185.0,522.819409,68.578862,290.0,...,605.00000,665.00000,730.000000,765.0
...,...,...,...,...,...,...,...,...,...
PPTUG_EF,6853.0,0.226639,0.246470,0.0,...,0.60410,0.71062,0.946724,1.0
CURROPER,7535.0,0.923291,0.266146,0.0,...,1.00000,1.00000,1.000000,1.0
PCTPELL,6849.0,0.530643,0.225544,0.0,...,0.83330,0.89636,0.993908,1.0
PCTFLOAN,6849.0,0.522211,0.283616,0.0,...,0.84752,0.89792,0.986368,1.0


## Data dictionaries

In [8]:
# The data dictionary maps each column name to a short description.
pd.read_csv("../data/college_data_dictionary.csv")

Unnamed: 0,column_name,description
0,INSTNM,Institution Name
1,CITY,City Location
2,STABBR,State Abbreviation
3,HBCU,Historically Black College or Univer...
4,MENONLY,0/1 Men Only
...,...,...
22,PCTPELL,Percent Students with Pell grant
23,PCTFLOAN,Percent Students with federal loan
24,UG25ABV,Percent Students Older than 25
25,MD_EARN_WNE_P10,Median Earnings 10 years after enrol...


## Reducing memory by changing data types

### How to do it...

In [9]:
# Re-read the file and keep a handful of columns with different dtypes
# (integer, float and object) to experiment on.
college = pd.read_csv("../data/college.csv")
different_cols = ["RELAFFIL", "SATMTMID", "CURROPER", "INSTNM", "STABBR"]
col2 = college.loc[:, different_cols]
col2.head()

Unnamed: 0,RELAFFIL,SATMTMID,CURROPER,INSTNM,STABBR
0,0,420.0,1,Alabama A & M University,AL
1,0,565.0,1,University of Alabama at Birmingham,AL
2,1,,1,Amridge University,AL
3,0,590.0,1,University of Alabama in Huntsville,AL
4,0,430.0,1,Alabama State University,AL


In [10]:
# Starting dtypes, before any conversion
col2.dtypes

RELAFFIL      int64
SATMTMID    float64
CURROPER      int64
INSTNM       object
STABBR       object
dtype: object

In [11]:
# Extra steps (optional): summarise every column before choosing smaller dtypes.

numeric_cols = col2.select_dtypes(include=[np.number]).columns

# Numeric columns: value range, current dtype and number of distinct values.
display(
    pd.concat(
        [
            col2.describe(include=[np.number]).T,
            col2[numeric_cols].dtypes.rename("dtypes"),
            col2[numeric_cols].nunique().rename("nunique"),
        ],
        axis="columns",
    )
    .filter(["count", "min", "max", "dtypes", "nunique"])
    .sort_values("dtypes")
)

# Remaining (object) columns: describe() summary next to the current dtype.
display(
    pd.concat(
        [
            col2.describe(include=[np.object_]).T,
            col2[col2.columns.difference(numeric_cols)].dtypes.rename("dtypes"),
        ],
        axis="columns",
    )
)

Unnamed: 0,count,min,max,dtypes,nunique
RELAFFIL,7535.0,0.0,1.0,int64,2
CURROPER,7535.0,0.0,1.0,int64,2
SATMTMID,1196.0,310.0,785.0,float64,167


Unnamed: 0,count,unique,top,freq,dtypes
INSTNM,7535,7535,Alabama A & M University,1,object
STABBR,7535,59,CA,773,object


In [12]:
# Baseline memory per column, in bytes. deep=True makes pandas inspect the
# object columns so the strings they hold are counted as well.
original_mem = col2.memory_usage(deep=True)
original_mem

Index          132
RELAFFIL     60280
SATMTMID     60280
CURROPER     60280
INSTNM      599848
STABBR      384285
dtype: int64

In [13]:
# RELAFFIL only takes the values 0 and 1, so an 8-bit integer is enough.
col2["RELAFFIL"] = col2["RELAFFIL"].astype(np.int8)

In [14]:
# Confirm that RELAFFIL is now int8 and the other columns are unchanged.
col2.dtypes

RELAFFIL       int8
SATMTMID    float64
CURROPER      int64
INSTNM       object
STABBR       object
dtype: object

In [15]:
# Memory per column after the int8 cast, to compare with the baseline.
col2.memory_usage(deep=True)

Index          132
RELAFFIL      7535
SATMTMID     60280
CURROPER     60280
INSTNM      599848
STABBR      384285
dtype: int64

In [16]:
# Object columns with few distinct values are the best candidates for the
# Categorical dtype, so count the unique values of each one.
col2.select_dtypes(include="object").nunique()

INSTNM    7535
STABBR      59
dtype: int64

In [17]:
# STABBR has only 59 distinct values across 7535 rows: store it as a categorical.
col2["STABBR"] = col2["STABBR"].astype("category")
col2.dtypes

RELAFFIL        int8
SATMTMID     float64
CURROPER       int64
INSTNM        object
STABBR      category
dtype: object

In [18]:
# Memory per column after both conversions
new_mem = col2.memory_usage(deep=True)
new_mem

Index          132
RELAFFIL      7535
SATMTMID     60280
CURROPER     60280
INSTNM      600307
STABBR       12648
dtype: int64

In [19]:
# Ratio of new to original memory per column; values below 1 are savings.
new_mem / original_mem

Index       1.000000
RELAFFIL    0.125000
SATMTMID    1.000000
CURROPER    1.000000
INSTNM      1.000765
STABBR      0.032913
dtype: float64

### How it works...

### There's more...

In [20]:
# NOTE: both assignments modify `college` in place, so every later cell that
# uses `college` sees the altered first row.
college.loc[0, "CURROPER"] = 10_000_000 # memory remains the same as int64 can hold this number
college.loc[0, "INSTNM"] = college.loc[0, "INSTNM"] + "a"  # memory increased by adding a single letter
college[["CURROPER", "INSTNM"]].memory_usage(deep=True)

Index          132
CURROPER     60280
INSTNM      600308
dtype: int64

In [21]:
# MENONLY ranges from 0 to 1 like a flag, yet it is stored as float64.
display(college["MENONLY"].describe())
college["MENONLY"].dtype

count    7164.000000
mean        0.009213
std         0.095546
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: MENONLY, dtype: float64

dtype('float64')

In [22]:
# MENONLY has missing values, so a direct cast to an integer dtype fails.
# Catch the error and print what can be done about it.
try:
    college["MENONLY"].astype(np.int8)
except IntCastingNaNError as err:
    print("IntCastingNaNError:", err)
    print(
        "The column likely contains `np.nan` values.",
        "Try imputing values for instance with `.fillna` first.",
        "Alternatively use another data type that supports missing value e.g. `'float16'` or `Int64` (pd.Int64Dtype) which uses `pd.NA` instead of `np.nan`",
        sep="\n",
    )

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer
The column likely contains `np.nan` values.
Try imputing values for instance with `.fillna` first.
Alternatively use another data type that supports missing value e.g. `'float16'` or `Int64` (pd.Int64Dtype) which uses `pd.NA` instead of `np.nan`


In [23]:
# Downcast both columns in a single call and return a new frame.
# MENONLY contains NaN, hence a small float rather than an integer dtype.
college.astype({"MENONLY": "float16", "RELAFFIL": "int8"})

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,...,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
0,Alabama A & M Universitya,Normal,AL,1.0,...,0.8284,0.1049,30300,33888
1,University of Alabama at Birmingham,Birmingham,AL,0.0,...,0.5214,0.2422,39700,21941.5
2,Amridge University,Montgomery,AL,0.0,...,0.7795,0.8540,40100,23370
3,University of Alabama in Huntsville,Huntsville,AL,0.0,...,0.4596,0.2640,45500,24097
4,Alabama State University,Montgomery,AL,1.0,...,0.7554,0.1270,26600,33118.5
...,...,...,...,...,...,...,...,...,...
7530,SAE Institute of Technology San Fra...,Emeryville,CA,,...,,,,9500
7531,Rasmussen College - Overland Park,Overland Park,KS,,...,,,,21163
7532,National Personal Training Institute...,Highland Heights,OH,,...,,,,6333
7533,Bay Area Medical Academy - San Jose ...,San Jose,CA,,...,,,,PrivacySuppressed


In [24]:
# Compare the memory footprint of the default RangeIndex with the same labels
# stored as a materialised integer index. The class that can hold them depends
# on the installed pandas version, so fall back step by step whenever the
# attribute is missing.
display(type(college.index), college.index)

try:
    college.index = pd.Int64Index(college.index)
    # Print explicitly: a bare expression inside `try` is never displayed.
    print(college.index.memory_usage())  # the RangeIndex needed only 132 bytes
except AttributeError as e:
    try:
        print("AttributeError", e)
        print("Deprecated since version 1.4.0: In pandas v2.0 Int64Index will be removed and NumericIndex used instead.")
        college.index = pd.NumericIndex(college.index)
        print(college.index.memory_usage())  # the RangeIndex needed only 132 bytes
    except AttributeError as e:
        print("\nAttributeError", e)
        # Assigning a plain list materialises the labels;
        # .astype('int64') would keep the index as a RangeIndex.
        college.index = list(college.index)
        display(type(college.index), college.index)
        print(college.index.memory_usage())  # the RangeIndex needed only 132 bytes


pandas.core.indexes.range.RangeIndex

RangeIndex(start=0, stop=7535, step=1)

AttributeError module 'pandas' has no attribute 'Int64Index'
Deprecated since version 1.4.0: In pandas v2.0 Int64Index will be removed and NumericIndex used instead.

AttributeError module 'pandas' has no attribute 'NumericIndex'


pandas.core.indexes.base.Index

Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
       ...
       7525, 7526, 7527, 7528, 7529, 7530, 7531, 7532, 7533, 7534],
      dtype='int64', length=7535)

60280


## Selecting the smallest of the largest

### How to do it...

In [25]:
# Load the movie dataset and keep only the three columns this recipe needs.
movie = pd.read_csv("../data/movie.csv")
movie2 = movie[["movie_title", "imdb_score", "budget"]]
movie2.head()

Unnamed: 0,movie_title,imdb_score,budget
0,Avatar,7.9,237000000.0
1,Pirates of the Caribbean: At World's...,7.1,300000000.0
2,Spectre,6.8,245000000.0
3,The Dark Knight Rises,8.5,250000000.0
4,Star Wars: Episode VII - The Force A...,7.1,


In [26]:
# The 100 highest-rated movies (only the first five rows are shown)
movie2.nlargest(100, "imdb_score").head()

Unnamed: 0,movie_title,imdb_score,budget
2725,Towering Inferno,9.5,
1920,The Shawshank Redemption,9.3,25000000.0
3402,The Godfather,9.2,6000000.0
2779,Dekalog,9.1,
4312,Kickboxer: Vengeance,9.1,17000000.0


In [27]:
# Among the 100 highest-rated movies, the five with the smallest budget
(
    movie2.nlargest(100, "imdb_score")
    .nsmallest(5, "budget")
)

Unnamed: 0,movie_title,imdb_score,budget
4804,Butterfly Girl,8.7,180000.0
4801,Children of Heaven,8.5,180000.0
4706,12 Angry Men,8.9,350000.0
4550,A Separation,8.4,500000.0
4636,The Other Dream Team,8.4,500000.0


### How it works...

### There's more...

In [28]:
# `nlargest` and `nsmallest` also accept a list of column names, which is
# useful to break ties at the nth spot.

## Selecting the largest of each group by sorting

### How to do it...

In [29]:
# Reload the movies and look at the three columns used in this recipe.
movie = pd.read_csv("../data/movie.csv")
movie[["movie_title", "title_year", "imdb_score"]]

Unnamed: 0,movie_title,title_year,imdb_score
0,Avatar,2009.0,7.9
1,Pirates of the Caribbean: At World's...,2007.0,7.1
2,Spectre,2015.0,6.8
3,The Dark Knight Rises,2012.0,8.5
4,Star Wars: Episode VII - The Force A...,,7.1
...,...,...,...
4911,Signed Sealed Delivered,2013.0,7.7
4912,The Following,,7.5
4913,A Plague So Pleasant,2013.0,6.3
4914,Shanghai Calling,2012.0,6.3


In [30]:
# Most recent release years first
(
    movie.loc[:, ["movie_title", "title_year", "imdb_score"]]
    .sort_values(by="title_year", ascending=False)
)

Unnamed: 0,movie_title,title_year,imdb_score
3884,The Veil,2016.0,4.7
2375,My Big Fat Greek Wedding 2,2016.0,6.1
2794,Miracles from Heaven,2016.0,6.8
92,Independence Day: Resurgence,2016.0,5.5
153,Kung Fu Panda 3,2016.0,7.2
...,...,...,...
4683,Heroes,,7.7
4688,Home Movies,,8.2
4704,Revolution,,6.7
4752,Happy Valley,,8.5


In [31]:
# Order by year, then by score within each year (both descending).
(
    movie.loc[:, ["movie_title", "title_year", "imdb_score"]]
    .sort_values(by=["title_year", "imdb_score"], ascending=[False, False])
)

Unnamed: 0,movie_title,title_year,imdb_score
4312,Kickboxer: Vengeance,2016.0,9.1
4277,A Beginner's Guide to Snuff,2016.0,8.7
3798,Airlift,2016.0,8.5
27,Captain America: Civil War,2016.0,8.2
98,Godzilla Resurgence,2016.0,8.2
...,...,...,...
1391,Rush Hour,,5.8
4031,Creature,,5.0
2165,Meet the Browns,,3.5
3246,The Bold and the Beautiful,,3.5


In [32]:
# After the sort, the first row met for each year is its best-rated movie,
# so keeping only the first occurrence of every year leaves one movie per year.
(
    movie.loc[:, ["movie_title", "title_year", "imdb_score"]]
    .sort_values(by=["title_year", "imdb_score"], ascending=[False, False])
    .drop_duplicates(subset=["title_year"], keep="first")
)

Unnamed: 0,movie_title,title_year,imdb_score
4312,Kickboxer: Vengeance,2016.0,9.1
3745,Running Forever,2015.0,8.6
4369,Queen of the Mountains,2014.0,8.7
3935,"Batman: The Dark Knight Returns, Part 2",2013.0,8.4
3,The Dark Knight Rises,2012.0,8.5
...,...,...,...
2694,Metropolis,1927.0,8.3
4767,The Big Parade,1925.0,8.3
4833,Over the Hill to the Poorhouse,1920.0,4.8
4695,Intolerance: Love's Struggle Through...,1916.0,8.0


### How it works...

### There's more...

- Select columns
- Group
- Aggregate
- Sort years

=> This approach emits a `FutureWarning` and returns a frame with a MultiIndex (both can simply be ignored for now).

In [33]:
# Per year: sort the group's rows by score and keep the best one, then order
# the years from newest to oldest. `apply` runs on the grouping column too,
# which is what emits the FutureWarning shown in the output.
(
    movie[['movie_title', 'title_year', 'imdb_score']]
        .groupby('title_year', as_index=False)
        .apply(lambda df: df.sort_values('imdb_score', ascending=False).head(1))
        .sort_values('title_year', ascending=False)
)

  .apply(lambda df: df.sort_values('imdb_score', ascending=False).head(1))


Unnamed: 0,Unnamed: 1,movie_title,title_year,imdb_score
90,4312,Kickboxer: Vengeance,2016.0,9.1
89,3745,Running Forever,2015.0,8.6
88,4369,Queen of the Mountains,2014.0,8.7
87,3935,"Batman: The Dark Knight Returns, Part 2",2013.0,8.4
86,3,The Dark Knight Rises,2012.0,8.5
...,...,...,...,...
4,4555,Pandora's Box,1929.0,8.0
3,2694,Metropolis,1927.0,8.3
2,4767,The Big Parade,1925.0,8.3
1,4833,Over the Hill to the Poorhouse,1920.0,4.8


Let's try to use the default `as_index=True`. But it creates an issue when we want to sort at a later stage. We will need to use `sort_index` and not `sort_values`

In [34]:
# Same computation with the default as_index=True: `title_year` becomes an
# index level in addition to remaining a column.
(
    movie[['movie_title', 'title_year', 'imdb_score']]
            .groupby('title_year')
            .apply(lambda df: df.sort_values('imdb_score', ascending=False).head(1))
            # sorting by the 'title_year' column is left out here; the next cell shows why it fails
            #.sort_values('title_year', ascending=False)
)

  .apply(lambda df: df.sort_values('imdb_score', ascending=False).head(1))


Unnamed: 0_level_0,Unnamed: 1_level_0,movie_title,title_year,imdb_score
title_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1916.0,4695,Intolerance: Love's Struggle Through...,1916.0,8.0
1920.0,4833,Over the Hill to the Poorhouse,1920.0,4.8
1925.0,4767,The Big Parade,1925.0,8.3
1927.0,2694,Metropolis,1927.0,8.3
1929.0,4555,Pandora's Box,1929.0,8.0
...,...,...,...,...
2012.0,3,The Dark Knight Rises,2012.0,8.5
2013.0,3935,"Batman: The Dark Knight Returns, Part 2",2013.0,8.4
2014.0,4369,Queen of the Mountains,2014.0,8.7
2015.0,3745,Running Forever,2015.0,8.6


In [35]:
# With as_index=True, 'title_year' is both an index level and a column, so
# sort_values cannot tell which one is meant. Catch and show the error.
try:
    display(
        movie[['movie_title', 'title_year', 'imdb_score']]
            .groupby('title_year')
            .apply(lambda df: df.sort_values('imdb_score', ascending=False).head(1))
            .sort_values('title_year', ascending=False)
    )
except ValueError as e :
    print("ValueError:", e)

ValueError: 'title_year' is both an index level and a column label, which is ambiguous.


  .apply(lambda df: df.sort_values('imdb_score', ascending=False).head(1))


In [36]:
# Sorting on the index level instead of the column avoids the ambiguity.
(
    movie[['movie_title', 'title_year', 'imdb_score']]
        .groupby('title_year')
        .apply(lambda df: df.sort_values('imdb_score', ascending=False).head(1))
        .sort_index(level='title_year', ascending=False)
)

  .apply(lambda df: df.sort_values('imdb_score', ascending=False).head(1))


Unnamed: 0_level_0,Unnamed: 1_level_0,movie_title,title_year,imdb_score
title_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016.0,4312,Kickboxer: Vengeance,2016.0,9.1
2015.0,3745,Running Forever,2015.0,8.6
2014.0,4369,Queen of the Mountains,2014.0,8.7
2013.0,3935,"Batman: The Dark Knight Returns, Part 2",2013.0,8.4
2012.0,3,The Dark Knight Rises,2012.0,8.5
...,...,...,...,...
1929.0,4555,Pandora's Box,1929.0,8.0
1927.0,2694,Metropolis,1927.0,8.3
1925.0,4767,The Big Parade,1925.0,8.3
1920.0,4833,Over the Hill to the Poorhouse,1920.0,4.8


Going back to the first code attempt, **let's try to get rid of the FutureWarning**

```python

    /tmp/ipykernel_1296/1709428287.py:4: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns.
    This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation.
    Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby
    to silence this warning.
    .apply(lambda df: df.sort_values("imdb_score", ascending=False).head(1))
```

See https://stackoverflow.com/questions/77969964/deprecation-warning-with-groupby-apply/78030503#78030503.

Issues appear if the grouping column is numerical and we apply math operations on it such as `np.mean`. Not the case here though, but for future reference we can:

1. explicitly select "title_year" after grouping in "[cols_selected]"
2. set the column as index

In [37]:
# Naive attempt: silence the warning with include_groups=False.
# The grouping column is then excluded from what `apply` receives, so the
# final sort by 'title_year' cannot find it; the KeyError is caught and explained.
try :
    display(
        movie[['movie_title', 'title_year', 'imdb_score']]
            .groupby('title_year', as_index=False)
            .apply(lambda df: df.sort_values('imdb_score', ascending=False).head(1),
                   include_groups=False)   # <== include_groups=False
            .sort_values('title_year', ascending=False)
    )
except KeyError as e :
    print("KeyError", e)
    print("Can't find the column for sorting as it's been used for grouping    --> .sort_values('title_year', ascending=False)")

KeyError 'title_year'
Can't find the column for sorting as it's been used for grouping    --> .sort_values('title_year', ascending=False)


In [38]:
#   1 - explicitly select "title_year" after grouping to keep it as a column,
#       so the later sort_values('title_year') still works
(
    movie[['movie_title', 'title_year', 'imdb_score']]
        .groupby('title_year', as_index=False)[['movie_title', 'title_year', 'imdb_score']]   # <== explicit selection
        .apply(lambda df: df.sort_values('imdb_score',ascending=False)
                            .head(1))
        .sort_values('title_year', ascending=False)
)

Unnamed: 0,Unnamed: 1,movie_title,title_year,imdb_score
90,4312,Kickboxer: Vengeance,2016.0,9.1
89,3745,Running Forever,2015.0,8.6
88,4369,Queen of the Mountains,2014.0,8.7
87,3935,"Batman: The Dark Knight Returns, Part 2",2013.0,8.4
86,3,The Dark Knight Rises,2012.0,8.5
...,...,...,...,...
4,4555,Pandora's Box,1929.0,8.0
3,2694,Metropolis,1927.0,8.3
2,4767,The Big Parade,1925.0,8.3
1,4833,Over the Hill to the Poorhouse,1920.0,4.8


In [39]:
#   2 - set the column as index, then sort the index instead of the column
(
    movie[['movie_title', 'title_year', 'imdb_score']]
            .set_index("title_year")                                  # <== set as index; the result has a two-level index (None, title_year): (90, 2016.0), (89, 2015.0), ...
            .groupby(level='title_year', as_index=False)
            .apply(lambda df: df.sort_values('imdb_score', ascending=False)
                                .head(1))
            .sort_index(level='title_year', ascending=False)
            # the integer level is kept here on purpose; the next cell drops it
            #.droplevel(0, axis='index')                             # <== drop the integer index level
)

Unnamed: 0_level_0,Unnamed: 1_level_0,movie_title,imdb_score
Unnamed: 0_level_1,title_year,Unnamed: 2_level_1,Unnamed: 3_level_1
90,2016.0,Kickboxer: Vengeance,9.1
89,2015.0,Running Forever,8.6
88,2014.0,Queen of the Mountains,8.7
87,2013.0,"Batman: The Dark Knight Returns, Part 2",8.4
86,2012.0,The Dark Knight Rises,8.5
...,...,...,...
4,1929.0,Pandora's Box,8.0
3,1927.0,Metropolis,8.3
2,1925.0,The Big Parade,8.3
1,1920.0,Over the Hill to the Poorhouse,4.8


In [40]:
# Same as the previous cell, with the integer index level dropped so that
# only `title_year` remains in the index.
(
    movie[['movie_title', 'title_year', 'imdb_score']]
            .set_index("title_year")                                  # <== set as index; the result has a two-level index (None, title_year): (90, 2016.0), (89, 2015.0), ...
            .groupby(level='title_year', as_index=False)
            .apply(lambda df: df.sort_values('imdb_score', ascending=False)
                                .head(1))
            .sort_index(level='title_year', ascending=False)
            .droplevel(0, axis='index')                             # <== drop the integer index level
)

Unnamed: 0_level_0,movie_title,imdb_score
title_year,Unnamed: 1_level_1,Unnamed: 2_level_1
2016.0,Kickboxer: Vengeance,9.1
2015.0,Running Forever,8.6
2014.0,Queen of the Mountains,8.7
2013.0,"Batman: The Dark Knight Returns, Part 2",8.4
2012.0,The Dark Knight Rises,8.5
...,...,...
1929.0,Pandora's Box,8.0
1927.0,Metropolis,8.3
1925.0,The Big Parade,8.3
1920.0,Over the Hill to the Poorhouse,4.8


In [41]:
# simpler attempt: set the index and use the default as_index=True in groupby
# (the result ends up with two index levels that are both named title_year)
(
    movie[['movie_title', 'title_year', 'imdb_score']]
        .set_index("title_year")
        .groupby(level='title_year')
        .apply(lambda df: df.sort_values('imdb_score', ascending=False)
                    .head(1))
)


Unnamed: 0_level_0,Unnamed: 1_level_0,movie_title,imdb_score
title_year,title_year,Unnamed: 2_level_1,Unnamed: 3_level_1
1916.0,1916.0,Intolerance: Love's Struggle Through...,8.0
1920.0,1920.0,Over the Hill to the Poorhouse,4.8
1925.0,1925.0,The Big Parade,8.3
1927.0,1927.0,Metropolis,8.3
1929.0,1929.0,Pandora's Box,8.0
...,...,...,...
2012.0,2012.0,The Dark Knight Rises,8.5
2013.0,2013.0,"Batman: The Dark Knight Returns, Part 2",8.4
2014.0,2014.0,Queen of the Mountains,8.7
2015.0,2015.0,Running Forever,8.6


The double title_year in the index occurs because of how groupby(level="title_year") works with apply().
Here's what's happening:

1. You set title_year as the index
2. You group by that index level (named "title_year")
3. When apply() returns each group's result, pandas preserves the group key as an index level
4. But your lambda function returns a DataFrame that still has title_year as its index (from the original set_index())
5. So you end up with: title_year (from groupby) + title_year (from the original DataFrame) = two index levels both named title_year

Solution: Reset the index inside your lambda or use a different approach:


In [42]:
# Option A : reset index inside lambda
# Each group's result drops its own title_year index, so the group key added
# by groupby is the only title_year level left.
(
        movie[["movie_title", "title_year", "imdb_score"]].set_index("title_year")
        .groupby(level="title_year")[["movie_title", "imdb_score"]]
        .apply(lambda df: df.sort_values("imdb_score", ascending=False)
                    .head(1)
                    .reset_index(drop=True))
        .sort_index(level="title_year", ascending=False)
        .droplevel(1, axis='index')                             # <== drop the integer index level
)

Unnamed: 0_level_0,movie_title,imdb_score
title_year,Unnamed: 1_level_1,Unnamed: 2_level_1
2016.0,Kickboxer: Vengeance,9.1
2015.0,Running Forever,8.6
2014.0,Queen of the Mountains,8.7
2013.0,"Batman: The Dark Knight Returns, Part 2",8.4
2012.0,The Dark Knight Rises,8.5
...,...,...
1929.0,Pandora's Box,8.0
1927.0,Metropolis,8.3
1925.0,The Big Parade,8.3
1920.0,Over the Hill to the Poorhouse,4.8


In [43]:
# Option B : Cleaner approach - don't set_index first
# Group on the column and explicitly select the value columns that `apply`
# should receive; nlargest(1, ...) picks the best-rated row of each year.
(
    movie.groupby("title_year")[["movie_title", "imdb_score"]]
        .apply(lambda df: df.nlargest(1, "imdb_score"))
        .sort_index(level="title_year", ascending=False)
        .droplevel(1, axis='index')                             # <== drop the integer index level

)

Unnamed: 0_level_0,movie_title,imdb_score
title_year,Unnamed: 1_level_1,Unnamed: 2_level_1
2016.0,Kickboxer: Vengeance,9.1
2015.0,Running Forever,8.6
2014.0,Queen of the Mountains,8.7
2013.0,"Batman: The Dark Knight Returns, Part 2",8.4
2012.0,The Dark Knight Rises,8.5
...,...,...
1929.0,Pandora's Box,8.0
1927.0,Metropolis,8.3
1925.0,The Big Parade,8.3
1920.0,Over the Hill to the Poorhouse,4.8


In [44]:
# Option C : Even simpler with idxmax
# NOTE: despite its name, `mask` holds row labels (the label of the
# best-rated movie of each year), not booleans.
mask = movie.groupby("title_year")["imdb_score"].idxmax()

# Select rows and columns in a single .loc call instead of chaining indexers.
(
    movie.loc[mask, ["title_year", "movie_title", "imdb_score"]]
        .sort_values("title_year", ascending=False)
        .reset_index(drop=True)
)

Unnamed: 0,title_year,movie_title,imdb_score
0,2016.0,Kickboxer: Vengeance,9.1
1,2015.0,Running Forever,8.6
2,2014.0,Queen of the Mountains,8.7
3,2013.0,"Batman: The Dark Knight Returns, Part 2",8.4
4,2012.0,The Dark Knight Rises,8.5
...,...,...,...
86,1929.0,Pandora's Box,8.0
87,1927.0,Metropolis,8.3
88,1925.0,The Big Parade,8.3
89,1920.0,Over the Hill to the Poorhouse,4.8


In [45]:
# extend the sorting to more columns, each with their own `ascending` value
# Budget is sorted ascending, so the row kept for each (year, content rating)
# pair is the one with the smallest budget.
(
    movie[['movie_title', 'title_year', 'content_rating', 'budget']]
   .sort_values(['title_year', 'content_rating', 'budget'],
                ascending=[False, False, True])
   .drop_duplicates(subset=['title_year', 'content_rating'])
)

Unnamed: 0,movie_title,title_year,content_rating,budget
4026,Compadres,2016.0,R,3000000.0
4658,Fight to the Finish,2016.0,PG-13,150000.0
4661,Rodeo Girl,2016.0,PG,500000.0
3252,The Wailing,2016.0,Not Rated,
4659,Alleluia! The Devil's Carnival,2016.0,,500000.0
...,...,...,...,...
2558,Lilyhammer,,TV-MA,34000000.0
807,"Sabrina, the Teenage Witch",,TV-G,3000000.0
848,Stargate SG-1,,TV-14,1400000.0
2436,Carlos,,Not Rated,


## Replicating nlargest with sort_values

### How to do it...

In [46]:
# Reference result to replicate: the five cheapest of the 100 best-rated
# movies, obtained with nlargest / nsmallest.
movie = pd.read_csv("../data/movie.csv")
(
    movie[["movie_title", "imdb_score", "budget"]]
    .nlargest(100, "imdb_score")
    .nsmallest(5, "budget")
)

Unnamed: 0,movie_title,imdb_score,budget
4804,Butterfly Girl,8.7,180000.0
4801,Children of Heaven,8.5,180000.0
4706,12 Angry Men,8.9,350000.0
4550,A Separation,8.4,500000.0
4636,The Other Dream Team,8.4,500000.0


In [47]:
# First half of the sort_values equivalent: the top 100 by score.
(
    movie[["movie_title", "imdb_score", "budget"]]
    .sort_values("imdb_score", ascending=False)
    .head(100)
)

Unnamed: 0,movie_title,imdb_score,budget
2725,Towering Inferno,9.5,
1920,The Shawshank Redemption,9.3,25000000.0
3402,The Godfather,9.2,6000000.0
2779,Dekalog,9.1,
4312,Kickboxer: Vengeance,9.1,17000000.0
...,...,...,...
3799,Anne of Green Gables,8.4,
3777,Requiem for a Dream,8.4,4500000.0
3935,"Batman: The Dark Knight Returns, Part 2",8.4,3500000.0
4636,The Other Dream Team,8.4,500000.0


In [48]:
# not the same result as chaining `nlargest` and `nsmallest` !!
# => several movies are tied at 8.4 around the 100th position, so the
#    tie-break rule decides which of them make the cut and can change the final result
(
    movie[["movie_title", "imdb_score", "budget"]]
        .sort_values("imdb_score", ascending=False)
        .head(100)
        .sort_values("budget")
        .head(5)
)

Unnamed: 0,movie_title,imdb_score,budget
4815,A Charlie Brown Christmas,8.4,150000.0
4801,Children of Heaven,8.5,180000.0
4804,Butterfly Girl,8.7,180000.0
4706,12 Angry Men,8.9,350000.0
4636,The Other Dream Team,8.4,500000.0


### How it works...

In [49]:
# Last five rows of the nlargest top 100: all of them are tied at 8.4.
(
    movie[["movie_title", "imdb_score", "budget"]]
        .nlargest(100, "imdb_score")
        .tail()
)

Unnamed: 0,movie_title,imdb_score,budget
4023,Oldboy,8.4,3000000.0
4163,To Kill a Mockingbird,8.4,2000000.0
4395,Reservoir Dogs,8.4,1200000.0
4550,A Separation,8.4,500000.0
4636,The Other Dream Team,8.4,500000.0


In [50]:
# Same check for sort_values + head(100): also tied at 8.4, but a different
# set of movies closes the list.
(
    movie[["movie_title", "imdb_score", "budget"]]
        .sort_values("imdb_score", ascending=False)
        .head(100)
        .tail()
)

Unnamed: 0,movie_title,imdb_score,budget
3799,Anne of Green Gables,8.4,
3777,Requiem for a Dream,8.4,4500000.0
3935,"Batman: The Dark Knight Returns, Part 2",8.4,3500000.0
4636,The Other Dream Team,8.4,500000.0
2455,Aliens,8.4,18500000.0


## Calculating a trailing stop order price

### How to do it...

In [51]:
# Load the cached TSLA price history; download it only when the CSV is missing.
try:
    tsla = pd.read_csv(
        "../data/yfinance_TSLA_20170101.csv",
        header=[0, 1],  # Two header rows for MultiIndex columns
        index_col=0,  # First column is the index (Date)
        parse_dates=True,  # Parse the index as dates
    )
except FileNotFoundError as e:
    print("FileNotFoundError: ", e)

    tsla = None  # stays None until one of the download attempts succeeds
    try:
        # Optional dependencies are imported here, not at the top of the
        # notebook, so that a missing package only triggers the fallback below.
        import datetime
        import pandas_datareader.data as web
        import requests_cache
        from pandas_datareader._utils import RemoteDataError

        session = requests_cache.CachedSession(
            cache_name="cache", backend="sqlite", expire_after=datetime.timedelta(days=90)
        )

        # NOTE(review): this result is not written to the CSV cache and may not
        # have the two-level columns of the yfinance/CSV path - confirm.
        tsla = web.DataReader("tsla", data_source="yahoo", start="2017-1-1", session=session)
    except ImportError as e:
        # Must precede the RemoteDataError clause: when an import fails, that
        # name is still unbound and evaluating its clause would raise NameError.
        print("ImportError:", e)
    except RemoteDataError as e:
        print("RemoteDataError:", e)

    if tsla is None:
        print("Trying with `yfinance`")

        import yfinance as yf
        tsla = yf.download("TSLA", start="2017-01-01")
        # Cache the download so the next run takes the read_csv path.
        tsla.to_csv("../data/yfinance_TSLA_20170101.csv")

tsla


Price,Close,High,Low,Open,Volume
Ticker,TSLA,TSLA,TSLA,TSLA,TSLA
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2017-01-03,14.466000,14.688667,14.064000,14.324000,88849500
2017-01-04,15.132667,15.200000,14.287333,14.316667,168202500
2017-01-05,15.116667,15.165333,14.796667,15.094667,88675500
2017-01-06,15.267333,15.354000,15.030000,15.128667,82918500
2017-01-09,15.418667,15.461333,15.200000,15.264667,59692500
...,...,...,...,...,...
2025-11-25,419.399994,420.480011,405.950012,414.420013,71915600
2025-11-26,426.579987,426.940002,416.890015,423.950012,63463000
2025-11-28,430.170013,432.929993,426.200012,426.589996,36252900
2025-12-01,430.140015,433.660004,425.290009,425.320007,57307600


In [52]:
# Selecting the top-level "Close" column leaves a one-column frame keyed by ticker.
tsla_close = tsla["Close"]
tsla_close

Ticker,TSLA
Date,Unnamed: 1_level_1
2017-01-03,14.466000
2017-01-04,15.132667
2017-01-05,15.116667
2017-01-06,15.267333
2017-01-09,15.418667
...,...
2025-11-25,419.399994
2025-11-26,426.579987
2025-11-28,430.170013
2025-12-01,430.140015


In [53]:
# Running maximum of the closing price: the highest close seen up to each date.
tsla_cummax = tsla_close.cummax()
tsla_cummax.head()

Ticker,TSLA
Date,Unnamed: 1_level_1
2017-01-03,14.466
2017-01-04,15.132667
2017-01-05,15.132667
2017-01-06,15.267333
2017-01-09,15.418667


In [54]:
# Trailing stop price: 90% of the highest close seen so far.
(
    tsla["Close"].cummax()
                .mul(0.9)
                .head()
)

Ticker,TSLA
Date,Unnamed: 1_level_1
2017-01-03,13.0194
2017-01-04,13.6194
2017-01-05,13.6194
2017-01-06,13.7406
2017-01-09,13.8768


### How it works...

### There's more...