In [1]:
%matplotlib inline

In [2]:
import pandas as pd 
import matplotlib.pyplot as plt

In [3]:
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15, 5)

In [4]:
#1 : je retire les espaces et l'index
df_female = pd.read_csv('./csv/oscar_age_female_raw.csv', skipinitialspace=True, index_col='Index')

In [5]:
for colonne in df_female.select_dtypes(include='object').columns:
    df_female[colonne] = df_female[colonne].str.replace(r'[^\w\s]', '')

In [6]:
df_female[:4]

Unnamed: 0_level_0,Year,Age,Name,Movie
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1928,22,Janet Gaynor,"Seventh Heaven, Street Angel and Sunrise: A So..."
2,1929,37,Mary Pickford,Coquette
3,1930,28,Norma Shearer,The Divorcee\t
4,1931,63,Marie Dressler,Min and Bill


In [7]:
 # 3 : Je modifie le format de l'année 
df_female['Year'] = pd.to_datetime(df_female['Year'], format='%Y')

In [8]:
# #2 : Voici une commande qui vas me permettre de rectifier le probleme de tabulation 
# df_female = pd.read_csv('./csv/oscar_age_female_raw.csv', sep=",\s*", engine='python')

In [9]:
df_female['Year']

Index
1    1928-01-01
2    1929-01-01
3    1930-01-01
4    1931-01-01
5    1932-01-01
        ...    
85   2012-01-01
86   2013-01-01
87   2014-01-01
88   2015-01-01
89   2016-01-01
Name: Year, Length: 89, dtype: datetime64[ns]

In [10]:
# je cherche maintenant a rectifier le decalage d'année 
df_female['Year'] = df_female['Year']- pd.DateOffset(years=1)
df_female['Year']=df_female['Year'].dt.strftime('%Y')

In [11]:
# df_female['year']

In [12]:
df_female

Unnamed: 0_level_0,Year,Age,Name,Movie
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1927,22,Janet Gaynor,"Seventh Heaven, Street Angel and Sunrise: A So..."
2,1928,37,Mary Pickford,Coquette
3,1929,28,Norma Shearer,The Divorcee\t
4,1930,63,Marie Dressler,Min and Bill
5,1931,32,Helen Hayes,The Sin of Madelon Claudet\t
...,...,...,...,...
85,2011,62,Meryl Streep,The Iron Lady
86,2012,22,Jennifer Lawrence,Silver Linings Playbook
87,2013,44,Cate Blanchett,Blue Jasmine
88,2014,54,Julianne Moore,Still Alice


In [13]:
print(df_female.columns)


Index(['Year', 'Age', 'Name', 'Movie'], dtype='object')


In [14]:
# rectifier le probleme de tabulation 
df_female['Movie'] = df_female['Movie'].str.replace('\t', '')

In [15]:
df_female

Unnamed: 0_level_0,Year,Age,Name,Movie
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1927,22,Janet Gaynor,"Seventh Heaven, Street Angel and Sunrise: A So..."
2,1928,37,Mary Pickford,Coquette
3,1929,28,Norma Shearer,The Divorcee
4,1930,63,Marie Dressler,Min and Bill
5,1931,32,Helen Hayes,The Sin of Madelon Claudet
...,...,...,...,...
85,2011,62,Meryl Streep,The Iron Lady
86,2012,22,Jennifer Lawrence,Silver Linings Playbook
87,2013,44,Cate Blanchett,Blue Jasmine
88,2014,54,Julianne Moore,Still Alice


In [16]:
latest_female_oscars = pd.DataFrame({
    'Year': [2016, 2017, 2018, 2019, 2020, 2021, 2022],
    'Age': [28, 60, 44, 50, 63, 44, 60],
    'Name': ['Emma Stone', 'Frances McDormand', 'Olivia Colman', 'Renée Zellweger', 'Frances McDormand', 'Jessica Chastain', 'Michelle Yeoh'],
    'Movie': ['La La Land', 'Three Billboards Outside Ebbing, Missouri', 'The Favourite', 'Judy', 'Nomadland', 'The Eyes of Tammy Faye', 'Everything Everywhere All at Once']
})
df_combined_female = pd.concat([df_female, latest_female_oscars], ignore_index=True)

In [17]:
df_combined_female

Unnamed: 0,Year,Age,Name,Movie
0,1927,22,Janet Gaynor,"Seventh Heaven, Street Angel and Sunrise: A So..."
1,1928,37,Mary Pickford,Coquette
2,1929,28,Norma Shearer,The Divorcee
3,1930,63,Marie Dressler,Min and Bill
4,1931,32,Helen Hayes,The Sin of Madelon Claudet
...,...,...,...,...
91,2018,44,Olivia Colman,The Favourite
92,2019,50,Renée Zellweger,Judy
93,2020,63,Frances McDormand,Nomadland
94,2021,44,Jessica Chastain,The Eyes of Tammy Faye


In [18]:
def lowercase_first_letter(s):
    return s[0].lower() + s[1:]
df_combined_female = df_combined_female.rename(columns=lambda x: lowercase_first_letter(x))

# Afficher le DataFrame avec les noms de colonnes en minuscules pour la première lettre
df_combined_female

Unnamed: 0,year,age,name,movie
0,1927,22,Janet Gaynor,"Seventh Heaven, Street Angel and Sunrise: A So..."
1,1928,37,Mary Pickford,Coquette
2,1929,28,Norma Shearer,The Divorcee
3,1930,63,Marie Dressler,Min and Bill
4,1931,32,Helen Hayes,The Sin of Madelon Claudet
...,...,...,...,...
91,2018,44,Olivia Colman,The Favourite
92,2019,50,Renée Zellweger,Judy
93,2020,63,Frances McDormand,Nomadland
94,2021,44,Jessica Chastain,The Eyes of Tammy Faye


In [19]:
average_age = df_combined_female['age'].mean()
print("Moyenne d'âge :", average_age)


Moyenne d'âge : 37.125


In [20]:
df_combined_female['movie']

0     Seventh Heaven, Street Angel and Sunrise: A So...
1                                              Coquette
2                                          The Divorcee
3                                          Min and Bill
4                            The Sin of Madelon Claudet
                            ...                        
91                                        The Favourite
92                                                 Judy
93                                            Nomadland
94                               The Eyes of Tammy Faye
95                    Everything Everywhere All at Once
Name: movie, Length: 96, dtype: object

In [21]:
df_combined_female.columns
df_combined_female.columns = df_combined_female.columns.str.lower()
df_combined_female.describe()

Unnamed: 0,age
count,96.0
mean,37.125
std,12.263982
min,21.0
25%,28.75
50%,33.0
75%,41.25
max,80.0


In [22]:
df_male  = pd.read_csv('./csv/oscar_age_male_raw.csv', skipinitialspace=True, index_col='Index')

In [23]:
df_male[:4]

Unnamed: 0_level_0,Year,Age,Name,Movie
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1928,44,Emil Jannings,"The Last Command, The Way of All Flesh"
2,1929,41,Warner Baxter,In Old Arizona
3,1930,62,George Arliss,Disraeli
4,1931,53,Lionel Barrymore,A Free Soul


In [24]:
# rectifier le probleme de tabulation 
df_male['Movie'] = df_male['Movie'].str.replace('\t', '')

In [25]:
def lowercase_first_letter2(s):
    return s[0].lower() + s[1:]
df_male = df_male.rename(columns=lambda x: lowercase_first_letter(x))

# Afficher le DataFrame avec les noms de colonnes en minuscules pour la première lettre

In [26]:
df_male

Unnamed: 0_level_0,year,age,name,movie
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1928,44,Emil Jannings,"The Last Command, The Way of All Flesh"
2,1929,41,Warner Baxter,In Old Arizona
3,1930,62,George Arliss,Disraeli
4,1931,53,Lionel Barrymore,A Free Soul
5,1932,47,Wallace Beery,The Champ
...,...,...,...,...
85,2012,39,Jean Dujardin,The Artist
86,2013,55,Daniel Day-Lewis,Lincoln
87,2014,44,Matthew McConaughey,Dallas Buyers Club
88,2015,33,Eddie Redmayne,The Theory of Everything


In [27]:
 # 3 : Je modifie le format de l'année 
df_male['year'] = pd.to_datetime(df_male['year'], format='%Y')

In [28]:
# je cherche maintenant a rectifier le decalage d'année 
df_male['year'] = df_male['year']- pd.DateOffset(years=1)
df_male['year']=df_male['year'].dt.strftime('%Y')

In [29]:
df_male

Unnamed: 0_level_0,year,age,name,movie
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1927,44,Emil Jannings,"The Last Command, The Way of All Flesh"
2,1928,41,Warner Baxter,In Old Arizona
3,1929,62,George Arliss,Disraeli
4,1930,53,Lionel Barrymore,A Free Soul
5,1931,47,Wallace Beery,The Champ
...,...,...,...,...
85,2011,39,Jean Dujardin,The Artist
86,2012,55,Daniel Day-Lewis,Lincoln
87,2013,44,Matthew McConaughey,Dallas Buyers Club
88,2014,33,Eddie Redmayne,The Theory of Everything


In [30]:
latest_male_oscars = pd.DataFrame({
    'year': [2016, 2017, 2018, 2019, 2020, 2021, 2022],
    'age': [41, 59, 37, 45, 83, 53, 54],
    'name': ['Casey Affleck', 'Gary Oldman', 'Rami Malek', 'Joaquin Phoenix', 'Anthony Hopkins', 'Will Smith', 'Brendan Fraser'],
    'movie': ['Manchester by the Sea', 'Darkest Hour', 'Bohemian Rhapsody', 'Joker', 'The Father', 'King Richard', 'The Whale']
})
df_combined_male = pd.concat([df_male, latest_male_oscars], ignore_index=True)

In [31]:
average_age = df_combined_male['age'].mean()
print("Moyenne d'âge :", average_age)


Moyenne d'âge : 44.552083333333336


In [32]:
for colonne in df_combined_male.select_dtypes(include='object').columns:
    df_combined_male[colonne] = df_combined_male[colonne].str.replace(r'[^\w\s]', '')

In [33]:
df_combined_male

Unnamed: 0,year,age,name,movie
0,1927,44,Emil Jannings,"The Last Command, The Way of All Flesh"
1,1928,41,Warner Baxter,In Old Arizona
2,1929,62,George Arliss,Disraeli
3,1930,53,Lionel Barrymore,A Free Soul
4,1931,47,Wallace Beery,The Champ
...,...,...,...,...
91,,37,Rami Malek,Bohemian Rhapsody
92,,45,Joaquin Phoenix,Joker
93,,83,Anthony Hopkins,The Father
94,,53,Will Smith,King Richard


In [34]:
#Super maintenant on enregistre la data nettoyer sur le meme chemins que les anciennes 
df_combined_male.to_csv('./csv/oscar_age_male_processed.csv', index=False)
df_combined_female.to_csv('./csv/oscar_age_female_processed.csv', index=False)

In [35]:
df_combined_male['gender']='m'

In [36]:
df_combined_male

Unnamed: 0,year,age,name,movie,gender
0,1927,44,Emil Jannings,"The Last Command, The Way of All Flesh",m
1,1928,41,Warner Baxter,In Old Arizona,m
2,1929,62,George Arliss,Disraeli,m
3,1930,53,Lionel Barrymore,A Free Soul,m
4,1931,47,Wallace Beery,The Champ,m
...,...,...,...,...,...
91,,37,Rami Malek,Bohemian Rhapsody,m
92,,45,Joaquin Phoenix,Joker,m
93,,83,Anthony Hopkins,The Father,m
94,,53,Will Smith,King Richard,m


In [37]:
df_combined_female['gender']='f'

In [38]:
df_combined_female

Unnamed: 0,year,age,name,movie,gender
0,1927,22,Janet Gaynor,"Seventh Heaven, Street Angel and Sunrise: A So...",f
1,1928,37,Mary Pickford,Coquette,f
2,1929,28,Norma Shearer,The Divorcee,f
3,1930,63,Marie Dressler,Min and Bill,f
4,1931,32,Helen Hayes,The Sin of Madelon Claudet,f
...,...,...,...,...,...
91,2018,44,Olivia Colman,The Favourite,f
92,2019,50,Renée Zellweger,Judy,f
93,2020,63,Frances McDormand,Nomadland,f
94,2021,44,Jessica Chastain,The Eyes of Tammy Faye,f


In [39]:
for colonne in df_female.select_dtypes(include='object').columns:
    df_female[colonne] = df_female[colonne].str.replace(r'[^\w\s]', '')

In [40]:
df_combined_mf = pd.concat([df_combined_female, df_combined_male], ignore_index=True)


In [42]:
df_combined_mf[

Unnamed: 0,year,age,name,movie,gender
0,1927,22,Janet Gaynor,"Seventh Heaven, Street Angel and Sunrise: A So...",f
1,1928,37,Mary Pickford,Coquette,f
2,1929,28,Norma Shearer,The Divorcee,f
3,1930,63,Marie Dressler,Min and Bill,f
4,1931,32,Helen Hayes,The Sin of Madelon Claudet,f
...,...,...,...,...,...
187,,37,Rami Malek,Bohemian Rhapsody,m
188,,45,Joaquin Phoenix,Joker,m
189,,83,Anthony Hopkins,The Father,m
190,,53,Will Smith,King Richard,m


In [41]:
df_combined_mf.to_csv('./csv/oscar_age_processed.csv', index=False)