# Data Pre-process for Oscar Awards

## Import Packages

In [6]:
import pandas as pd

## Step 1: Data Cleaning

- Rename column names to be more descriptive

- Change data type

- Drop unnecessary columns

- Drop duplicates

- Group awards categories 

In [30]:
# Read the CSV file
df = pd.read_csv('the_oscar_award.csv')

# Display the first few rows of the dataframe
print(df.head())

   year_film  year_ceremony  ceremony category                 name  \
0       1927           1928         1    ACTOR  Richard Barthelmess   
1       1927           1928         1    ACTOR        Emil Jannings   
2       1927           1928         1  ACTRESS       Louise Dresser   
3       1927           1928         1  ACTRESS         Janet Gaynor   
4       1927           1928         1  ACTRESS       Gloria Swanson   

               film  winner  
0         The Noose   False  
1  The Last Command    True  
2   A Ship Comes In   False  
3        7th Heaven    True  
4    Sadie Thompson   False  


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10889 entries, 0 to 10888
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   year_film      10889 non-null  int64 
 1   year_ceremony  10889 non-null  int64 
 2   ceremony       10889 non-null  int64 
 3   category       10889 non-null  object
 4   name           10884 non-null  object
 5   film           10570 non-null  object
 6   winner         10889 non-null  bool  
dtypes: bool(1), int64(3), object(3)
memory usage: 521.2+ KB


In [32]:
# Renaming the columns
df = df.rename(columns={'category': 'nomination_category'})
df = df.rename(columns={'name': 'nominee_name'})
df = df.rename(columns={'film': 'film_title'})

df['nominee_name'] = df['nominee_name'].astype(str)
df['film_title'] = df['film_title'].astype(str)

# Dropping columns
df = df.drop(columns=['ceremony'])

In [33]:
# Broad category grouping with case insensitivity
def categorize_award(category):
    category = category.lower()  # Convert to lowercase for case-insensitive matching
    if 'actor' in category or 'actress' in category:
        return 'Acting'
    elif 'director' in category:
        return 'Directing'
    elif 'picture' in category:
        return 'Best Picture'
    else:
        return 'Technical/Other'

df['category_type'] = df['nomination_category'].apply(categorize_award)
df = df.drop(columns=['nomination_category'])

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10889 entries, 0 to 10888
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   year_film      10889 non-null  int64 
 1   year_ceremony  10889 non-null  int64 
 2   nominee_name   10889 non-null  object
 3   film_title     10889 non-null  object
 4   winner         10889 non-null  bool  
 5   category_type  10889 non-null  object
dtypes: bool(1), int64(2), object(3)
memory usage: 436.1+ KB


## Step 2: Check Satistics

In [35]:
# Descriptive statistics for the date column
year_film_stats = {
    'count': df['year_film'].count(),
    'min': df['year_film'].min(),
    'max': df['year_film'].max(),
    'mean': df['year_film'].mean(),
    'mode': df['year_film'].mode()[0],
    'median': df['year_film'].median(),
    'sd': df['year_film'].std(),
}
print(year_film_stats)

# Count the number of null values in 'column1'
null_count = df['year_film'].isnull().sum()
print(f"Number of null values in 'year_film': {null_count}")

{'count': 10889, 'min': 1927, 'max': 2023, 'mean': 1976.6085958306548, 'mode': 1942, 'median': 1976.0, 'sd': 27.35826113240151}
Number of null values in 'year_film': 0


In [36]:
year_ceremony_stats = {
    'count': df['year_ceremony'].count(),
    'min': df['year_ceremony'].min(),
    'max': df['year_ceremony'].max(),
    'mean': df['year_ceremony'].mean(),
    'mode': df['year_ceremony'].mode()[0],
    'median': df['year_ceremony'].median(),
    'sd': df['year_ceremony'].std(),
}
print(year_ceremony_stats)

# Count the number of null values in 'column1'
null_count = df['year_ceremony'].isnull().sum()
print(f"Number of null values in 'year_ceremony': {null_count}")

{'count': 10889, 'min': 1928, 'max': 2024, 'mean': 1977.6085958306548, 'mode': 1943, 'median': 1977.0, 'sd': 27.35826113240151}
Number of null values in 'year_ceremony': 0


In [37]:
# Get distinct count
distinct_count = df['category_type'].nunique()
print(f"Distinct count: {distinct_count}") 

# Get distinct categories for the 'movie_genre' column
distinct_genres = df['category_type'].unique()

# Display the distinct genres
print(distinct_genres)

# Count the number of null values in 'column1'
null_count = df['category_type'].isnull().sum()
print(f"Number of null values in 'nomination_category': {null_count}")

Distinct count: 4
['Acting' 'Technical/Other' 'Best Picture' 'Directing']
Number of null values in 'nomination_category': 0


In [38]:
# Count the number of null values in 'column1'
null_count = df['nominee_name'].isnull().sum()
print(f"Number of null values in 'nominee_name': {null_count}")

Number of null values in 'nominee_name': 0


In [39]:
# Count the number of null values in 'column1'
null_count = df['film_title'].isnull().sum()
print(f"Number of null values in 'film_title': {null_count}")

Number of null values in 'film_title': 0


In [40]:
# Get distinct count
distinct_count = df['winner'].nunique()
print(f"Distinct count: {distinct_count}") 

# Get distinct categories for the 'movie_genre' column
distinct_genres = df['winner'].unique()

# Display the distinct genres
print(distinct_genres)

# Count the number of null values in 'column1'
null_count = df['winner'].isnull().sum()
print(f"Number of null values in 'winner': {null_count}")

Distinct count: 2
[False  True]
Number of null values in 'winner': 0


In [42]:
# Save the updated DataFrame to a CSV file
df.to_csv('Oscar_Awards_CLEAN.csv', index=False)

# Display a message confirming the save
print("Dataset saved as Oscar_Awards_CLEAN.csv")

Dataset saved as Oscar_Awards_CLEAN.csv
