In [1]:
# Milestone 2

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the dataset
df = pd.read_csv('./data/movies.csv')

# Display the first few rows to understand the structure of the dataset
print(df.head())

   movie_id                  title                             genres  \
0    912649  Venom: The Last Dance   Action|Science Fiction|Adventure   
1   1241982                Moana 2  Animation|Adventure|Family|Comedy   
2   1035048              Elevation    Action|Science Fiction|Thriller   
3    974453             Absolution              Action|Crime|Thriller   
4   1182387                  Armor              Action|Crime|Thriller   

   release_year  rating  popularity  
0        2024.0   6.700   16024.105  
1        2024.0   6.900    3921.016  
2        2024.0   6.400    3265.948  
3        2024.0   6.119    2242.793  
4        2024.0   5.600    1924.601  


In [3]:
# Step 1: Handle Missing Values
# Check for missing values
print("\nMissing values in each column:")
print(df.isnull().sum())

# Option 1: Drop rows with missing values (if applicable)
df.dropna(subset=['title', 'rating', 'release_year'], inplace=True)


Missing values in each column:
movie_id         0
title            0
genres          10
release_year     9
rating           0
popularity       0
dtype: int64


In [4]:
# Step 2: Normalize Numerical Data
# Normalize the 'rating' and 'popularity' columns using Min-Max Scaling
scaler = MinMaxScaler()
df[['rating', 'popularity']] = scaler.fit_transform(df[['rating', 'popularity']])

In [5]:
# Step 3: Encode Categorical Data (Genres)
# One-hot encode the 'genres' column
# First, we need to split the genres into individual genres
df['genres'] = df['genres'].apply(lambda x: x.strip('[]').replace(' ', '').split(','))

# One-hot encode the genres
genres_expanded = df['genres'].explode().unique()
for genre in genres_expanded:
    df[genre] = df['genres'].apply(lambda x: 1 if genre in x else 0)

AttributeError: 'float' object has no attribute 'strip'

In [6]:
# Step 4: Remove Duplicates
# Remove duplicate rows based on the 'title' column
df.drop_duplicates(subset=['title'], inplace=True)

In [7]:
# Check the cleaned data
print("\nCleaned dataset:")
print(df.head())

# Step 5: Save Cleaned Data
df.to_csv('./data/cleaned_movies.csv', index=False)
print("\nCleaned data saved to cleaned_movies.csv")


Cleaned dataset:
   movie_id                  title                             genres  \
0    912649  Venom: The Last Dance   Action|Science Fiction|Adventure   
1   1241982                Moana 2  Animation|Adventure|Family|Comedy   
2   1035048              Elevation    Action|Science Fiction|Thriller   
3    974453             Absolution              Action|Crime|Thriller   
4   1182387                  Armor              Action|Crime|Thriller   

   release_year  rating  popularity  
0        2024.0  0.6700    1.000000  
1        2024.0  0.6900    0.243936  
2        2024.0  0.6400    0.203015  
3        2024.0  0.6119    0.139099  
4        2024.0  0.5600    0.119222  

Cleaned data saved to cleaned_movies.csv
