# 📊 Data Cleaning & Preprocessing – Internship Task 1

In [None]:
# 📦 Step 1: Import Required Libraries
import pandas as pd
import numpy as np

In [None]:
# 📂 Step 2: Load the Dataset
# 👉 Point `file_path` at your own copy of the CSV before running this cell
file_path = r"C:/Users/ron33/Downloads/netflix_titles.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# 👀 Show the first five rows as a quick sanity check of the load
df.head(n=5)

In [None]:
# 🔍 Step 3: Explore the Data
# Collect the basic structural facts first, then report them.
dataset_shape = df.shape    # (number of rows, number of columns)
column_index = df.columns   # Index holding the column labels

print("Shape of dataset:", dataset_shape)
print("\nColumn names:\n", column_index)
print("\nInfo:\n")
# info() prints its own summary, so it is called directly rather than
# being wrapped in print()
df.info()

In [None]:
# ❌ Step 4: Check for Missing Values
# isna() flags every missing cell; summing the flags column-wise yields the
# number of missing entries per column
missing = df.isna().sum()
print("Missing values in each column:\n", missing)

In [None]:
# 🧹 Step 5: Handle Missing Values

# Rows without a director receive the placeholder 'Unknown' instead of NaN
director_filled = df['director'].fillna(value='Unknown')
df['director'] = director_filled

# Rows lacking a value in any essential column (currently only 'title')
# are discarded
essential_columns = ['title']
df = df.dropna(subset=essential_columns)

In [None]:
# 🔁 Step 6: Remove Duplicates
# A row is flagged as a duplicate when all of its columns match an earlier row
duplicates = df.duplicated(keep='first').sum()
print("Number of duplicate rows:", duplicates)

# Keep the first occurrence of each row and discard the repeats
df = df.drop_duplicates(keep='first')

In [None]:
# 🛠 Step 7: Standardize Columns and Values

# Normalize column headers to snake_case: trim stray surrounding whitespace
# first (so " Title " becomes "title" rather than "_title_"), lowercase, then
# swap inner spaces for underscores. regex=False keeps the replacement a
# literal one on every pandas version.
df.columns = (
    df.columns.str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
)

# Standardize 'type' values. A blanket .str.title() lowercases every
# non-initial letter, so it would rewrite "TV Show" as "Tv Show". Known labels
# are therefore matched case-insensitively against their canonical spelling;
# anything unrecognized falls back to title case, and missing values stay
# missing.
# NOTE(review): the canonical labels below assume the dataset only uses
# 'Movie' and 'TV Show' - confirm against your file and extend if needed.
canonical_types = {'movie': 'Movie', 'tv show': 'TV Show'}
type_trimmed = df['type'].str.strip()
df['type'] = (
    type_trimmed.str.lower()
    .map(canonical_types)
    .fillna(type_trimmed.str.title())
)

In [None]:
# 🗓 Step 8: Convert Date Columns
# Trim whitespace around each date string before parsing. pandas >= 2.0 infers
# one format from the first value and applies it to the whole column, so an
# otherwise valid entry such as " August 4, 2017" (leading space) would not
# match and errors='coerce' would silently turn it into NaT.
# Non-string cells (NaN, or timestamps if this cell is re-run) pass through
# unchanged.
date_text = df['date_added'].map(
    lambda value: value.strip() if isinstance(value, str) else value
)
df['date_added'] = pd.to_datetime(date_text, errors='coerce')  # genuinely invalid dates -> NaT

In [None]:
# ✅ Step 9: Check Data Types
# Print the dtype of every column to verify the conversions made so far
column_dtypes = df.dtypes
print(column_dtypes)

In [None]:
# 💾 Step 10: Save Cleaned Data
# The relative path resolves against the current working directory;
# index=False leaves the DataFrame's row index out of the file
output_file = "cleaned_netflix_data.csv"
df.to_csv(path_or_buf=output_file, index=False)
print("Cleaned dataset saved successfully!")