In [6]:
import pandas as pd

# 1. Load dataset
df = pd.read_csv('/content/drive/MyDrive/da/netflix_titles1.csv')



# 2. Standardize all column headers: lowercase, underscores instead of spaces
df.columns = [col.lower().replace(' ', '_') for col in df.columns]
print("All column headers standardized: lowercase, spaces replaced by underscores.")

# 3. Identify and handle missing values
null_sum = df.isnull().sum().sum()
if null_sum > 0:
    print(f"Missing values found: {null_sum}. Cleaning...")
    df = df.dropna()
    print("All missing values handled (rows with nulls removed).")
else:
    print("No null values found.")

# 4. Remove duplicate rows
dup_count = df.duplicated().sum()
if dup_count > 0:
    df = df.drop_duplicates()
    print(f"Duplicates found and removed: {dup_count}")
else:
    print("No duplicate rows found.")

# 5. Standardize text columns (example: gender and country)
if 'gender' in df.columns:
    df['gender'] = df['gender'].str.lower().str.strip().replace({'m':'male','f':'female'})
    print("Gender column standardized.")
if 'country' in df.columns:
    df['country'] = df['country'].str.lower().str.strip()
    print("Country column standardized.")

# 6. Convert likely date columns to dd-mm-yyyy format
date_cols = []
for col in df.columns:
    # Only columns with 'date' or 'year' in the name
    if 'date' in col or 'year' in col:
        try:
            temp_col = pd.to_datetime(df[col], errors='coerce')
            # If any valid parse, apply formatting
            if temp_col.notna().sum() > 0:
                df[col] = temp_col.dt.strftime('%d-%m-%Y')
                date_cols.append(col)
        except Exception:
            pass
if date_cols:
    print(f"Date columns standardized in dd-mm-yyyy format: {', '.join(date_cols)}")
else:
    print("No date columns found to standardize.")

# 7. Check and fix data types (example: age as int)
if 'age' in df.columns:
    df['age'] = pd.to_numeric(df['age'], errors='coerce').astype("Int64")
    print("Age column converted to integer type.")

# 8. Save cleaned dataset
df.to_csv('cleaned_data.csv', index=False)
print("Cleaned dataset saved as 'cleaned_data.csv'.")

print("Column data types:")
print(df.dtypes)

# 9. Summary of changes
summary = {
    "missing_values_handled": null_sum > 0,
    "duplicates_removed": dup_count > 0,
    "columns_renamed": True,
    "text_standardized": ('gender' in df.columns or 'country' in df.columns),
    "date_format_standardized": (len(date_cols) > 0),
    "data_types_checked": 'age' in df.columns
}
print("Summary of changes:", summary)


All column headers standardized: lowercase, spaces replaced by underscores.
Missing values found: 4307. Cleaning...
All missing values handled (rows with nulls removed).
No duplicate rows found.
Country column standardized.
Date columns standardized in dd-mm-yyyy format: date_added, release_year
Cleaned dataset saved as 'cleaned_data.csv'.
Column data types:
show_id         object
type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year    object
rating          object
duration        object
listed_in       object
description     object
dtype: object
Summary of changes: {'missing_values_handled': np.True_, 'duplicates_removed': np.False_, 'columns_renamed': True, 'text_standardized': True, 'date_format_standardized': True, 'data_types_checked': False}


In [7]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/da/netflix_titles1.csv')
df.columns = [col.lower().replace(' ', '_') for col in df.columns]

# Print initial data types
print("Initial column data types:")
print(df.dtypes)

# Convert 'age' column to integer type, if it exists
if 'age' in df.columns:
    df['age'] = pd.to_numeric(df['age'], errors='coerce').astype("Int64")
    print("Age column converted to integer type.")

# Convert likely date columns to datetime, if they exist
date_cols = []
for col in df.columns:
    if 'date' in col or 'year' in col:
        try:
            df[col] = pd.to_datetime(df[col], errors='coerce')
            date_cols.append(col)
        except Exception:
            pass
if date_cols:
    print(f"Date columns converted to datetime type: {', '.join(date_cols)}")
else:
    print("No date columns found to convert.")

# Print updated data types
print("Column data types after fixing:")
print(df.dtypes)


Initial column data types:
show_id         object
type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
description     object
dtype: object
Date columns converted to datetime type: date_added, release_year
Column data types after fixing:
show_id                 object
type                    object
title                   object
director                object
cast                    object
country                 object
date_added      datetime64[ns]
release_year    datetime64[ns]
rating                  object
duration                object
listed_in               object
description             object
dtype: object
