# Netflix Movies & TV Shows Analysis

**Dataset file expected:** `netflix_titles.csv`

**Tasks covered:** Data cleaning, EDA, visualizations, and advanced analysis ideas.


In [None]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

# Notebook output limits: render up to 50 columns and 100 rows before truncating
pd.options.display.max_columns = 50
pd.options.display.max_rows = 100


In [None]:
# Read netflix_titles.csv from the working directory.
# If the file is missing, `df` stays undefined and a hint is printed instead;
# the cells below check for `df` before doing any work.
fn = "netflix_titles.csv"
try:
    df = pd.read_csv(fn)
except FileNotFoundError:
    print("File not found. Please upload 'netflix_titles.csv' to the working directory.")
else:
    # Success path: report the shape and preview the first rows
    print("Loaded:", fn, " — shape:", df.shape)
    display(df.head())


## 1) Data Cleaning
- Inspect missing values
- Fix datatypes
- Split multi-valued columns (e.g., `country`, `cast`, `listed_in`) into lists for analysis


In [None]:
# Basic cleaning steps
def basic_clean(df):
    """Return a cleaned copy of the raw titles DataFrame.

    The input frame is left untouched. Each step runs only when the column
    it needs is present:

    - string column names are stripped of surrounding whitespace;
    - ``date_added`` is parsed to datetime (unparseable values become NaT)
      and a ``year_added`` column is derived from it;
    - ``release_year`` is converted to the nullable ``Int64`` dtype
      (non-numeric values become ``<NA>``);
    - missing ``description`` values are replaced with ``''``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data as loaded from the CSV file.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned DataFrame.
    """
    df = df.copy()
    # Standardize column names (non-string labels are passed through as-is)
    df.columns = [c.strip() if isinstance(c, str) else c for c in df.columns]
    # Convert date_added to datetime if present
    if 'date_added' in df.columns:
        # Trim whitespace first: to_datetime infers a single format, so a value
        # such as " August 4, 2017" would otherwise be coerced to NaT and the
        # title would vanish from the per-year counts.
        raw_dates = df['date_added'].map(
            lambda v: v.strip() if isinstance(v, str) else v
        )
        df['date_added'] = pd.to_datetime(raw_dates, errors='coerce')
        # Float-valued when any date is missing (NaT -> NaN)
        df['year_added'] = df['date_added'].dt.year
    # Make release_year a nullable integer column if present
    if 'release_year' in df.columns:
        df['release_year'] = pd.to_numeric(df['release_year'], errors='coerce').astype('Int64')
    # Fill missing descriptions with empty string
    if 'description' in df.columns:
        df['description'] = df['description'].fillna('')
    return df

# Clean the dataset (only if it was loaded) and summarise the result
if 'df' in globals():
    df = basic_clean(df)
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly instead of being passed to display().
    df.info()
    # Missing-value count per column
    display(df.isnull().sum())


## 2) Exploratory Data Analysis (EDA)
- Distribution of content types (Movie vs TV Show)
- Top genres
- Release year trends
- Country distribution


In [None]:
# Movie vs TV Show split: print the counts, then draw them as a bar chart
if 'df' in globals() and 'type' in df.columns:
    type_counts = df['type'].value_counts()
    print(type_counts)
    type_counts.plot(kind='bar')
    plt.xlabel('Type')
    plt.ylabel('Count')
    plt.title('Content Type Counts')
    plt.tight_layout()
    plt.show()


In [None]:
# Top genres (listed_in column)
if 'df' in globals() and 'listed_in' in df.columns:
    # One list of trimmed genre labels per title; titles without genres are dropped
    s = (
        df['listed_in']
        .dropna()
        .str.split(',')
        .map(lambda labels: [label.strip() for label in labels])
    )
    # Flatten to one entry per (title, genre) pair
    all_genres = pd.Series(s.explode().tolist())
    top_genres = all_genres.value_counts().head(20)
    # Table shows the top 20; the chart is limited to the top 10
    display(top_genres)
    top_genres.head(10).plot(kind='bar')
    plt.title('Top Genres')
    plt.tight_layout()
    plt.show()


## 3) Advanced Analysis Ideas
- Trends over time (content added per year)
- Popular directors/actors
- Regional preferences (country-level)
- Recommendation baseline using content-based similarity on descriptions or genres


In [None]:
# Example: Content added per year (if date_added available)
if 'df' in globals() and 'year_added' in df.columns:
    # Number of titles for each year_added value, ordered by year
    per_year = df.groupby('year_added').size()
    yearly = per_year.sort_index()
    # Show the 15 most recent years, then plot the whole series as a line
    display(yearly.tail(15))
    yearly.plot()
    plt.xlabel('Year Added')
    plt.ylabel('Count')
    plt.title('Content Added Per Year')
    plt.tight_layout()
    plt.show()


In [None]:
# Simple content-based recommendation skeleton using descriptions
# Runs only when every column selected below exists, not just 'description',
# so a dataset without 'show_id' or 'title' skips the cell instead of raising KeyError.
if 'df' in globals() and {'show_id', 'title', 'description'}.issubset(df.columns):
    from sklearn.metrics.pairwise import cosine_similarity

    sample = df[['show_id', 'title', 'description']].dropna(subset=['description'])
    # basic_clean() replaces missing descriptions with '', so dropna alone
    # lets blank texts through; filter them out explicitly.
    sample = sample[sample['description'].astype(str).str.strip() != '']
    sample = sample.reset_index(drop=True).head(2000)
    if sample.empty:
        # CountVectorizer cannot build a vocabulary from zero documents
        print("No non-empty descriptions available; skipping similarity example.")
    else:
        # Bag-of-words counts over the 2000 most frequent non-stop-word terms
        vect = CountVectorizer(max_features=2000, stop_words='english')
        X = vect.fit_transform(sample['description'])
        # compute cosine similarity for a toy example
        sim = cosine_similarity(X[:50], X[:50])  # small sample for quick compute
        print("Similarity matrix shape:", sim.shape)
        # You can expand this to full dataset if memory allows


## 4) Deliverables & Suggestions
- Save cleaned data to `netflix_cleaned.csv`
- Create visualizations as PNGs and include them in the PDF report
- Optional: Build a small Flask app to serve recommendations
