In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import joblib
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): with no category/module filter this hides all warnings,
# including deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')

# Seed NumPy's legacy global RNG for reproducibility. The splitters and
# estimators further down are additionally given an explicit random_state=42.
np.random.seed(42)

# Load dataset
def load_data(file_path="C:/Users/arpit/Documents/netflix_titles.csv"):
    """Read the Netflix titles CSV into a DataFrame.

    Parameters
    ----------
    file_path : str or os.PathLike, optional
        Location of the CSV file. The default is the path this notebook was
        originally written against, so ``load_data()`` keeps working; pass a
        different path to run the notebook on another machine.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV, exactly as returned by ``pandas.read_csv``.

    Raises
    ------
    FileNotFoundError
        If nothing exists at ``file_path``.
    """
    import os

    # Check up front so the caller gets a message that names the path.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Dataset file not found at {file_path}. Please verify the path.")
    return pd.read_csv(file_path)

# Data Cleaning and Preprocessing
def clean_data(data):
    """Clean the raw titles table and derive duration and date columns.

    The steps run in this order:

    1. Rows whose ``rating`` holds a misplaced duration ('66 min', '74 min',
       '84 min') get that value moved to ``duration`` and their rating set to
       the most frequent rating.
    2. Rows rated 'G', 'NC-17', 'PG', 'PG-13' or 'UR' are removed.
    3. Missing ``director``, ``cast``, ``country`` and ``rating`` values are
       filled with the column mode.
    4. Missing durations become "90 min" (movies) or "1 Season" (TV shows).
    5. ``movie_duration_min`` is parsed from ``duration`` for movies (0 for
       everything else), movies shorter than 15 or longer than 300 minutes
       are dropped, remaining runtimes are capped to the IQR fences, and the
       zeros are replaced by the median movie runtime.
    6. ``date_added`` is parsed to datetime, unparseable rows are dropped, and
       ``year_added`` / ``month_added`` are extracted.
    7. IQR outliers are reported for ``release_year`` and ``year_added``;
       only ``year_added`` is clipped to its fences.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table with at least the columns ``show_id``, ``type``,
        ``director``, ``cast``, ``country``, ``date_added``,
        ``release_year``, ``rating`` and ``duration``.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame. The input frame is not modified.
    """

    def _movie_minutes(duration, title_type):
        # Runtime in minutes for a movie row; 0 marks "not a movie runtime".
        if isinstance(duration, str) and 'min' in duration and title_type == 'Movie':
            return float(duration.split()[0])
        return 0

    def _unrealistic_movies(frame):
        # Movies with a parsed runtime outside the 15-300 minute window.
        minutes = frame['movie_duration_min']
        return (frame['type'] == 'Movie') & (minutes > 0) & ((minutes < 15) | (minutes > 300))

    # Work on a private copy so the caller's frame is never mutated.
    data = data.copy()

    print("Missing Values Percentage:")
    print(data.isnull().mean() * 100)

    print(f"\nInitial rows: {len(data)}")

    # Fix erroneous ratings
    erroneous_ratings = ['66 min', '74 min', '84 min']
    for rating in erroneous_ratings:
        misplaced = data['rating'] == rating
        if misplaced.any():
            print(f"\nCorrecting erroneous rating '{rating}'")
            data.loc[misplaced, 'duration'] = rating
            data.loc[misplaced, 'rating'] = data['rating'].mode()[0]

    # Exclude exclusive ratings
    exclusive_ratings = ['G', 'NC-17', 'PG', 'PG-13', 'UR']
    print(f"\nExcluding exclusive ratings: {exclusive_ratings}")
    # .copy() makes the filtered frame independent, so later assignments are
    # neither chained assignments nor silent no-ops under Copy-on-Write.
    data = data[~data['rating'].isin(exclusive_ratings)].copy()
    print(f"Rows after excluding ratings: {len(data)}")

    # Impute missing values for categorical columns
    cat_cols = ['director', 'cast', 'country', 'rating']
    for col in cat_cols:
        # Assign the result back instead of calling fillna(inplace=True) on a
        # column selection, which does not reliably write through to the frame.
        data[col] = data[col].fillna(data[col].mode()[0])

    # Handle missing duration values
    print("\nChecking duration column for non-string or missing values:")
    invalid_duration = data[data['duration'].isna() | ~data['duration'].apply(lambda x: isinstance(x, str))]
    print(f"Rows with invalid duration (NaN or non-string): {len(invalid_duration)}")
    if len(invalid_duration) > 0:
        print("Invalid duration values:")
        print(invalid_duration[['show_id', 'type', 'duration']])

    no_duration = data['duration'].isna()
    data['duration'] = (
        data['duration']
        .mask(no_duration & (data['type'] == 'Movie'), "90 min")
        .mask(no_duration & (data['type'] == 'TV Show'), "1 Season")
    )

    # Validate movie durations
    data['movie_duration_min'] = pd.Series(
        [_movie_minutes(duration, title_type)
         for duration, title_type in zip(data['duration'], data['type'])],
        index=data.index,
        dtype=float,
    )
    invalid_movies = data[_unrealistic_movies(data)]
    print(f"\nRows with unrealistic movie durations (<15 min or >300 min): {len(invalid_movies)}")
    if len(invalid_movies) > 0:
        print("Unrealistic movie durations:")
        print(invalid_movies[['show_id', 'type', 'duration', 'movie_duration_min']])

    # Drop unrealistic durations
    data = data[~_unrealistic_movies(data)].copy()
    print(f"Rows after dropping unrealistic durations: {len(data)}")

    # Validate post-drop
    invalid_movies_post_drop = data[_unrealistic_movies(data)]
    print(f"Rows with unrealistic movie durations (post-drop): {len(invalid_movies_post_drop)}")

    # Cap outliers in movie_duration_min (fences computed on real runtimes only)
    positive_minutes = data.loc[data['movie_duration_min'] > 0, 'movie_duration_min']
    Q1 = positive_minutes.quantile(0.25)
    Q3 = positive_minutes.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = max(15, Q1 - 1.5 * IQR)
    upper_bound = min(300, Q3 + 1.5 * IQR)
    movie_with_runtime = (data['type'] == 'Movie') & (data['movie_duration_min'] > 0)
    data.loc[movie_with_runtime & (data['movie_duration_min'] < lower_bound), 'movie_duration_min'] = lower_bound
    data.loc[movie_with_runtime & (data['movie_duration_min'] > upper_bound), 'movie_duration_min'] = upper_bound

    # Recompute outliers
    duration_outliers = data[(data['movie_duration_min'] > 0) & ((data['movie_duration_min'] < lower_bound) | (data['movie_duration_min'] > upper_bound))]['movie_duration_min']
    print(f"\nOutliers in movie_duration_min (post-capping): {len(duration_outliers)}")

    # Impute zeros with median (the zeros are the rows with no movie runtime)
    movie_median = data[data['movie_duration_min'] > 0]['movie_duration_min'].median()
    data['movie_duration_min'] = data['movie_duration_min'].replace(0, movie_median)

    # Clean date_added
    data['date_added'] = data['date_added'].str.strip()
    data['date_added'] = pd.to_datetime(data['date_added'], errors='coerce', format='mixed')
    print(f"\nRows with invalid date_added (NaT): {data['date_added'].isna().sum()}")
    data = data.dropna(subset=['date_added'])
    print(f"Rows after dropping NaT: {len(data)}")

    # Extract year and month
    data['year_added'] = data['date_added'].dt.year
    data['month_added'] = data['date_added'].dt.month

    # Outlier detection
    numerical_cols = ['release_year', 'year_added']
    for col in numerical_cols:
        Q1 = data[col].quantile(0.25)
        Q3 = data[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outliers = data[(data[col] < lower_bound) | (data[col] > upper_bound)][col]
        print(f"\nOutliers in {col}: {len(outliers)}")
        print(outliers.describe())
        # release_year outliers are only reported; year_added is clipped.
        if col == 'year_added':
            data[col] = data[col].clip(lower=lower_bound, upper=upper_bound)

    return data

# Feature Engineering and Selection
def feature_engineering(data):
    """Encode the target, derive genre/country flags and build the model matrix.

    Side effects: adds ``type_encoded``, ``num_genres``, ``is_international``
    and the ``is_*`` genre flag columns to ``data`` in place, and stores the
    fitted ``ColumnTransformer`` and ``LabelEncoder`` in the module-level
    names ``preprocessor`` and ``le`` (``tune_model`` saves them to disk).

    NOTE(review): the scaler and one-hot encoder are fit on every row passed
    in, i.e. before any train/test split happens downstream.

    Parameters
    ----------
    data : pandas.DataFrame
        Cleaned table containing ``type``, ``listed_in``, ``country``,
        ``rating``, ``release_year``, ``year_added`` and ``month_added``.

    Returns
    -------
    X : numpy.ndarray
        Scaled numeric features followed by one-hot encoded categorical
        features (first category of each dropped).
    y : pandas.Series
        Label-encoded ``type`` column, carrying ``data``'s index.
    features : list of str
        Column names for ``X``, in order.
    """
    global preprocessor, le  # Make preprocessor and label encoder global for saving
    le = LabelEncoder()
    data['type_encoded'] = le.fit_transform(data['type'])

    # Genre features
    data['num_genres'] = data['listed_in'].apply(lambda x: len(x.split(',')))
    data['is_international'] = data['country'].apply(lambda x: 1 if 'United States' not in x else 0)
    genre_keywords = {
        'is_drama': 'Dramas',
        'is_comedy': 'Comedies',
        'is_documentary': 'Documentaries',
        'is_action': 'Action',
        'is_horror': 'Horror',
        'is_sci_fi': 'Sci-Fi',
        'is_romance': 'Romantic',
    }
    for flag_column, keyword in genre_keywords.items():
        # Bind keyword as a default so each lambda keeps its own value.
        data[flag_column] = data['listed_in'].apply(lambda x, kw=keyword: 1 if kw in x else 0)

    # Preprocessing pipeline
    numerical_features = ['release_year', 'year_added', 'month_added', 'num_genres']
    categorical_features = ['rating', 'is_international', 'is_drama', 'is_comedy', 'is_documentary', 'is_action', 'is_horror', 'is_sci_fi', 'is_romance']

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_features)
        ])

    X = preprocessor.fit_transform(data)
    y = data['type_encoded']

    # Get feature names: drop='first' removes each feature's first category,
    # so the surviving columns are categories_[i][1:] for feature i.
    encoder = preprocessor.named_transformers_['cat']
    cat_features = [
        f"{feature}_{cat}"
        for feature, categories in zip(categorical_features, encoder.categories_)
        for cat in categories[1:]
    ]
    features = numerical_features + cat_features

    # Check correlations
    print("\nCorrelation with target (type_encoded):")
    X_df = pd.DataFrame(X, columns=features)
    # X_df has a fresh 0..n-1 index while y keeps the (filtered) index of
    # `data`. Series.corr aligns on index labels, so compare against a
    # positionally re-indexed copy of y to pair each row with its own label.
    target = y.reset_index(drop=True)
    for feature in features:
        corr = X_df[feature].corr(target)
        print(f"{feature}: {corr:.4f}")
        if abs(corr) > 0.8:
            print(f"Warning: High correlation ({feature}) may indicate leakage.")

    return X, y, features

# Model Building and Evaluation
def _positive_class_scores(model, X):
    """Return a continuous positive-class score for ROC-AUC (probability if available)."""
    if hasattr(model, 'predict_proba'):
        return model.predict_proba(X)[:, 1]
    return model.decision_function(X)


def _split_metrics(model, X, y):
    """Compute precision, recall, F1 and ROC-AUC of a fitted model on one split."""
    y_pred = model.predict(X)
    return {
        'Precision': precision_score(y, y_pred),
        'Recall': recall_score(y, y_pred),
        'F1': f1_score(y, y_pred),
        'ROC-AUC': roc_auc_score(y, _positive_class_scores(model, X)),
    }


def build_models(X, y, features):
    """Train eight baseline classifiers on original and SMOTE-balanced data.

    The data is split 90/10 into a working set and a holdout set, and the
    working set is split again (test_size=0.2222) into train and test. Each
    model is evaluated twice: fit on the original training split, and fit on
    a SMOTE-oversampled copy of it. Both variants are scored on the training
    data they were fit on, on the test split and on the holdout split, plus a
    5-fold stratified cross-validated F1.

    For the balanced variant the cross-validation wraps SMOTE and the model
    in an imblearn pipeline, so oversampling happens inside each training
    fold only and validation folds contain real rows only. Running CV on
    already-oversampled data would leak synthetic neighbours of validation
    rows into training and inflate the score.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Binary target.
    features : list of str
        Feature names; accepted for interface compatibility, not used here.

    Returns
    -------
    tuple
        ``(results_df, best_model_row, X_train, X_test, y_train, y_test,
        X_train_bal, y_train_bal, X_holdout, y_holdout)`` where
        ``best_model_row`` is the row of ``results_df`` with the highest
        'Test F1'.
    """
    # imblearn is already a dependency of this file (SMOTE); its Pipeline
    # applies samplers during fit only, never during predict/score.
    from imblearn.pipeline import Pipeline as ImbPipeline

    # Create holdout set
    X_temp, X_holdout, y_temp, y_holdout = train_test_split(X, y, test_size=0.1, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X_temp, y_temp, test_size=0.2222, random_state=42)

    # Try SMOTE with sampling_strategy=0.8, fallback to 1.0 if it fails
    smote_strategy = 0.8
    try:
        smote = SMOTE(random_state=42, sampling_strategy=smote_strategy)
        X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)
    except ValueError as e:
        print(f"SMOTE failed with sampling_strategy=0.8: {e}")
        print("Retrying with sampling_strategy=1.0")
        smote_strategy = 1.0
        smote = SMOTE(random_state=42, sampling_strategy=smote_strategy)
        X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

    models = {
        'Logistic Regression': LogisticRegression(random_state=42),
        'Naive Bayes': GaussianNB(),
        'Decision Tree': DecisionTreeClassifier(random_state=42),
        'Random Forest': RandomForestClassifier(random_state=42),
        'AdaBoost': AdaBoostClassifier(random_state=42),
        'XGBoost': XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
        'SVM': SVC(probability=True, random_state=42),
        'KNN': KNeighborsClassifier()
    }

    # A fixed random_state makes every split() call yield the same folds, so
    # one splitter can be shared by all models and both variants.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    results = []

    for name, model in models.items():
        balanced_cv_estimator = ImbPipeline([
            ('smote', SMOTE(random_state=42, sampling_strategy=smote_strategy)),
            ('model', model),
        ])
        # (label, data the final model is fit on, estimator handed to CV)
        variants = (
            ('Original', X_train, y_train, model),
            ('Balanced', X_train_bal, y_train_bal, balanced_cv_estimator),
        )
        for data_label, X_fit, y_fit, cv_estimator in variants:
            # cross_val_score clones the estimator, so `model` stays unfitted
            # here and CV always runs on the real training rows.
            cv_scores = cross_val_score(cv_estimator, X_train, y_train, cv=cv, scoring='f1')

            model.fit(X_fit, y_fit)
            train_metrics = _split_metrics(model, X_fit, y_fit)
            test_metrics = _split_metrics(model, X_test, y_test)
            holdout_metrics = _split_metrics(model, X_holdout, y_holdout)

            # Flag a train-vs-test F1 gap above 0.1 as overfitting.
            overfitting = 'Yes' if (train_metrics['F1'] - test_metrics['F1'] > 0.1) else 'No'

            row = {'Model': name, 'Data': data_label}
            row.update({f'Train {metric}': value for metric, value in train_metrics.items()})
            row.update({f'Test {metric}': value for metric, value in test_metrics.items()})
            row.update({f'Holdout {metric}': value for metric, value in holdout_metrics.items()})
            row['CV F1 Mean'] = cv_scores.mean()
            row['CV F1 Std'] = cv_scores.std()
            row['Overfitting'] = overfitting
            results.append(row)

    results_df = pd.DataFrame(results)
    print("\nModel Performance Summary:")
    print(results_df)

    best_model_row = results_df.loc[results_df['Test F1'].idxmax()]
    print(f"\nBest Model: {best_model_row['Model']} ({best_model_row['Data']}) with Test F1: {best_model_row['Test F1']:.4f}, Holdout F1: {best_model_row['Holdout F1']:.4f}")

    return results_df, best_model_row, X_train, X_test, y_train, y_test, X_train_bal, y_train_bal, X_holdout, y_holdout

# Model Tuning
def tune_model(best_model_row, X_train, X_test, y_train, y_test, X_train_bal, y_train_bal, X_holdout, y_holdout, scoring='recall'):
    """Grid-search the selected model, report its scores and persist it.

    The estimator named in ``best_model_row['Model']`` is tuned with 5-fold
    stratified ``GridSearchCV`` on the balanced training data when
    ``best_model_row['Data'] == 'Balanced'`` and on the original training
    data otherwise. Models with an empty grid (Naive Bayes) are simply fit.
    Test and holdout metrics are printed, and the model, the module-level
    ``preprocessor`` and the module-level ``le`` are written to
    'best_model.pkl', 'preprocessor.pkl' and 'label_encoder.pkl' in the
    current working directory.

    NOTE(review): when ``X_train_bal`` was oversampled before this call, the
    grid-search folds contain synthetic rows in their validation parts, so
    the cross-validated scores used to pick parameters are optimistic.

    Parameters
    ----------
    best_model_row : mapping
        Must provide 'Model' (one of the names in the grids below) and 'Data'
        ('Balanced' or anything else for the original data).
    X_train, y_train : array-like
        Original training split.
    X_test, y_test : array-like
        Test split used for reporting.
    X_train_bal, y_train_bal : array-like
        Balanced training split.
    X_holdout, y_holdout : array-like
        Holdout split used for reporting.
    scoring : str, optional
        Metric optimised by the grid search. Defaults to 'recall', the value
        this function has always used; pass 'f1' to tune for the same metric
        the best model is selected by.

    Returns
    -------
    estimator
        The fitted (tuned) model.

    Raises
    ------
    NameError
        If ``preprocessor`` or ``le`` has not been created yet.
    KeyError
        If the model name is not one of the known models.
    """
    # Fail before the expensive search rather than after it: the artefacts
    # saved at the end are globals created by feature_engineering().
    missing_state = [name for name in ('preprocessor', 'le') if name not in globals()]
    if missing_state:
        raise NameError(
            f"name(s) {', '.join(missing_state)} not defined; "
            "run feature_engineering() before tune_model()."
        )

    model_name = best_model_row['Model']
    data_type = best_model_row['Data']

    param_grid = {
        'Logistic Regression': {'C': [0.1, 1, 10], 'solver': ['liblinear', 'lbfgs']},
        'Naive Bayes': {},
        'Decision Tree': {'max_depth': [5, 10, None], 'min_samples_split': [2, 5, 10]},
        'Random Forest': {'n_estimators': [100, 200], 'max_depth': [10, 20, None]},
        'AdaBoost': {'n_estimators': [50, 100], 'learning_rate': [0.1, 1.0]},
        'XGBoost': {
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.05, 0.1, 0.3]
        },
        'SVM': {'C': [0.1, 1, 10], 'kernel': ['rbf', 'linear']},
        'KNN': {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']}
    }

    # Factories instead of instances: only the selected estimator is built.
    model_factories = {
        'Logistic Regression': lambda: LogisticRegression(random_state=42),
        'Naive Bayes': lambda: GaussianNB(),
        'Decision Tree': lambda: DecisionTreeClassifier(random_state=42),
        'Random Forest': lambda: RandomForestClassifier(random_state=42),
        'AdaBoost': lambda: AdaBoostClassifier(random_state=42),
        'XGBoost': lambda: XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
        'SVM': lambda: SVC(probability=True, random_state=42),
        'KNN': lambda: KNeighborsClassifier()
    }

    model = model_factories[model_name]()

    # Train on the same variant of the data the model was selected on.
    if data_type == 'Balanced':
        X_fit, y_fit = X_train_bal, y_train_bal
    else:
        X_fit, y_fit = X_train, y_train

    grid = param_grid[model_name]
    if grid:
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        grid_search = GridSearchCV(model, grid, cv=cv, scoring=scoring, n_jobs=-1)
        grid_search.fit(X_fit, y_fit)

        print(f"\nBest Parameters for {model_name}: {grid_search.best_params_}")
        # With the default refit=True, best_estimator_ has already been refit
        # on all of X_fit with the winning parameters; no second fit needed.
        model = grid_search.best_estimator_
    else:
        model.fit(X_fit, y_fit)

    print("\nTuned Model Performance:")
    for split_label, X_eval, y_eval in (('Test', X_test, y_test), ('Holdout', X_holdout, y_holdout)):
        y_pred = model.predict(X_eval)
        scores = model.predict_proba(X_eval)[:, 1] if hasattr(model, 'predict_proba') else model.decision_function(X_eval)
        print(f"{split_label} Precision: {precision_score(y_eval, y_pred):.4f}")
        print(f"{split_label} Recall: {recall_score(y_eval, y_pred):.4f}")
        print(f"{split_label} F1: {f1_score(y_eval, y_pred):.4f}")
        print(f"{split_label} ROC-AUC: {roc_auc_score(y_eval, scores):.4f}")

    # Save model, preprocessor, and label encoder
    joblib.dump(model, 'best_model.pkl')
    joblib.dump(preprocessor, 'preprocessor.pkl')
    joblib.dump(le, 'label_encoder.pkl')
    print("Saved model as 'best_model.pkl', preprocessor as 'preprocessor.pkl', and label encoder as 'label_encoder.pkl'")

    return model

# Main Execution
if __name__ == "__main__":
    # End-to-end run: load -> clean -> engineer features -> compare models ->
    # tune and save the best one. Intermediate results stay at module level.
    try:
        data = load_data()
        data = clean_data(data)
        X, y, features = feature_engineering(data)
        (results_df, best_model_row,
         X_train, X_test, y_train, y_test,
         X_train_bal, y_train_bal,
         X_holdout, y_holdout) = build_models(X, y, features)
        tuned_model = tune_model(
            best_model_row,
            X_train, X_test, y_train, y_test,
            X_train_bal, y_train_bal,
            X_holdout, y_holdout,
        )

        print("\nKey Insights:")
        print("- Model trained and saved successfully.")
        print(f"- Best Model: {best_model_row['Model']} (Test F1: {best_model_row['Test F1']:.4f}, Holdout F1: {best_model_row['Holdout F1']:.4f}).")

    except FileNotFoundError as e:
        # A missing dataset is an expected condition; the message is enough.
        print(e)
    except Exception as e:
        import traceback

        print(f"An error occurred: {str(e)}")
        # The message alone does not say which stage failed; keep the
        # traceback so the failure can be located.
        traceback.print_exc()

Missing Values Percentage:
show_id          0.000000
type             0.000000
title            0.000000
director        29.908028
cast             9.367549
country          9.435676
date_added       0.113546
release_year     0.000000
rating           0.045418
duration         0.034064
listed_in        0.000000
description      0.000000
dtype: float64

Initial rows: 8807

Correcting erroneous rating '66 min'

Correcting erroneous rating '74 min'

Correcting erroneous rating '84 min'

Excluding exclusive ratings: ['G', 'NC-17', 'PG', 'PG-13', 'UR']
Rows after excluding ratings: 7983

Checking duration column for non-string or missing values:
Rows with invalid duration (NaN or non-string): 0

Rows with unrealistic movie durations (<15 min or >300 min): 14
Unrealistic movie durations:
     show_id   type duration  movie_duration_min
71       s72  Movie   13 min                13.0
694     s695  Movie   13 min                13.0
695     s696  Movie   12 min                12.0
1425   s142

  File "C:\Users\arpit\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\arpit\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\arpit\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\arpit\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^



Model Performance Summary:
                  Model      Data  Train Precision  Train Recall  Train F1  \
0   Logistic Regression  Original         0.666009      0.542872  0.598169   
1   Logistic Regression  Balanced         0.681118      0.715587  0.697927   
2           Naive Bayes  Original         0.423236      0.989818  0.592937   
3           Naive Bayes  Balanced         0.540947      0.982794  0.697808   
4         Decision Tree  Original         0.988466      0.918542  0.952222   
5         Decision Tree  Balanced         0.989825      0.951754  0.970416   
6         Random Forest  Original         0.961329      0.945874  0.953539   
7         Random Forest  Balanced         0.971622      0.970310  0.970966   
8              AdaBoost  Original         0.663179      0.543408  0.597349   
9              AdaBoost  Balanced         0.717594      0.752699  0.734727   
10              XGBoost  Original         0.884811      0.839764  0.861699   
11              XGBoost  Balanced   