In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [2]:
# Load the dataset
data = pd.read_csv('Netflix_Movies_and_TV_Shows.csv')
print   (data)

           Title     Type     Genre  Release Year Rating   Duration  \
0        Title 1  TV Show    Comedy          1955     PG  3 Seasons   
1        Title 2  TV Show    Horror          2020      G  3 Seasons   
2        Title 3  TV Show    Action          1966  TV-PG    140 min   
3        Title 4    Movie  Thriller          2011  PG-13  3 Seasons   
4        Title 5  TV Show   Romance          1959  TV-14    172 min   
...          ...      ...       ...           ...    ...        ...   
2995  Title 2996    Movie  Thriller          2007  TV-PG     75 min   
2996  Title 2997    Movie     Drama          2019      G  2 Seasons   
2997  Title 2998  TV Show    Action          1993      R  3 Seasons   
2998  Title 2999    Movie     Drama          1966  PG-13  1 Seasons   
2999  Title 3000  TV Show  Thriller          2015     PG  2 Seasons   

            Country  
0             Japan  
1             India  
2     United States  
3            Canada  
4             India  
...            

In [3]:
# Feature and Target Separation
# Dropping 'Title' as it's non-numeric and using 'Type' as the target variable
X = data.drop(columns=['Type', 'Title'])
y = data['Type']


In [4]:
# Handle missing values
X.fillna(0, inplace=True)  # Replace NaNs with zeros or use another appropriate strategy

# Ensure all columns in X are numeric
X = X.apply(pd.to_numeric, errors='coerce')

In [5]:
# Check for and handle infinite values
X.replace([np.inf, -np.inf], 0, inplace=True)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [6]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)


In [7]:
# Bat Algorithm Parameters
num_bats = 10
max_iter = 10
loudness = 0.5
pulse_rate = 0.5
frequency_min, frequency_max = 0, 2

# Fitness Function with Additional Metrics
def fitness_function_with_metrics(solution):
    selected_features = np.where(solution > 0.5)[0]
    if len(selected_features) == 0:
        return 0, 0, 0, 0  # Return 0 for accuracy, precision, recall, and F1 if no features are selected

    # Train model on selected features
    clf = RandomForestClassifier(random_state=42, class_weight='balanced')
    clf.fit(X_train[:, selected_features], y_train)
    preds = clf.predict(X_test[:, selected_features])

    # Calculate metrics
    accuracy = accuracy_score(y_test, preds)
    precision = precision_score(y_test, preds, pos_label="Movie", zero_division=0)
    recall = recall_score(y_test, preds, pos_label="Movie", zero_division=0)
    f1 = f1_score(y_test, preds, pos_label="Movie", zero_division=0)

    return accuracy, precision, recall, f1


In [8]:
# Initialize bats
positions = np.random.rand(num_bats, X.shape[1])  # Random positions in [0,1]
velocities = np.random.uniform(-1, 1, (num_bats, X.shape[1]))  # Random velocities
best_global = positions[np.random.randint(0, num_bats)]  # Randomly select a bat
best_global_accuracy, best_global_precision, best_global_recall, best_global_f1 = fitness_function_with_metrics(best_global)


In [9]:
# Bat Algorithm
for t in range(max_iter):
    for i in range(num_bats):
        # Calculate frequency and update velocity and position
        frequency = frequency_min + (frequency_max - frequency_min) * np.random.rand()
        velocities[i] += (positions[i] - best_global) * frequency
        positions[i] = np.clip(positions[i] + velocities[i], 0, 1)  # Ensure positions stay in range

        # Local search
        if np.random.rand() > pulse_rate:
            positions[i] = np.clip(best_global + np.random.normal(0, 0.1, size=X.shape[1]), 0, 1)

        # Evaluate fitness
        accuracy, precision, recall, f1 = fitness_function_with_metrics(positions[i])
        if accuracy > best_global_accuracy and np.random.rand() < loudness:
            best_global = positions[i]
            best_global_accuracy, best_global_precision, best_global_recall, best_global_f1 = accuracy, precision, recall, f1

    # Update loudness and pulse rate
    loudness = max(0.1, loudness * 0.95)
    pulse_rate = min(1.0, pulse_rate * 1.05)

    # Print progress
    print(f"Iteration {t + 1}:")
    print(f"   Best Accuracy = {best_global_accuracy:.4f}")
    print(f"   Best Precision = {best_global_precision:.4f}")
    print(f"   Best Recall = {best_global_recall:.4f}")
    print(f"   Best F1 Score = {best_global_f1:.4f}")
    selected_features = np.where(best_global > 0.5)[0]
    print("   Selected Features:", X.columns[selected_features].tolist())
    print()


Iteration 1:
   Best Accuracy = 0.4989
   Best Precision = 0.4797
   Best Recall = 0.4631
   Best F1 Score = 0.4713
   Selected Features: ['Genre', 'Release Year', 'Rating', 'Country']

Iteration 2:
   Best Accuracy = 0.4989
   Best Precision = 0.4797
   Best Recall = 0.4631
   Best F1 Score = 0.4713
   Selected Features: ['Genre', 'Release Year', 'Rating', 'Country']

Iteration 3:
   Best Accuracy = 0.4989
   Best Precision = 0.4797
   Best Recall = 0.4631
   Best F1 Score = 0.4713
   Selected Features: ['Genre', 'Release Year', 'Rating', 'Country']

Iteration 4:
   Best Accuracy = 0.4989
   Best Precision = 0.4797
   Best Recall = 0.4631
   Best F1 Score = 0.4713
   Selected Features: ['Genre', 'Release Year', 'Rating', 'Country']

Iteration 5:
   Best Accuracy = 0.4989
   Best Precision = 0.4797
   Best Recall = 0.4631
   Best F1 Score = 0.4713
   Selected Features: ['Genre', 'Release Year', 'Rating', 'Country']

Iteration 6:
   Best Accuracy = 0.4989
   Best Precision = 0.4797
   B

In [10]:
# Final Results
print("Final Results:")
print(f"Best Accuracy: {best_global_accuracy:.4f}")
print(f"Best Precision: {best_global_precision:.4f}")
print(f"Best Recall: {best_global_recall:.4f}")
print(f"Best F1 Score: {best_global_f1:.4f}")
print("Selected Features:", X.columns[selected_features].tolist())


Final Results:
Best Accuracy: 0.4989
Best Precision: 0.4797
Best Recall: 0.4631
Best F1 Score: 0.4713
Selected Features: ['Release Year', 'Rating', 'Country']
