NIM  : 23.21.1572
NAMA : Choirul Affan Adi Putra

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
# Load Dataset
# Read the Netflix titles CSV (path is relative to the working directory)
# into the DataFrame that the later cells operate on.
csv_path = "netflix_titles.csv"
df = pd.read_csv(csv_path)

In [4]:
# Profile Dataset
# Quick structural overview of the raw frame: schema and dtypes, the first
# rows, and the number of missing values in each column.
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df.info()
print("\nDataset Head:")
print(df.head())
print("\nMissing Values:")
print(df.isnull().sum())

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB
None

Dataset Head:
  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              

In [5]:
# Data Preprocessing
# Missing values: every NaN cell, in whichever column it occurs, is replaced
# by one shared placeholder string. The frame is modified in place.
missing_placeholder = "Unknown"
df.fillna(value=missing_placeholder, inplace=True)

In [6]:
# Encoding Categorical Variables
# Each listed column is replaced by the integer codes produced by its own
# LabelEncoder. The fitted encoders are kept in `encoders`, keyed by column
# name, so codes can be mapped back to the original labels later, e.g.
# `encoders['type'].classes_` lists the labels behind the target codes.
# (A single encoder re-fitted for every column would keep only the mapping of
# the last column and make all the others unrecoverable.)
# NOTE(review): the encoders are fitted on the full frame, and the codes give
# nominal values an arbitrary numeric order - confirm this is acceptable for
# the feature scoring and model used in the later cells.
encoded_columns = ['type', 'rating', 'listed_in', 'director', 'cast']
encoders = {}
for column in encoded_columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    encoders[column] = le
# After the loop `le` stays bound to the encoder of the last column ('cast').

# Keep only the encoded columns; 'type' is the target in the later cells.
df = df[encoded_columns]

In [7]:
# Feature Selection
# Separate the predictors from the target column.
X = df.drop(columns=['type'])
y = df['type']

# Fit the selector on the training rows only, so that labels of the held-out
# test rows do not influence which features are kept (avoids data leakage).
# train_test_split picks rows from the row count, test_size and random_state
# alone, so using the same test_size/random_state as the Modeling cell yields
# exactly the rows that cell trains on. Keep the two calls in sync.
X_fit, X_heldout, y_fit, y_heldout = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Score each feature against the target with chi-squared and keep the best 3.
# NOTE(review): chi2 is documented for non-negative features such as counts or
# booleans; the inputs here are label-encoded integer codes - confirm the
# resulting ranking is meaningful.
selector = SelectKBest(score_func=chi2, k=3)
selector.fit(X_fit, y_fit)

# Reduce the full feature matrix (all rows) to the selected columns; the
# Modeling cell splits this array into train and test parts.
X_new = selector.transform(X)
selected_features = X.columns[selector.get_support()]
print("Selected Features:", selected_features)

Selected Features: Index(['listed_in', 'director', 'cast'], dtype='object')


In [8]:
# Modeling
# Hold out 20% of the rows for testing, fit a 100-tree random forest on the
# remaining rows, and predict labels for the held-out rows. Both the split
# and the forest are seeded with 42.
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y,
    test_size=0.2,
    random_state=42,
)
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

In [9]:
# Evaluation
# Compare the predictions with the held-out labels: compute overall accuracy,
# the per-class precision/recall/F1 report and the confusion matrix first,
# then print them in that order.
accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report_text)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.9846765039727582
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1214
           1       0.99      0.96      0.98       548

    accuracy                           0.98      1762
   macro avg       0.99      0.98      0.98      1762
weighted avg       0.98      0.98      0.98      1762

Confusion Matrix:
[[1208    6]
 [  21  527]]
