## Notes

We are likely overfitting here due to low sample size. For now, we keep this example for future reference as the procedure is valid.

### Imports

In [None]:
import logging
import sys
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

sys.path.append(os.path.abspath("../.."))

from scripts.database import get_session, load_data_to_db
from scripts.utils import load_config, setup_logging
from models import NetflixDataset


### Config and Logging Setup

In [2]:
try:
    config = load_config()
    setup_logging(config['paths']['log_path'])
    logging.info("Starting the data analysis project.")
except Exception as e:
    logging.error(f"Failed to load config or setup logging: {e}")
    raise

INFO:root:Starting the data analysis project.
Starting the data analysis project.


### Database Session

In [3]:
try:
    session = get_session()
    logging.info("Database session created successfully.")
except Exception as e:
    logging.error(f"Failed to create database session: {e}")
    raise

INFO:root:Database session created successfully.
Database session created successfully.


### Query netflix Data

In [4]:
try:
    show_data = session.query(NetflixDataset).all()
    data = [show.__dict__ for show in show_data]
    for item in data:
        item.pop('_sa_instance_state', None)
    
    show_df = pd.DataFrame(data)
    print(show_df.head())
except Exception as e:
    logging.error(f"Failed to query show data: {e}")
    raise


                   title     type         director        country  \
0   Dick Johnson Is Dead    Movie  Kirsten Johnson  United States   
1          Blood & Water  TV Show                    South Africa   
2              Ganglands  TV Show  Julien Leclercq                  
3  Jailbirds New Orleans  TV Show                                   
4           Kota Factory  TV Show                           India   

   release_year   duration                                        description  \
0          2020     90 min  As her father nears the end of his life, filmm...   
1          2021  2 Seasons  After crossing paths at a party, a Cape Town t...   
2          2021   1 Season  To protect his family from a powerful drug lor...   
3          2021   1 Season  Feuds, flirtations and toilet talk go down amo...   
4          2021  2 Seasons  In a city of coaching centers known to train I...   

                                                cast show_id  \
0                                 

### Feature Engineering and Mutual Information

In [5]:
X = show_df['text']
y = show_df['type']

# Text vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_vectorized = vectorizer.fit_transform(X)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

# Train classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict
y_pred = clf.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9977298524404086

Classification Report:
               precision    recall  f1-score   support

       Movie       1.00      1.00      1.00      1203
     TV Show       1.00      1.00      1.00       559

    accuracy                           1.00      1762
   macro avg       1.00      1.00      1.00      1762
weighted avg       1.00      1.00      1.00      1762

