In [None]:
import pandas as pd
from sklearn.utils import resample

from google.colab import drive
# Mount Google Drive so the input/output CSV paths below are reachable.
drive.mount('/content/drive')

# === 🟢 CONFIGURATION ===
INPUT_FILE = "/content/drive/MyDrive/tfm/final/df_eng.csv"
OUTPUT_FILE = "/content/drive/MyDrive/tfm/final/predictions_dataset.csv"

MIN_YEAR = 1960              # rows with an earlier 'year' are dropped
EXCLUDED_GENRE = 'misc'      # 'tag' value removed from the dataset
VIEWS_QUANTILE_CAP = 0.99    # rows above this quantile of 'views' are dropped
MAX_ROWS_PER_GENRE = 50000   # upper bound on rows kept per genre (adjustable)
RANDOM_STATE = 42            # seed for the per-genre down-sampling

# === 🟢 LOAD DATA ===
print("Loading dataset...")
df = pd.read_csv(INPUT_FILE)

# === 🟢 REMOVE ENTRIES BEFORE MIN_YEAR ===
# The comparison is False for missing years, so those rows are dropped too.
df = df[df['year'] >= MIN_YEAR]

# === 🟢 REMOVE EXCLUDED GENRE ===
df = df[df['tag'] != EXCLUDED_GENRE]

# === 🟢 REMOVE EXTREME OUTLIERS IN VIEWS ===
# The quantile is computed on the already-filtered frame, not on the raw file.
views_upper_limit = df['views'].quantile(VIEWS_QUANTILE_CAP)
df = df[df['views'] <= views_upper_limit]

# === 🟢 REDUCE GENRE IMBALANCE (CAP LARGE GENRES) ===
# Only genres larger than target_size are down-sampled; smaller genres are
# kept in full, so the result is capped rather than perfectly balanced.
# value_counts() skips missing tags, so rows without a tag are not exported.
genre_counts = df['tag'].value_counts()
target_size = min(genre_counts.max(), MAX_ROWS_PER_GENRE)

# Collect the per-genre frames and concatenate once: growing a DataFrame with
# pd.concat inside the loop re-copies all previous rows on every iteration and
# starts from an empty frame, which pandas has deprecated in concat.
genre_frames = []
for genre in genre_counts.index:
    genre_subset = df[df['tag'] == genre]
    if len(genre_subset) > target_size:
        genre_subset = resample(
            genre_subset,
            replace=False,
            n_samples=target_size,
            random_state=RANDOM_STATE,
        )
    genre_frames.append(genre_subset)

# If nothing survived the filters, keep the column layout so the exported CSV
# still has a header and can be read back.
balanced_df = pd.concat(genre_frames) if genre_frames else df.iloc[0:0]

# === 🟢 EXPORT GENRE-CAPPED DATASET ===
balanced_df.to_csv(OUTPUT_FILE, index=False)
print(f"✅ Predictions dataset saved: {OUTPUT_FILE}")


Mounted at /content/drive
Loading dataset...
✅ Predictions dataset saved: /content/drive/MyDrive/tfm/final/predictions_dataset.csv


In [2]:
import pandas as pd
from google.colab import files

from google.colab import drive
# Settings for the excerpt export.
PREDICTIONS_CSV = "/content/drive/MyDrive/tfm/final/predictions_dataset.csv"
EXCERPT_CSV = 'predictions_excerpt.csv'
EXCERPT_ROWS = 100

# Mount Google Drive so the predictions CSV is reachable from this runtime.
drive.mount('/content/drive')

# Load the full predictions dataset.
df = pd.read_csv(PREDICTIONS_CSV)

# Keep only the leading rows as a small sample.
# (A condition-based excerpt also works, e.g. rows where 'year' > 1990.)
excerpt = df.iloc[:EXCERPT_ROWS]

# Write the sample to the runtime's working directory, then pass it to
# google.colab's files.download.
excerpt.to_csv(EXCERPT_CSV, index=False)
files.download(EXCERPT_CSV)

Mounted at /content/drive


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>