In [1]:
import pandas as pd
from google.colab import drive

# === 🟢 MOUNT GOOGLE DRIVE ===
drive.mount('/content/drive')

# === 🟢 CONFIGURATION ===
INPUT_FILE = "/content/drive/MyDrive/tfm/final/df_eng.csv"
OUTPUT_FILE = "/content/drive/MyDrive/tfm/final/analytics_dataset_14.csv"
MIN_SAMPLES_PER_YEAR = 50  # Ensures no year is underrepresented
MAX_FRACTION = 0.1  # Keep the sample size reasonable for years with many records

# === 🟢 LOAD DATA ===
print("Loading dataset...")
df = pd.read_csv(INPUT_FILE)

# === 🟢 REMOVE ENTRIES BEFORE 1960 ===
df = df[df['year'] >= 1960]

# === 🟢 REMOVE 'misc' GENRE ===
df = df[df['tag'] != 'misc']

# === 🟢 STRATIFIED SAMPLING (ONLY YEAR & GENRE) ===
def smart_sample(group):
    """ Apply a dynamic sampling strategy per year-genre group. """
    sample_size = max(MIN_SAMPLES_PER_YEAR, int(len(group) * MAX_FRACTION))
    return group.sample(n=min(len(group), sample_size), random_state=42)

# Apply the improved sampling method
stratified_df = df.groupby(['year', 'tag'], group_keys=False).apply(smart_sample)

# === 🟢 EXPORT STRATIFIED DATASET ===
stratified_df.to_csv(OUTPUT_FILE, index=False)
print(f"✅ Fixed analytics dataset saved: {OUTPUT_FILE}")


Mounted at /content/drive
Loading dataset...


  stratified_df = df.groupby(['year', 'tag'], group_keys=False).apply(smart_sample)


✅ Fixed analytics dataset saved: /content/drive/MyDrive/tfm/final/analytics_dataset_14.csv
