In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load datasets
train_data = pd.read_csv("C:/GitHub/Machine-Learning/INF1279H/train_data.csv")
test_data = pd.read_csv("C:/GitHub/Machine-Learning/INF1279H/test_data.csv")

# Merge both datasets
combined_data = pd.concat([train_data, test_data], ignore_index=True)

# Ensure no missing values in the Rating column
combined_data = combined_data.dropna(subset=["Rating"])

# Define storage for new train and test datasets
train_balanced = pd.DataFrame()
test_balanced = pd.DataFrame()

# Define limits
TEST_RATIO = 0.2  # Ensure at least 20% of each rating remains in test dataset

# Process each rating category separately
for rating, group in combined_data.groupby("Rating"):
    # Ensure at least 20% of records are in test dataset
    test_size = max(int(len(group) * TEST_RATIO), 1)
    test_subset = group.sample(n=test_size, random_state=42)
    test_balanced = pd.concat([test_balanced, test_subset])

    # Remaining records (not in test set)
    remaining = group.drop(test_subset.index)

    # Find the minimum number of records across all ratings in train
    min_train_samples = min(len(remaining) for _, remaining in combined_data.groupby("Rating"))

    # Limit training samples for each rating to at most 3 times the minimum count
    train_size = min(len(remaining), min_train_samples * 3)
    
    # Select training subset
    train_subset = remaining.sample(n=train_size, random_state=42)
    train_balanced = pd.concat([train_balanced, train_subset])

    # Move any leftover records to test dataset
    leftover_records = remaining.drop(train_subset.index)
    test_balanced = pd.concat([test_balanced, leftover_records])

# Save new train/test datasets
train_balanced.to_csv("C:/GitHub/Machine-Learning/INF1279H/new_train_data.csv", index=False)
test_balanced.to_csv("C:/GitHub/Machine-Learning/INF1279H/new_test_data.csv", index=False)

# Print distribution check
print("New Train Dataset Rating Distribution:\n", train_balanced["Rating"].value_counts())
print("\nNew Test Dataset Rating Distribution:\n", test_balanced["Rating"].value_counts())

New Train Dataset Rating Distribution:
 Rating
5    2526
4    2526
3    2297
2    1252
1     674
Name: count, dtype: int64

New Test Dataset Rating Distribution:
 Rating
5    10605
4     2551
3      574
2      313
1      168
Name: count, dtype: int64
