In [1]:
# NOTE: the wildcard import is kept first on purpose so that the explicit
# imports below take precedence over any same-named symbol it may export.
from artifactremoval.premodelprocessing import *

import datetime as dt
import logging
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GroupShuffleSplit

In [2]:
## CHANGE THESE DIRECTORIES WHEN THE TIME COMES
# Every data path is derived from `base_dir`, which is the parent of the
# current working directory (i.e. the notebook is expected to be run from a
# sub-folder of the project root).

base_dir = Path.cwd().parent
input_dir = base_dir / "data" / "ratings" / "individual_csv"
output_dir = base_dir / "data" / "ratings" / "aggregate_data"

input_spectral_data_dir = base_dir / "data" / "processed" / '20250411_202222'
input_spectral_file = input_spectral_data_dir / "unique_ids_group_25_20250411_202222_raters.pkl"

output_aggregated_csv_file = output_dir / "aggregated_spectral_ratings.csv"

# The log file is written relative to the current working directory and is
# truncated (mode="w") every time the handler is created.
log_file = Path("11v4_premodelprocessing.log")

# NOTE: logging.basicConfig() does nothing if the root logger already has
# handlers (e.g. this cell was already run in the same kernel).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        # Explicit UTF-8: messages logged by this notebook contain non-ASCII
        # characters (ellipsis, curly quotes) that a locale-default encoding
        # may not be able to represent.
        logging.FileHandler(str(log_file), mode="w", encoding="utf-8"),
        logging.StreamHandler()
    ]
)

In [3]:
# --------------------------------------
# Step 1: Load the individual rater CSVs and normalise their labels
# --------------------------------------
# Sorted so that the concatenation order below does not depend on the order
# in which the filesystem happens to enumerate the directory.
csv_files = sorted(str(file) for file in input_dir.glob("*.csv"))

logging.info(f"CSV files found: {csv_files}")

if not csv_files:
    # Fail early with a clear message; pd.concat() on an empty list would
    # otherwise raise "No objects to concatenate".
    raise FileNotFoundError(f"No rater CSV files found in '{input_dir}'")

logging.info("Cleaning 'Poor Quality' labels in individual CSVs…")

# Each CSV is read exactly once; the (possibly recoded) frame is kept for the
# concatenation instead of re-reading the file from disk afterwards.
df_list = []
for file in csv_files:
    df = pd.read_csv(file)
    # Count how many 'Poor Quality' entries
    num_poor = (df['rating'] == "Poor Quality").sum()
    if num_poor > 0:
        logging.warning(f"{num_poor} 'Poor Quality' labels found in {file}; recoding to 'Unacceptable'")
        df['rating'] = df['rating'].replace("Poor Quality", "Unacceptable")
        # Save back to the same CSV.
        # NOTE(review): this overwrites the rater's original file in place.
        df.to_csv(file, index=False)
        logging.info(f"Saved cleaned CSV: {file}")
    else:
        logging.info(f"No 'Poor Quality' labels in {file}")
    df_list.append(df)

# Concatenate all rater frames into one DataFrame.
all_ratings_df = pd.concat(df_list, ignore_index=True)
logging.info("Combined CSV file preview:")
logging.info("\n" + all_ratings_df.head().to_string())

# --------------------------------------
# Step 2: Aggregate Ratings
# --------------------------------------
# One group per 'unique_id'; `aggregate_ratings` is a project helper that
# reduces each group (its exact output columns are defined in that helper).
aggregated_df = all_ratings_df.groupby('unique_id')\
    .apply(aggregate_ratings)\
    .reset_index()
logging.info("Aggregated Ratings (first few rows):")
logging.info("\n" + aggregated_df.head().to_string())

# --------------------------------------
# Step 2.1: Recode any 'Poor Quality' → 'Unacceptable' in consensus
# --------------------------------------
# Safety net: the individual ratings were already recoded above, so this is
# only expected to trigger if the aggregation itself emits 'Poor Quality'.
num_poor = (aggregated_df['consensus'] == "Poor Quality").sum()
if num_poor > 0:
    logging.warning(f"Found {num_poor} ‘Poor Quality’ consensus labels; recoding to 'Unacceptable'")
    aggregated_df['consensus'] = aggregated_df['consensus'].replace("Poor Quality", "Unacceptable")
else:
    logging.info("No 'Poor Quality' consensus labels found.")

# --------------------------------------
# Step 2.2: Export the cleaned, aggregated data
# --------------------------------------
# Make sure the destination exists so a fresh checkout does not fail here.
output_dir.mkdir(parents=True, exist_ok=True)
aggregated_df.to_csv(output_aggregated_csv_file, index=False)
logging.info(f"Aggregated spectral ratings saved to '{output_aggregated_csv_file}'")

# --------------------------------------
# Step 3: Verification of the Aggregation
# --------------------------------------
# `verify_aggregates` is a project helper; an empty/falsy result is treated
# as "no inconsistencies", otherwise each returned item is logged as an error.
inconsistencies = verify_aggregates(aggregated_df, all_ratings_df)
if not inconsistencies:
    logging.info("Verification succeeded: All aggregated ratings match the individual CSV data.")
else:
    logging.error("Inconsistencies found:")
    for inc in inconsistencies:
        logging.error(inc)

2025-04-23 16:40:04,879 - INFO - CSV files found: ['x:\\ArtifactRemovalProject\\data\\ratings\\individual_csv\\unique_ids_group_25_20250411_202222_raters_completed_Maudsley.csv', 'x:\\ArtifactRemovalProject\\data\\ratings\\individual_csv\\unique_ids_group_25_20250411_202222_raters_completed_Mellon.csv', 'x:\\ArtifactRemovalProject\\data\\ratings\\individual_csv\\unique_ids_group_25_20250411_202222_raters_completed_Poptani.csv', 'x:\\ArtifactRemovalProject\\data\\ratings\\individual_csv\\unique_ids_group_25_20250411_202222_raters_completed_Sheriff.csv', 'x:\\ArtifactRemovalProject\\data\\ratings\\individual_csv\\unique_ids_group_25_20250411_202222_raters_completed_Shim.csv', 'x:\\ArtifactRemovalProject\\data\\ratings\\individual_csv\\unique_ids_group_25_20250411_202222_raters_completed_Shu.csv', 'x:\\ArtifactRemovalProject\\data\\ratings\\individual_csv\\unique_ids_group_25_20250411_202222_raters_completed_Soher.csv', 'x:\\ArtifactRemovalProject\\data\\ratings\\individual_csv\\unique_id

In [4]:
# Step 1: Load spectral data.
spectral_data = load_spectral_data(input_spectral_file)
aggregated_csv_file = output_aggregated_csv_file

# Step 2: Load aggregated CSV data.
logging.info(f"Loading aggregated CSV data from: {aggregated_csv_file}")
aggregated_df = pd.read_csv(aggregated_csv_file)
logging.info(f"Aggregated CSV loaded with {len(aggregated_df)} rows.")

# Step 3: Create a consensus mapping from the aggregated CSV.
consensus_mapping = aggregated_df.set_index('unique_id')['consensus'].to_dict()
logging.info("Consensus mapping created successfully.")

# Step 4: Update spectral data with consensus ratings.
spectral_data = update_spectral_data_with_consensus(spectral_data, consensus_mapping)

# Step 5: Verification - Display 10 random entries.
display_random_consensus_entries(spectral_data, aggregated_df, sample_size=10)

# ----- Step 6: Subject-grouped train/test split -------------------
# Split parameters kept together so they are easy to find and tune.
TEST_FRACTION = 0.10   # share of *subjects* (groups) held out for testing
SPLIT_SEED = 42        # fixed seed so the split is reproducible

subject_ids = [extract_subject_id(rec["unique_id"]) for rec in spectral_data]

# ----- 6.1  Build X, y, groups for the splitter -------------------
X  = np.arange(len(spectral_data))                 # indices only
# .get() to match the filter below, which tolerates a missing rating.
# NOTE: GroupShuffleSplit ignores y — the split is NOT stratified by label.
y  = [rec.get("consensus_rating") for rec in spectral_data]
g  = subject_ids                                   # group identifier

# ----- 6.2  Group-wise shuffle split (10 % of subjects → test) ----
# test_size is a proportion of groups, so the share of entries that end up
# in the test set can deviate from TEST_FRACTION.
gss = GroupShuffleSplit(test_size=TEST_FRACTION, n_splits=1, random_state=SPLIT_SEED)
train_idx, test_idx = next(gss.split(X, y, groups=g))

# Guard against subject leakage between the two partitions.
train_subjects = {g[i] for i in train_idx}
test_subjects = {g[i] for i in test_idx}
assert train_subjects.isdisjoint(test_subjects), "A subject appears in both train and test sets"

train_data = [spectral_data[i] for i in train_idx]
test_data  = [spectral_data[i] for i in test_idx]

# Drop entries that never received a consensus rating.
train_data_filtered = [e for e in train_data if e.get("consensus_rating") is not None]
test_data_filtered = [e for e in test_data if e.get("consensus_rating") is not None]

# Report count and percentage against the same (labelled) population; the
# previous message mixed a filtered count with an unfiltered percentage.
n_labelled = len(train_data_filtered) + len(test_data_filtered)
test_share = len(test_data_filtered) / n_labelled if n_labelled else 0.0
logging.info(f"Reserved {len(test_data_filtered)} / {n_labelled} labelled entries "
             f"({test_share:.1%}) for the test set "
             f"({len(spectral_data) - n_labelled} entries without a consensus rating excluded).")

2025-04-23 16:40:12,061 - INFO - Loading spectral data from pickle file: x:\ArtifactRemovalProject\data\processed\20250411_202222\unique_ids_group_25_20250411_202222_raters.pkl
2025-04-23 16:40:12,518 - INFO - Spectral data loaded successfully.
2025-04-23 16:40:12,520 - INFO - Loading aggregated CSV data from: x:\ArtifactRemovalProject\data\ratings\aggregate_data\aggregated_spectral_ratings.csv
2025-04-23 16:40:12,548 - INFO - Aggregated CSV loaded with 5257 rows.
2025-04-23 16:40:12,553 - INFO - Consensus mapping created successfully.
2025-04-23 16:40:12,554 - INFO - Updating spectral data with consensus ratings.
2025-04-23 16:40:12,732 - INFO - Spectral data updated with consensus ratings.
2025-04-23 16:40:12,735 - INFO - Displaying 10 random spectral entries for verification.
2025-04-23 16:40:12,737 - INFO - Unique ID: DOSEESC_UM08_07.11.2018_14_48_36
2025-04-23 16:40:12,738 - INFO - Spectral Data Consensus Rating: Acceptable
2025-04-23 16:40:12,739 - INFO - Aggregated CSV Row:
2025

In [5]:
# Persist the train/test partitions as timestamped pickle files.
# Minute-resolution timestamp: re-running within the same minute overwrites
# the files written by the previous run.
timestamp = dt.datetime.now().strftime("%Y%m%d_%H%M")

# Make sure the destination exists before opening files inside it.
output_dir.mkdir(parents=True, exist_ok=True)

train_pickle_file = output_dir / f"spectral_train_{timestamp}.pkl"
test_pickle_file = output_dir / f"spectral_test_{timestamp}.pkl"

# Write the *filtered* partitions (entries with a consensus rating). The
# unfiltered lists were written before, although the preceding cell reports
# the filtered test count as the reserved test set.
# NOTE(review): confirm no downstream consumer relies on unlabelled entries.
with open(train_pickle_file, "wb") as f:
    pickle.dump(train_data_filtered, f)
logging.info(f"Wrote {len(train_data_filtered)} training entries to '{train_pickle_file}'")

with open(test_pickle_file, "wb") as f:
    pickle.dump(test_data_filtered, f)
logging.info(f"Wrote {len(test_data_filtered)} test entries to '{test_pickle_file}'")

logging.info("Train/test pickles written successfully")


2025-04-23 16:40:15,943 - INFO - Train/test pickles written successfully
