In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling
import matplotlib.pyplot as plt
import cutslib as cl
import cutslib.glitch as gl
from cutslib.visual import array_plots
import matplotlib.cm as cm
from tqdm import tqdm
import logging
import os

def setup_logging(log_file='active_learning.log'):
    """Send INFO-and-above log records to ``log_file``.

    Parameters:
        log_file (str): Path of the log file handed to ``logging.basicConfig``.

    Note:
        ``logging.basicConfig`` does nothing when the root logger already has
        handlers, so an earlier logging configuration in the same process
        takes precedence over this one.
    """
    log_settings = {
        'filename': log_file,
        'level': logging.INFO,
        'format': '%(asctime)s:%(levelname)s:%(message)s',
    }
    logging.basicConfig(**log_settings)

def load_data(df_dir, df_test_name, df_train_name, df_train_high_prob_name):
    """Load the test, train and high-probability train DataFrames from CSV.

    Every name is resolved as ``<df_dir>/<name>.csv``. A name may contain a
    sub-directory (e.g. ``'plots_for_labelling/manifest'``).

    Parameters:
        df_dir (str): Directory containing the CSV files.
        df_test_name (str): Test-set file name, without ``.csv``.
        df_train_name (str): Train-set file name, without ``.csv``.
        df_train_high_prob_name (str): High-probability train-set file name,
            without ``.csv``.

    Returns:
        tuple: ``(df_test, df_train, df_train_high_prob)`` DataFrames.

    Raises:
        Exception: Whatever ``pandas.read_csv`` raises (e.g.
            ``FileNotFoundError``); the error is logged and re-raised.
    """
    def _csv_path(name):
        return os.path.join(df_dir, f'{name}.csv')

    try:
        df_test = pd.read_csv(_csv_path(df_test_name))
        # The train frames are resolved from their names like the test frame,
        # instead of always reading one fixed manifest path.
        df_train = pd.read_csv(_csv_path(df_train_name))
        df_train_high_prob = pd.read_csv(_csv_path(df_train_high_prob_name))
    except Exception as e:
        logging.error(f"Error loading data: {e}")
        raise

    logging.info("Successfully loaded all DataFrames.")
    return df_test, df_train, df_train_high_prob

def initialize_active_learner(df_train, cols_t, n_trees=50, max_depth=15):
    """Build an ``ActiveLearner`` around a random forest seeded with ``df_train``.

    Parameters:
        df_train (DataFrame): Labelled data; must contain ``cols_t`` and a
            ``'Train_Lab'`` column holding the labels.
        cols_t (list): Feature columns used for training.
        n_trees (int): Number of trees in the forest.
        max_depth (int): Maximum depth of each tree.

    Returns:
        tuple: ``(learner, X_start, Y_start)`` - the learner plus the feature
        and label arrays it was initialised with.

    Raises:
        Exception: Any failure is logged and re-raised.
    """
    try:
        seed_features = df_train[cols_t].to_numpy()
        seed_labels = df_train['Train_Lab'].to_numpy()

        forest = RandomForestClassifier(
            n_estimators=n_trees,
            max_depth=max_depth,
            criterion='entropy',
            random_state=1,
            n_jobs=2,
        )
        active_learner = ActiveLearner(
            estimator=forest,
            query_strategy=uncertainty_sampling,
            X_training=seed_features,
            y_training=seed_labels,
        )
    except Exception as e:
        logging.error(f"Error initializing ActiveLearner: {e}")
        raise
    else:
        logging.info("Initialized ActiveLearner.")
        return active_learner, seed_features, seed_labels

def perform_active_learning(learner, df_test, cols, cols_t, n_queries=5):
    """Run the query/teach loop and collect the rows the learner asked for.

    On each iteration the learner picks instance(s) from the remaining pool,
    the matching ``df_test`` rows are recorded, the instances are removed from
    the pool, and the learner is taught with their ``'Train_Lab'`` labels.

    Parameters:
        learner: Object exposing ``query(X) -> (indices, instances)`` and
            ``teach(X=..., y=...)``.
        df_test (DataFrame): Candidate pool; must contain ``cols``, ``cols_t``
            and ``'Train_Lab'``.
        cols (list): Columns copied into the returned ``df_new``.
        cols_t (list): Feature columns shown to the learner.
        n_queries (int): Maximum number of query rounds.

    Returns:
        tuple: ``(df_new, X_add, Y_add)`` - the queried rows (``cols`` only),
        and the ``df_test[cols]`` / ``'Train_Lab'`` arrays with the queried
        rows removed.

    Raises:
        Exception: Any failure is logged and re-raised.
    """
    try:
        X_add = df_test[cols].to_numpy()
        Y_add = df_test['Train_Lab'].to_numpy()
        # Features are taken by name so the result does not depend on `cols`
        # happening to start with `cols_t`.
        pool_features = df_test[cols_t].to_numpy()
        # pool_rows[i] is the position in df_test of pool entry i. Query
        # indices refer to the *shrinking* pool, so they must be translated
        # back before indexing df_test.
        pool_rows = np.arange(len(df_test))
        queried_frames = []

        for _ in range(n_queries):
            if len(pool_rows) == 0:
                logging.warning("Unlabeled pool exhausted before all queries were made.")
                break

            query_idx, _query_instance = learner.query(pool_features)
            # A scalar index is promoted to a length-1 array.
            query_idx = np.atleast_1d(query_idx)

            # Extract the queried instance(s)
            queried_X = pool_features[query_idx]
            queried_y = Y_add[query_idx]
            source_rows = pool_rows[query_idx]

            queried_frames.append(
                df_test.iloc[source_rows][cols].reset_index(drop=True)
            )
            logging.info(f"Appended {len(query_idx)} data point(s) to df_new.")

            # Remove the queried instance(s) from every pool-aligned array
            X_add = np.delete(X_add, query_idx, axis=0)
            Y_add = np.delete(Y_add, query_idx, axis=0)
            pool_features = np.delete(pool_features, query_idx, axis=0)
            pool_rows = np.delete(pool_rows, query_idx)
            logging.info(f"Removed {len(query_idx)} data point(s) from the unlabeled pool.")

            # Teach the learner with the new instance(s)
            learner.teach(X=queried_X, y=queried_y)
            logging.info(f"Taught the learner with {len(query_idx)} new data point(s).")

        if queried_frames:
            df_new = pd.concat(queried_frames, ignore_index=True)
        else:
            df_new = pd.DataFrame()
        return df_new, X_add, Y_add
    except Exception as e:
        logging.error(f"Error during active learning: {e}")
        raise

def select_high_probability_data(df_train_high_prob, df_train, classifier, cols_t, cols, threshold=0.6, num_per_class=5):
    """Pick rows the classifier assigns to some class with high probability.

    Columns listed in ``cols`` that ``df_train_high_prob`` lacks are looked up
    in ``df_train`` via the ``'Start Index'`` key; columns it already has are
    kept as they are. Columns found in neither frame are filled with NaN and a
    warning is logged.

    Parameters:
        df_train_high_prob (DataFrame): Candidate rows.
        df_train (DataFrame): Source for columns missing from the candidates.
        classifier: Fitted estimator exposing ``predict_proba(X)``.
        cols_t (list): Feature columns fed to the classifier (must be in ``cols``).
        cols (list): Columns of the returned frame.
        threshold (float): A row qualifies for a class when its predicted
            probability for that class is strictly greater than this value.
        num_per_class (int): Maximum rows kept per class; when more qualify, a
            random subset is drawn without replacement.

    Returns:
        DataFrame: Selected rows restricted to ``cols``, in original row order.

    Raises:
        Exception: Any failure is logged and re-raised.
    """
    try:
        key = 'Start Index'  # Replace with the actual key if different

        # Only borrow what the candidates are missing. Merging every shared
        # column would rename both copies with suffixes, so the plain column
        # names would vanish and be refilled with NaN.
        borrowed = [
            col for col in cols
            if col != key
            and col not in df_train_high_prob.columns
            and col in df_train.columns
        ]
        if borrowed:
            # One lookup row per key keeps the left merge from multiplying
            # candidate rows. NOTE(review): assumes the key identifies a row
            # in df_train - confirm, or extend the key.
            lookup = df_train[[key] + borrowed].drop_duplicates(subset=key)
            merged_df = df_train_high_prob.merge(lookup, on=key, how='left')
        else:
            merged_df = df_train_high_prob.copy()

        missing_cols = set(cols) - set(merged_df.columns)
        if missing_cols:
            for col in missing_cols:
                merged_df[col] = np.nan  # Assign NaN or appropriate default
            logging.warning(f"Added missing columns with default values: {missing_cols}")

        # Select required columns
        merged_df = merged_df[cols]

        if merged_df.empty:
            # Nothing to score; skip the classifier call on an empty matrix.
            logging.info("Selected 0 high-probability data points.")
            return merged_df

        # Predict class probabilities
        probabilities = classifier.predict_proba(merged_df[cols_t].to_numpy())

        selected = []
        for class_idx in range(probabilities.shape[1]):
            candidates = np.flatnonzero(probabilities[:, class_idx] > threshold)
            if len(candidates) > num_per_class:
                candidates = np.random.choice(candidates, size=num_per_class, replace=False)
            selected.extend(candidates.tolist())

        # De-duplicate and sort so the output order is deterministic.
        selected_indices = sorted(set(selected))
        logging.info(f"Selected {len(selected_indices)} high-probability data points.")

        return merged_df.iloc[selected_indices]
    except Exception as e:
        logging.error(f"Error selecting high-probability data points: {e}")
        raise

def _parse_time_slices(df_temp, tod_name):
    """Return an (n_rows, 2) int array of [start, stop] indices read from df_temp."""
    tslices = np.zeros((len(df_temp), 2), dtype=int)
    pairs = zip(df_temp['Start Index'], df_temp['Stop Index'])
    for i, (start, stop) in enumerate(pairs):
        try:
            tslices[i] = [int(start), int(stop)]
        except Exception as e:
            logging.error(f"Error parsing time slices for TOD {tod_name}, row {i}: {e}")
            tslices[i] = [0, 0]  # Assign default values or handle appropriately
    return tslices


def generate_images(df_new, cols, outdir):
    """Generate and save images for each row of ``df_new`` and build the manifest.

    Rows are grouped by their ``'TOD'`` value. For each TOD the data is loaded
    and transformed through ``cutslib``, snippets are extracted for the rows'
    ``[Start Index, Stop Index]`` slices, and two PNGs per snippet (a focal
    plane image and a time stream image) are written to ``outdir``. TODs that
    cannot be loaded or processed are logged and skipped.

    Parameters:
        df_new (DataFrame): Rows to render; must contain ``'TOD'``,
            ``'Start Index'``, ``'Stop Index'`` and every column in ``cols``.
        cols (list): Columns carried into the manifest next to the two image
            name columns.
        outdir (str): Directory the images are written to; created if missing.

    Returns:
        DataFrame: Columns ``['Focal Plane Image', 'Time Stream Image'] + cols``
        for the rows of every TOD that was processed. Empty (with those
        columns) when no TOD was processed. A row keeps the placeholder
        ``'temp'`` as image name if no snippet was enumerated for it.

    Raises:
        ValueError: If required columns, or (for non-empty input) any of
            ``cols``, are missing from ``df_new``.
    """
    # Ensure 'TOD' and other necessary columns are present in df_new
    required_columns = ['TOD', 'Start Index', 'Stop Index']
    missing_required = set(required_columns) - set(df_new.columns)
    if missing_required:
        raise ValueError(f"Missing required columns in df_new: {missing_required}")

    # Fail before any expensive TOD loading rather than with a KeyError after
    # the first TOD's images have already been rendered.
    missing_manifest = [col for col in cols if col not in df_new.columns]
    if missing_manifest and not df_new.empty:
        raise ValueError(f"Missing manifest columns in df_new: {missing_manifest}")

    manifest_columns = ['Focal Plane Image', 'Time Stream Image'] + cols

    # Saving figures fails if the target directory does not exist yet.
    if outdir:
        os.makedirs(outdir, exist_ok=True)

    tod_names = df_new['TOD'].unique()
    logging.info(f"Found {len(tod_names)} unique TODs to process.")

    depot = '/home/yilun/shared/depots/yilun/'
    df_tosave_all = []

    # Iterating over tqdm advances the bar on every path, including `continue`.
    for tod_name in tqdm(tod_names, desc="Generating Images"):
        df_temp = df_new[df_new['TOD'] == tod_name].reset_index(drop=True)
        df_temp['Focal Plane Image'] = 'temp'
        df_temp['Time Stream Image'] = 'temp'

        tod_file_path = os.path.join(depot, tod_name)
        try:
            # Check if the TOD file exists
            if not os.path.isfile(tod_file_path):
                raise FileNotFoundError(f"TOD file not found: {tod_file_path}")

            # Load the TOD
            tod = cl.load_tod(
                tod_name,
                depot=depot,
                autoloads=['cuts', 'partial', 'cal'],
                release='20230220'
            )
            cl.quick_transform(tod, steps=['demean', 'detrend', 'ff_mce', 'cal', 'f_glitch'])
            logging.info(f"Loaded and transformed TOD: {tod_name}")
        except FileNotFoundError as fnf_error:
            # Skip image generation for the missing TOD and continue
            logging.error(fnf_error)
            continue
        except Exception as e:
            logging.error(f"Failed to load or transform TOD {tod_name}: {e}")
            continue

        # Get detectors that pass det cuts
        dets = tod.cuts.get_uncut()

        # Create a mask for uncut detectors
        det_mask = np.zeros(len(tod.det_uid), dtype=bool)
        det_mask[dets] = True

        # Collapse all partial cuts to see how many detectors are cut at each
        # time. The result is not used below; a failure here skips the TOD.
        try:
            n_affected = np.sum([tod.pcuts.cuts[det].get_mask() for det in dets], axis=0)
        except Exception as e:
            logging.error(f"Error calculating n_affected for TOD {tod_name}: {e}")
            continue

        tslices = _parse_time_slices(df_temp, tod_name)

        # Generate snippets
        try:
            snippets = gl.affected_snippets_from_cv(tod, tod.pcuts, tslices, det_mask)
        except Exception as e:
            logging.error(f"Error generating snippets for TOD {tod_name}: {e}")
            continue

        # NOTE(review): snippet i is paired with row i / tslices[i]; this
        # assumes one snippet per requested slice, in order - confirm.
        for i, snip in enumerate(snippets):
            try:
                # Assign image names
                focal_image_name = f'TOD_{tod_name}_entire_array_tslice_{tslices[i][0]}-{tslices[i][1]}.png'
                timestr_image_name = f'TOD_{tod_name}_snipTOD_tslice_{tslices[i][0]}-{tslices[i][1]}.png'
                df_temp.at[i, 'Focal Plane Image'] = focal_image_name
                df_temp.at[i, 'Time Stream Image'] = timestr_image_name

                _dets_affected = np.zeros(len(tod.det_uid), dtype=bool)
                _dets_affected[snip.det_uid] = True
                _dets_affected = _dets_affected.astype(int)
                # Select detectors whose 'fcode' equals the last four
                # characters of the TOD name.
                sel_f090 = tod.info.array_data['fcode'] == tod_name[-4:]

                # Plot and save Focal Plane Image
                array_plots(
                    _dets_affected[sel_f090],
                    det=tod.det_uid[sel_f090],
                    tod=tod,
                    cmap=cm.rainbow,
                    display='save',
                    save_name=os.path.join(outdir, focal_image_name)
                )
                logging.info(f"Saved Focal Plane Image: {focal_image_name}")

                # Plot and save Time Stream Image
                plt.figure()
                try:
                    snip.demean().plot(color='purple', alpha=0.2)
                    plt.grid(ls='--')
                    plt.tight_layout()
                    plt.savefig(
                        os.path.join(outdir, timestr_image_name),
                        dpi=100
                    )
                finally:
                    # Close the figure even when plotting or saving fails, so
                    # failed snippets do not accumulate open figures.
                    plt.close()
                logging.info(f"Saved Time Stream Image: {timestr_image_name}")
            except Exception as e:
                logging.error(f"Error generating images for TOD {tod_name}, snippet {i}: {e}")
                continue  # Skip to the next snippet

        # Append to the list
        df_tosave_all.append(df_temp[manifest_columns])

    # Concatenate all saved data
    if df_tosave_all:
        try:
            df_tosave = pd.concat(df_tosave_all, ignore_index=True)
            logging.info(f"Total data points to save: {len(df_tosave)}")
        except Exception as e:
            logging.error(f"Error concatenating df_tosave_all: {e}")
            raise
    else:
        logging.error("No data points were processed successfully. df_tosave_all is empty.")
        # Create an empty DataFrame with the required columns
        df_tosave = pd.DataFrame(columns=manifest_columns)

    return df_tosave

def save_manifest(df_tosave, outdir, manifest_filename='manifest.csv'):
    """Write the manifest DataFrame to ``<outdir>/<manifest_filename>`` as CSV.

    Parameters:
        df_tosave (DataFrame): Manifest rows; written without the index.
        outdir (str): Target directory; created if it does not exist.
        manifest_filename (str): File name of the manifest inside ``outdir``.

    Returns:
        str: Path of the written manifest file.

    Raises:
        Exception: Any failure is logged and re-raised.
    """
    try:
        # An empty outdir means "current directory"; nothing to create then.
        if outdir:
            os.makedirs(outdir, exist_ok=True)
        manifest_path = os.path.join(outdir, manifest_filename)
        df_tosave.to_csv(manifest_path, index=False)
    except Exception as e:
        logging.error(f"Error saving manifest: {e}")
        raise

    logging.info(f"Saved manifest to {manifest_path}")
    return manifest_path

def run_active_learning(
    df_dir,
    df_test_name,
    df_train_name,
    df_train_high_prob_name,
    cols_t,
    cols,
    outdir,
    n_queries=5,
    high_prob_threshold=0.6,
    high_prob_num_per_class=5
):
    """
    Run the whole pipeline: load data, query, select, render, write manifest.

    Steps, in order: configure logging, load the three DataFrames, seed the
    active learner from the train set, run the query/teach loop on the test
    set, add high-probability rows picked by the learner's estimator, generate
    images for the combined rows, and save the manifest CSV.

    Parameters:
        df_dir (str): Directory holding the CSV files.
        df_test_name (str): Test DataFrame file name (no .csv).
        df_train_name (str): Train DataFrame file name (no .csv).
        df_train_high_prob_name (str): High-prob train DataFrame file name (no .csv).
        cols_t (list): Feature columns used for training.
        cols (list): Columns kept in the final DataFrame.
        outdir (str): Where images and the manifest are written.
        n_queries (int): Number of active learning queries.
        high_prob_threshold (float): Probability cut for high-probability rows.
        high_prob_num_per_class (int): High-probability rows kept per class.

    Returns:
        str: Path to the generated manifest.csv file.

    Raises:
        Exception: Any failure from a pipeline step is logged and re-raised.
    """
    try:
        setup_logging()

        df_test, df_train, df_train_high_prob = load_data(
            df_dir, df_test_name, df_train_name, df_train_high_prob_name
        )

        # Only the learner itself is needed from the initialisation step.
        learner, *_ = initialize_active_learner(df_train, cols_t, n_trees=50, max_depth=15)

        # The leftover pool arrays are not used further.
        df_new, *_ = perform_active_learning(learner, df_test, cols, cols_t, n_queries=n_queries)

        high_prob_rows = select_high_probability_data(
            df_train_high_prob,
            df_train,
            learner.estimator,
            cols_t,
            cols,
            threshold=high_prob_threshold,
            num_per_class=high_prob_num_per_class
        )
        df_new = pd.concat([df_new, high_prob_rows], ignore_index=True)

        added_message = f"Added {len(high_prob_rows)} high-probability data points to df_new."
        logging.info(added_message)
        print(added_message)

        manifest_frame = generate_images(df_new, cols, outdir)
        manifest_path = save_manifest(manifest_frame, outdir)
        logging.info("Active learning and image generation completed successfully.")
        return manifest_path
    except Exception as e:
        logging.error(f"An error occurred in run_active_learning: {e}")
        raise


In [None]:
import os
import subprocess
from active_learning_module import run_active_learning
from panoptes_client import Panoptes, Workflow, SubjectSet
from panoptes_client import Panoptes
from panoptes_client.panoptes import Panoptes
from panoptes_client import SubjectSet, Subject
import logging

def create_subject_set(name,workflow_id):
    """Create a new Subject Set and attach it to the given workflow.

    The subject set is created in the project the workflow links to, then
    added to the workflow's subject sets.

    Parameters:
        name (str): Display name of the new subject set.
        workflow_id: ID of the workflow the subject set is attached to.

    Returns:
        SubjectSet: The saved subject set.

    Raises:
        Exception: Any failure is logged and re-raised.
    """
    try:
        workflow = Workflow.find(workflow_id)

        subject_set = SubjectSet()
        # A subject set is owned by a project; the workflow's project is used
        # so the caller only has to supply the workflow ID.
        subject_set.links.project = workflow.links.project
        subject_set.display_name = name
        subject_set.save()

        # The workflow association is made from the workflow side, after the
        # subject set exists.
        workflow.links.subject_sets.add(subject_set)

        logging.info(f"Created new Subject Set: {name}")
        return subject_set
    except Exception as e:
        logging.error(f"Error creating Subject Set: {e}")
        raise

def run_bash_command(command, shell='/bin/bash'):
    """Run ``command`` through a shell, echoing its stdout line by line.

    stdout is printed as it arrives. stderr is collected and, if non-empty,
    printed once after the command has finished, prefixed with
    ``"Command error: "``.

    Parameters:
        command (str): Shell command line. It is interpreted by the shell, so
            callers must quote any untrusted or space-containing arguments.
        shell (str): Path of the shell executable used to run the command.

    Returns:
        int: The command's exit code.
    """
    import tempfile

    print(f"Running command: {command}")

    # stderr goes to a temporary file instead of a pipe: a pipe that is only
    # read after stdout is drained can fill up and block the child forever.
    with tempfile.TemporaryFile(mode='w+') as stderr_file:
        with subprocess.Popen(
            command,
            shell=True,
            stdout=subprocess.PIPE,
            stderr=stderr_file,
            executable=shell,
            text=True,
        ) as process:
            # Read the output in real-time
            for line in process.stdout:
                print(line.strip())
        # Leaving the Popen context waits for the process to exit.

        # The child shares the file offset, so rewind before reading.
        stderr_file.seek(0)
        stderr = stderr_file.read()

    if stderr:
        print(f"Command error: {stderr.strip()}")

    return process.returncode

def upload_subjects(subject_set_id, manifest_path):
    """Upload subjects to the specified Subject Set from the manifest file.

    Builds a ``panoptes subject-set upload-subjects`` command line and runs it
    through ``run_bash_command``.

    Parameters:
        subject_set_id: ID of the target subject set.
        manifest_path (str): Path of the manifest CSV to upload.

    Returns:
        int: Exit code of the command.
    """
    import shlex

    # Quote both arguments so paths containing spaces or shell metacharacters
    # reach the CLI as a single argument; plain values are left unchanged.
    quoted_id = shlex.quote(str(subject_set_id))
    quoted_manifest = shlex.quote(str(manifest_path))
    command = f"panoptes subject-set upload-subjects {quoted_id} {quoted_manifest}"
    return run_bash_command(command)

def main():
    """Create a subject set, run active learning and upload the result.

    Reads ``ZOONIVERSE_USERNAME``, ``ZOONIVERSE_PASSWORD`` and ``WORKFLOW_ID``
    from the environment and connects to Panoptes. When the workflow's subject
    count equals its retired subject count, it prompts for a subject set name,
    creates the set, runs the active learning module to produce a manifest,
    and uploads the manifest's subjects. Otherwise it does nothing.

    Every failure is logged and reported on stdout, after which the function
    returns without raising.
    """
    # Configure logging
    logging.basicConfig(
        filename='framework.log',
        level=logging.INFO,
        format='%(asctime)s:%(levelname)s:%(message)s'
    )

    # Securely fetch credentials from environment variables
    username = os.getenv('ZOONIVERSE_USERNAME')
    password = os.getenv('ZOONIVERSE_PASSWORD')

    if not username or not password:
        logging.error("Zooniverse credentials not set in environment variables.")
        print("Error: Zooniverse credentials not set in environment variables.")
        return

    # Check the workflow ID before connecting so a missing value is reported
    # clearly instead of surfacing as a failed lookup.
    workflow_id = os.getenv('WORKFLOW_ID')
    if not workflow_id:
        logging.error("WORKFLOW_ID not set in environment variables.")
        print("Error: WORKFLOW_ID not set in environment variables.")
        return

    # Connect to Panoptes
    try:
        Panoptes.connect(username=username, password=password)
        logging.info("Connected to Panoptes successfully.")
        print("Connection established.")
    except Exception as e:
        logging.error(f"Failed to connect to Panoptes: {e}")
        print("Failed to connect to Panoptes.")
        return

    try:
        workflow = Workflow.find(workflow_id)
    except Exception as e:
        logging.error(f"Failed to find Workflow with ID {workflow_id}: {e}")
        print(f"Failed to find Workflow with ID {workflow_id}.")
        return

    if workflow.subjects_count != workflow.retired_set_member_subjects_count:
        # When the previous batch's labeling is not complete
        print("Workflow already has active subjects. No action taken.")
        logging.info("Workflow already has active subjects. No action taken.")
        return

    # When it's the initial run or if all data from the previous batch has been
    # labeled then trigger active learning loop
    subject_set_name = input("Enter subject set name: ")
    try:
        subject_set = create_subject_set(subject_set_name, workflow_id)
        subject_set_id = subject_set.id
        print(f"New subject set created, with Subject Set ID: {subject_set_id}")
    except Exception as e:
        print("Failed to create Subject Set.")
        return

    # Define parameters for active learning
    df_dir = '/home/simran/examples_for_zoey/' # replace with your data set directory
    df_test_name = 'predicted_labels_depth1_1619094281_pa5_f090_wseason' # replace with your test set file name
    # Train-set names are relative to df_dir and omit the .csv suffix. Both
    # point at the labelling manifest under df_dir.
    df_train_name = 'plots_for_labelling/manifest' # replace with your train set file name
    df_train_high_prob_name = 'plots_for_labelling/manifest' # replace with your high-probability train set file name
    cols_for_training = ['Number of Detectors', 'Y and X Extent Ratio','Y Hist Max and Adjacent/Number of Detectors',
      'Within 0.1 of Y Hist Max/Number of Detectors', 'Mean abs(Correlation)', 'Mean abs(Time Lag)', 'Number of Peaks']
    # Columns written to the manifest: the training features first, followed
    # by the columns that identify the snippet to plot.
    # NOTE(review): placeholder selection - replace with the columns you want.
    cols = cols_for_training + ['TOD', 'Start Index', 'Stop Index']

    outdir = '/home/zoey/example_dir' # replace with your output directory
    n_queries = 5
    high_prob_threshold = 0.6
    high_prob_num_per_class = 5

    # Run active learning module and output the manifest file for selected data
    try:
        manifest_path = run_active_learning(
            df_dir=df_dir,
            df_test_name=df_test_name,
            df_train_name=df_train_name,
            df_train_high_prob_name=df_train_high_prob_name,
            cols_t=cols_for_training,
            cols=cols,
            outdir=outdir,
            n_queries=n_queries,
            high_prob_threshold=high_prob_threshold,
            high_prob_num_per_class=high_prob_num_per_class
        )
        print(f"Active learning completed. Manifest saved at {manifest_path}.")
    except Exception as e:
        logging.error(f"Active learning failed: {e}")
        print("Active learning failed. Check logs for details.")
        return

    # Upload subjects using the manifest produced by the module
    return_code = upload_subjects(subject_set_id, manifest_path)

    # Check return code
    if return_code == 0:
        print("Subjects uploaded successfully.")
        logging.info("Subjects uploaded successfully.")
    else:
        print("Subject upload failed.")
        logging.error("Subject upload failed.")

if __name__ == "__main__":
    main()
