In [None]:
import pandas as pd
import numpy as np
import joblib
import json
import tkinter as tk
from tkinter import ttk, messagebox, BooleanVar, Checkbutton
import os

# Load the model
# Prefer the original author's absolute path; otherwise fall back to a
# ``models`` directory relative to the current working directory.
model_path = r"C:\Users\Dani\Predictions-Final\models\best_model.pkl"
if not os.path.exists(model_path):
    model_path = os.path.join('models', 'best_model.pkl')

try:
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code.
    # Only point model_path at files from a trusted source.
    model = joblib.load(model_path)
except Exception as exc:
    # For development/testing when the model isn't available or can't be
    # loaded (missing file, incompatible pickle, ...). Report the real cause
    # instead of always claiming the file is missing.
    print(f"Warning: Could not load model from {model_path} ({exc}). Using dummy model for testing.")
    from sklearn.ensemble import RandomForestRegressor
    # NOTE: this placeholder is never fitted in this file, so calling
    # model.predict() on it is expected to raise until a real model is supplied.
    model = RandomForestRegressor(n_estimators=10)

# Load feature information
# ``features`` must end up as a dict with the keys 'numerical_columns' and
# 'categorical_columns', each a list of column names the model expects.
features_path = r"C:\Users\Dani\Predictions-Final\models\model_features.json"
if not os.path.exists(features_path):
    # Fall back to a ``models`` directory relative to the working directory.
    features_path = os.path.join('models', 'model_features.json')

try:
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(features_path, 'r', encoding='utf-8') as f:
        features = json.load(f)
except (OSError, ValueError) as exc:
    # For development/testing when the json isn't available or is unreadable.
    # OSError covers a missing/inaccessible file; ValueError covers malformed
    # JSON (json.JSONDecodeError) and undecodable bytes (UnicodeDecodeError).
    print(f"Warning: Could not load features file {features_path} ({exc}). Using dummy feature lists.")
    features = {
        'numerical_columns': [
            'Core Viscosity (cP)', 'Emulsion viscosity (cP)', 'Vessel size (mL)',
            'Spin Speed (rpm)', 'Outer viscosity (cP)', 'Dispersed Flow Rate (mL/min)',
            'Continuous Flow Rate (mL/min)', 'Flow Rate Ratio', 'Length of curing tubing (m)',
            'UV Power (J s-1)', 'Curing Energy (kJ g-1)'
        ],
        'categorical_columns': [
            'Core Formulation', 'Core used', 'UV used', 'UV formulation',
            'Impellor', 'Outer used', 'Cure Rig Used'
        ]
    }

# Input columns offered by the form, in the order they are displayed.
# The emulsion viscosity is deliberately listed first (most important input).
available_numerical_cols = [
    'Emulsion viscosity (cP)',
    'Core Viscosity (cP)', 'Vessel size (mL)', 'Spin Speed (rpm)',
    'Outer viscosity (cP)', 'Dispersed Flow Rate (mL/min)',
    'Continuous Flow Rate (mL/min)', 'Flow Rate Ratio',
    'Length of curing tubing (m)', 'UV Power (J s-1)',
    'Curing Energy (kJ g-1)',
]

available_categorical_cols = [
    'Core Formulation', 'Core used', 'UV used', 'UV formulation',
    'Impellor', 'Outer used', 'Cure Rig Used',
]

# Display-only list of the five most important features (from the feature
# importance analysis); shown as a hint at the top of the form.
top_features = [
    'Viscosity Ratio (Emulsion/Core viscosity)',
    'Emulsion viscosity (cP)', 'Core Viscosity (cP)', 'UV Power (J s-1)',
    'Flow Rate Ratio (Continuous/Dispersed)',
]

# Preset choices for categorical columns; a column listed here gets a
# drop-down, any other categorical column gets a free-text entry.
categorical_options = {
    'Core Formulation': [
        'OG (66/27/7)',
        '66/26/6/2% PEG550DMA',
        '94/6 (25%)',
        '66/25.5/6.5/2% PEG550DMA',
        '64/24/10/2 HPC/PDO/CaCl2/PEG550DMA',
    ],
    'Impellor': [
        'Impellorpellor',
        'Medium 4x impellor',
        'Large 4x impellor',
    ],
    'Cure Rig Used': [
        'Cure Rig 1.0 (Water Butt)',
        'Cure Rig 2.0 (Upgraded Water Butt)',
        'Horizontal OG Cure Rig',
    ],
}

class PredictionApp:
    """Tkinter form that predicts the dry percentage from process parameters.

    Every parameter has a checkbox that enables its input field. Only enabled
    parameters are read when predicting; every other model feature falls back
    to a built-in default value.

    Attributes:
        root: The Tk root window passed to the constructor.
        scrollable_frame: Frame inside the scrolling canvas that holds all widgets.
        input_vars: Maps column name -> ``tk.StringVar`` holding the typed value.
        enabled_vars: Maps column name -> ``BooleanVar`` of the column's checkbox.
        input_widgets: Maps column name -> the column's Entry/Combobox widget.
        result_var: ``tk.StringVar`` backing the result label.
        confidence_bar: Progress bar displaying the confidence score.
        confidence_label: Label displaying the confidence score as text.
    """

    # Feature pairs for which "<a>_x_<b>" and "<a>_to_<b>" interaction
    # features exist in the default feature row.
    _INTERACTION_PAIRS = (
        ('Emulsion viscosity (cP)', 'Core Viscosity (cP)'),
        ('Emulsion viscosity (cP)', 'Vessel size (mL)'),
        ('Emulsion viscosity (cP)', 'UV Power (J s-1)'),
        ('Core Viscosity (cP)', 'Vessel size (mL)'),
        ('Core Viscosity (cP)', 'UV Power (J s-1)'),
        ('Dispersed Flow Rate (mL/min)', 'Core Viscosity (cP)'),
        ('Dispersed Flow Rate (mL/min)', 'Vessel size (mL)'),
        ('Dispersed Flow Rate (mL/min)', 'UV Power (J s-1)'),
        ('Emulsion viscosity (cP)', 'Dispersed Flow Rate (mL/min)'),
    )

    # Features for which "<name>_squared" and "<name>_sqrt" exist.
    _POWER_FEATURES = (
        'Emulsion viscosity (cP)',
        'Core Viscosity (cP)',
        'Dispersed Flow Rate (mL/min)',
    )

    def __init__(self, root):
        """Build the whole UI inside *root* (a ``tk.Tk`` window)."""
        self.root = root
        self.root.title("Dry Percentage Predictor")
        self.root.geometry("900x700")

        # Per-column state, keyed by column name.
        self.input_vars = {}
        self.enabled_vars = {}
        self.input_widgets = {}

        self._build_scroll_area(root)
        row = self._build_header()
        row = self._build_numerical_inputs(row)
        row = self._build_categorical_inputs(row)
        row = self._build_quick_buttons(row)
        row = self._build_predict_button(row)
        self._build_result_area(row)

        # Let the input column absorb any extra horizontal space.
        self.scrollable_frame.columnconfigure(2, weight=1)

    # ------------------------------------------------------------------
    # UI construction helpers (each returns the next free grid row)
    # ------------------------------------------------------------------

    def _build_scroll_area(self, root):
        """Create the canvas + vertical scrollbar and the frame scrolled inside it."""
        main_frame = ttk.Frame(root, padding="10")
        main_frame.pack(fill=tk.BOTH, expand=True)

        canvas = tk.Canvas(main_frame)
        scrollbar = ttk.Scrollbar(main_frame, orient="vertical", command=canvas.yview)
        self.scrollable_frame = ttk.Frame(canvas)

        # Keep the scroll region in sync with the frame's size.
        self.scrollable_frame.bind(
            "<Configure>",
            lambda e: canvas.configure(scrollregion=canvas.bbox("all"))
        )

        canvas.create_window((0, 0), window=self.scrollable_frame, anchor="nw")
        canvas.configure(yscrollcommand=scrollbar.set)

        canvas.pack(side="left", fill="both", expand=True)
        scrollbar.pack(side="right", fill="y")

    def _build_header(self):
        """Add the title, description and top-feature list; return the next row."""
        title_label = ttk.Label(self.scrollable_frame,
                                text="Dry Percentage Predictor",
                                font=("Arial", 16, "bold"))
        title_label.grid(row=0, column=0, columnspan=4, pady=10, sticky="w")

        desc_label = ttk.Label(self.scrollable_frame,
                               text="Enter the parameters you have available. More parameters will improve prediction accuracy.",
                               font=("Arial", 10))
        desc_label.grid(row=1, column=0, columnspan=4, pady=5, sticky="w")

        importance_label = ttk.Label(self.scrollable_frame,
                                     text="Top 5 important features:",
                                     font=("Arial", 10, "italic"))
        importance_label.grid(row=2, column=0, columnspan=4, pady=(5, 0), sticky="w")

        first_feature_row = 3
        for i, feature in enumerate(top_features):
            feature_label = ttk.Label(self.scrollable_frame,
                                      text=f"{i+1}. {feature}",
                                      font=("Arial", 9, "italic"))
            feature_label.grid(row=first_feature_row + i, column=0, columnspan=4,
                               pady=0, padx=(20, 0), sticky="w")

        # Inputs start right after the feature list, however long it is.
        return first_feature_row + len(top_features)

    def _add_section_label(self, text, row):
        """Place a bold section heading on *row*."""
        label = ttk.Label(self.scrollable_frame, text=text, font=("Arial", 12, "bold"))
        label.grid(row=row, column=0, columnspan=4, pady=10, sticky="w")

    def _add_parameter_row(self, col, row):
        """Add the enable-checkbox and name label for *col*; return its StringVar.

        Registers the checkbox variable in ``enabled_vars`` and the value
        variable in ``input_vars``. The caller creates the actual input widget.
        """
        enabled_var = BooleanVar(value=False)
        self.enabled_vars[col] = enabled_var
        # ``c=col`` binds the current column now; a plain closure would see
        # only the last loop value.
        enabled_check = Checkbutton(self.scrollable_frame, variable=enabled_var,
                                    command=lambda c=col: self.toggle_input(c))
        enabled_check.grid(row=row, column=0, sticky="w", padx=5, pady=2)

        label = ttk.Label(self.scrollable_frame, text=f"{col}:")
        label.grid(row=row, column=1, sticky="w", padx=5, pady=2)

        # Empty by default; the field stays disabled until its box is ticked.
        var = tk.StringVar(value="")
        self.input_vars[col] = var
        return var

    def _build_numerical_inputs(self, row):
        """Add one disabled entry per numerical column; return the next row."""
        self._add_section_label("Numerical Parameters", row)
        row += 1

        for col in available_numerical_cols:
            var = self._add_parameter_row(col, row)

            entry = ttk.Entry(self.scrollable_frame, textvariable=var, width=12, state="disabled")
            entry.grid(row=row, column=2, sticky="w", padx=5, pady=2)
            self.input_widgets[col] = entry

            # The unit is whatever sits inside the first "(...)" of the name.
            units = col.split("(")[1].split(")")[0] if "(" in col else ""
            if units:
                units_label = ttk.Label(self.scrollable_frame, text=units)
                units_label.grid(row=row, column=3, sticky="w")

            row += 1
        return row

    def _build_categorical_inputs(self, row):
        """Add one disabled combobox/entry per categorical column; return the next row."""
        self._add_section_label("Categorical Parameters", row)
        row += 1

        for col in available_categorical_cols:
            var = self._add_parameter_row(col, row)

            # Drop-down when preset options exist, free-text entry otherwise.
            if col in categorical_options:
                input_widget = ttk.Combobox(self.scrollable_frame, textvariable=var,
                                            values=categorical_options[col], width=25,
                                            state="disabled")
            else:
                input_widget = ttk.Entry(self.scrollable_frame, textvariable=var,
                                         width=25, state="disabled")

            input_widget.grid(row=row, column=2, sticky="ew", padx=5, pady=2, columnspan=2)
            self.input_widgets[col] = input_widget

            row += 1
        return row

    def _build_quick_buttons(self, row):
        """Add the preset-selection buttons; return the next row."""
        quick_frame = ttk.Frame(self.scrollable_frame)
        quick_frame.grid(row=row, column=0, columnspan=4, pady=10, sticky="w")

        for text, command in (
            ("Minimal Set", self.select_minimal),
            ("Recommended Set", self.select_recommended),
            ("All Parameters", self.select_all),
            ("Clear All", self.clear_all),
        ):
            ttk.Button(quick_frame, text=text, command=command).pack(side="left", padx=5)

        return row + 1

    def _build_predict_button(self, row):
        """Add the Predict button; return the next row."""
        predict_button = ttk.Button(self.scrollable_frame,
                                    text="Predict",
                                    command=self.predict,
                                    style="Accent.TButton")
        predict_button.grid(row=row, column=0, columnspan=4, pady=20)
        return row + 1

    def _build_result_area(self, row):
        """Add the result text and the confidence bar/label."""
        result_frame = ttk.LabelFrame(self.scrollable_frame, text="Prediction Result")
        result_frame.grid(row=row, column=0, columnspan=4, sticky="ew", padx=5, pady=10)

        self.result_var = tk.StringVar(value="Results will appear here")
        result_label = ttk.Label(result_frame, textvariable=self.result_var, font=("Arial", 12))
        result_label.pack(padx=10, pady=10)

        confidence_frame = ttk.Frame(result_frame)
        confidence_frame.pack(fill="x", padx=10, pady=5)

        ttk.Label(confidence_frame, text="Prediction confidence:").pack(side="left")

        self.confidence_bar = ttk.Progressbar(confidence_frame, length=200, mode='determinate')
        self.confidence_bar.pack(side="left", padx=10)

        self.confidence_label = ttk.Label(confidence_frame, text="0%")
        self.confidence_label.pack(side="left")

    # ------------------------------------------------------------------
    # Selection handling
    # ------------------------------------------------------------------

    def toggle_input(self, column):
        """Enable or disable the input field of *column* to match its checkbox.

        Raises:
            KeyError: If *column* has no checkbox registered.
        """
        enabled = self.enabled_vars[column].get()

        # Direct lookup instead of scanning the grid for a computed row number.
        widget = self.input_widgets.get(column)
        if widget is not None:
            widget.configure(state="normal" if enabled else "disabled")

    def _select_only(self, params):
        """Clear everything, then enable each known parameter in *params*."""
        self.clear_all()
        for param in params:
            if param in self.enabled_vars:
                self.enabled_vars[param].set(True)
                self.toggle_input(param)

    def select_minimal(self):
        """Select only the essential parameters"""
        self._select_only([
            'Emulsion viscosity (cP)',  # Prioritize emulsion viscosity
            'Dispersed Flow Rate (mL/min)',
            'Continuous Flow Rate (mL/min)',
            'Core Formulation'
        ])

    def select_recommended(self):
        """Select the recommended set of parameters"""
        self._select_only([
            'Emulsion viscosity (cP)',  # Prioritize emulsion viscosity
            'Core Viscosity (cP)',
            'Vessel size (mL)',
            'Spin Speed (rpm)',
            'Dispersed Flow Rate (mL/min)',
            'Continuous Flow Rate (mL/min)',
            'UV Power (J s-1)',
            'Core Formulation',
            'Impellor',
            'Cure Rig Used'
        ])

    def select_all(self):
        """Select all parameters"""
        for param in self.enabled_vars:
            self.enabled_vars[param].set(True)
            self.toggle_input(param)

    def clear_all(self):
        """Deselect all parameters and blank their input values"""
        for param in self.enabled_vars:
            self.enabled_vars[param].set(False)
            self.toggle_input(param)
            # Clear input values
            self.input_vars[param].set("")

    def _any_parameter_enabled(self):
        """Return True if at least one parameter checkbox is ticked.

        The variables must be read with ``.get()``: the variable objects
        themselves are always truthy.
        """
        return any(var.get() for var in self.enabled_vars.values())

    # ------------------------------------------------------------------
    # Confidence heuristic
    # ------------------------------------------------------------------

    @staticmethod
    def _ratio_bonus(numerator, denominator, tiers):
        """Return the bonus of the first tier containing ``numerator / denominator``.

        Args:
            numerator: Dividend of the ratio.
            denominator: Divisor of the ratio.
            tiers: Iterable of ``(low, high, bonus)`` checked in order, bounds inclusive.

        Returns:
            The matching bonus, or 0 when no tier matches or the ratio cannot
            be computed (zero divisor, non-numeric values).
        """
        try:
            ratio = numerator / denominator
            for low, high, bonus in tiers:
                if low <= ratio <= high:
                    return bonus
        except (ZeroDivisionError, TypeError):
            # Best effort: an uncomputable ratio simply earns no bonus.
            return 0
        return 0

    def calculate_confidence_score(self, input_data, enabled_count):
        """Calculate a heuristic confidence score for a prediction.

        Args:
            input_data: Dict of the values actually supplied, keyed by column name.
            enabled_count: Number of supplied parameters.

        Returns:
            A score capped at 100: 5 points per parameter (max 40), plus
            bonuses for key features being present and in favourable ranges.
        """
        score = min(40, enabled_count * 5)  # Base score based on number of parameters

        has_emulsion = 'Emulsion viscosity (cP)' in input_data
        if has_emulsion:
            score += 20  # Major bonus for having emulsion viscosity

        # Viscosity ratio: optimal range 0.05-0.07, good range 0.03-0.09.
        if has_emulsion and 'Core Viscosity (cP)' in input_data:
            score += self._ratio_bonus(
                input_data['Emulsion viscosity (cP)'],
                input_data['Core Viscosity (cP)'],
                ((0.05, 0.07, 20), (0.03, 0.09, 10)),
            )

        # Flow ratio (continuous/dispersed): optimal range 5-10, good range 3-15.
        if 'Dispersed Flow Rate (mL/min)' in input_data and 'Continuous Flow Rate (mL/min)' in input_data:
            score += self._ratio_bonus(
                input_data['Continuous Flow Rate (mL/min)'],
                input_data['Dispersed Flow Rate (mL/min)'],
                ((5, 10, 15), (3, 15, 8)),
            )

        # Add bonus for UV Power
        if 'UV Power (J s-1)' in input_data:
            score += 5

        # Cap at 100%
        return min(100, score)

    # ------------------------------------------------------------------
    # Prediction
    # ------------------------------------------------------------------

    def _collect_inputs(self):
        """Read every enabled field into a dict keyed by column name.

        Numerical values are converted to ``float``; categorical values are
        kept as typed. On the first empty or non-numeric field an error dialog
        is shown and ``None`` is returned.
        """
        input_data = {}

        for col in available_numerical_cols:
            if not self.enabled_vars[col].get():
                continue
            raw = self.input_vars[col].get()
            if not raw.strip():
                messagebox.showerror("Input Error", f"Please enter a value for {col} or uncheck it.")
                return None
            try:
                input_data[col] = float(raw)
            except ValueError:
                messagebox.showerror("Input Error", f"Please enter a valid number for {col}")
                return None

        for col in available_categorical_cols:
            if not self.enabled_vars[col].get():
                continue
            value = self.input_vars[col].get()
            if not value.strip():
                messagebox.showerror("Input Error", f"Please enter a value for {col} or uncheck it.")
                return None
            input_data[col] = value

        return input_data

    @staticmethod
    def _default_feature_values():
        """Return a fresh dict of default values for every base and engineered feature.

        Key order is significant: it becomes the column order of the frame
        handed to the model.
        """
        return {
            'Core Viscosity (cP)': 11000,
            'Emulsion viscosity (cP)': 600,
            'Vessel size (mL)': 5000,
            'Spin Speed (rpm)': 125,
            'Outer viscosity (cP)': 12,
            'Dispersed Flow Rate (mL/min)': 30,
            'Continuous Flow Rate (mL/min)': 300,
            'Flow Rate Ratio': 10,
            'Length of curing tubing (m)': 50,
            'UV Power (J s-1)': 708,
            'Curing Energy (kJ g-1)': 1.5,
            'Core Formulation': 'OG (66/27/7)',
            'Core used': 'BNC00044',
            'UV used': 'BNU00074',
            'UV formulation': 'AUF76',
            'Impellor': 'Impellorpellor',
            'Outer used': 'BNO00104',
            'Cure Rig Used': 'Cure Rig 1.0 (Water Butt)',

            # Include default values for engineered features
            'Viscosity_Ratio': 600/11000,
            'Emulsion viscosity (cP)_x_Core Viscosity (cP)': 600 * 11000,
            'Emulsion viscosity (cP)_to_Core Viscosity (cP)': 600/11000,
            'Emulsion viscosity (cP)_x_Vessel size (mL)': 600 * 5000,
            'Emulsion viscosity (cP)_to_Vessel size (mL)': 600/5000,
            'Emulsion viscosity (cP)_x_UV Power (J s-1)': 600 * 708,
            'Emulsion viscosity (cP)_to_UV Power (J s-1)': 600/708,
            'Core Viscosity (cP)_x_Vessel size (mL)': 11000 * 5000,
            'Core Viscosity (cP)_to_Vessel size (mL)': 11000/5000,
            'Core Viscosity (cP)_x_UV Power (J s-1)': 11000 * 708,
            'Core Viscosity (cP)_to_UV Power (J s-1)': 11000/708,
            'Energy_Density': 708/30,
            'Residence_Time': 50/300,
            'Emulsion viscosity (cP)_squared': 600**2,
            'Emulsion viscosity (cP)_sqrt': np.sqrt(600),
            'Core Viscosity (cP)_squared': 11000**2,
            'Core Viscosity (cP)_sqrt': np.sqrt(11000),
            'Dispersed Flow Rate (mL/min)_x_Core Viscosity (cP)': 30 * 11000,
            'Dispersed Flow Rate (mL/min)_to_Core Viscosity (cP)': 30/11000,
            'Dispersed Flow Rate (mL/min)_x_Vessel size (mL)': 30 * 5000,
            'Dispersed Flow Rate (mL/min)_to_Vessel size (mL)': 30/5000,
            'Dispersed Flow Rate (mL/min)_x_UV Power (J s-1)': 30 * 708,
            'Dispersed Flow Rate (mL/min)_to_UV Power (J s-1)': 30/708,
            'Dispersed Flow Rate (mL/min)_squared': 30**2,
            'Dispersed Flow Rate (mL/min)_sqrt': np.sqrt(30),
            'Emulsion viscosity (cP)_x_Dispersed Flow Rate (mL/min)': 600 * 30,
            'Emulsion viscosity (cP)_to_Dispersed Flow Rate (mL/min)': 600/30
        }

    def _build_feature_row(self, input_data):
        """Return the full feature dict for one prediction.

        Starts from the defaults, overlays the supplied values, then
        recomputes an engineered feature only when *all* of its inputs were
        supplied (and any divisor is positive); otherwise the default stays.

        NOTE(review): when only one input of a pair is supplied, the engineered
        feature keeps its all-default value and is therefore not derived from
        the supplied value. This matches the original design; confirm it is
        what the model should see.
        """
        row = self._default_feature_values()
        row.update(input_data)

        def supplied(*names):
            return all(name in input_data for name in names)

        emulsion = 'Emulsion viscosity (cP)'
        core = 'Core Viscosity (cP)'
        dispersed = 'Dispersed Flow Rate (mL/min)'
        continuous = 'Continuous Flow Rate (mL/min)'
        uv_power = 'UV Power (J s-1)'
        tubing = 'Length of curing tubing (m)'

        # Viscosity ratio
        if supplied(core, emulsion) and input_data[core] > 0:
            ratio = input_data[emulsion] / input_data[core]
            row['Viscosity_Ratio'] = ratio
            row[f"{emulsion}_to_{core}"] = ratio

        # Flow ratio (replaces a directly entered 'Flow Rate Ratio', if any)
        if supplied(dispersed, continuous) and input_data[dispersed] > 0:
            row['Flow Rate Ratio'] = input_data[continuous] / input_data[dispersed]

        # Pairwise product / quotient features
        for feat1, feat2 in self._INTERACTION_PAIRS:
            if not supplied(feat1, feat2):
                continue
            mult_key = f"{feat1}_x_{feat2}"
            if mult_key in row:
                row[mult_key] = input_data[feat1] * input_data[feat2]
            # Avoid division by zero
            div_key = f"{feat1}_to_{feat2}"
            if input_data[feat2] > 0 and div_key in row:
                row[div_key] = input_data[feat1] / input_data[feat2]

        # Squared and square-root features
        for feat in self._POWER_FEATURES:
            if feat not in input_data:
                continue
            squared_key = f"{feat}_squared"
            if squared_key in row:
                row[squared_key] = input_data[feat] ** 2
            # Square root only for positive values
            sqrt_key = f"{feat}_sqrt"
            if input_data[feat] > 0 and sqrt_key in row:
                row[sqrt_key] = np.sqrt(input_data[feat])

        # Energy density
        if supplied(uv_power, dispersed) and input_data[dispersed] > 0:
            row['Energy_Density'] = input_data[uv_power] / input_data[dispersed]

        # Residence time
        if supplied(tubing, continuous) and input_data[continuous] > 0:
            row['Residence_Time'] = input_data[tubing] / input_data[continuous]

        return row

    @staticmethod
    def _confidence_text(confidence_score):
        """Map a numeric confidence score to "High" / "Medium" / "Low"."""
        if confidence_score >= 80:
            return "High"
        if confidence_score >= 60:
            return "Medium"
        return "Low"

    @staticmethod
    def _recommendation(prediction):
        """Map the predicted value to the recommendation text shown to the user."""
        if prediction <= 0.3:
            return "Excellent dryness - proceed with experiment"
        if prediction <= 0.5:
            return "Good dryness - proceed with experiment"
        if prediction <= 0.7:
            return "Moderate dryness - consider adjustments"
        return "Poor dryness - adjust parameters"

    def predict(self):
        """Validate the enabled inputs, run the model and display the result.

        Input problems are reported through error dialogs and abort the
        prediction. Any other failure is shown in a dialog and its traceback
        is printed; nothing is raised to the caller.
        """
        try:
            # Check if any parameters are enabled
            if not self._any_parameter_enabled():
                messagebox.showerror("Input Error", "Please enable at least one parameter to make a prediction.")
                return

            input_data = self._collect_inputs()
            if input_data is None:
                return  # the user has already been told what to fix

            # Every enabled field contributes exactly one entry.
            confidence_score = self.calculate_confidence_score(input_data, len(input_data))

            # Update confidence bar
            self.confidence_bar['value'] = confidence_score
            self.confidence_label.config(text=f"{confidence_score}%")

            input_df = pd.DataFrame([self._build_feature_row(input_data)])

            # Make sure the model gets every column it expects. Columns with a
            # known default are already present, so a missing one can only be
            # filled with a neutral placeholder.
            numerical_features = features['numerical_columns']
            for feature in numerical_features + features['categorical_columns']:
                if feature not in input_df.columns:
                    input_df[feature] = 0 if feature in numerical_features else ""

            # Make the prediction
            prediction = model.predict(input_df)[0]

            # Update result
            self.result_var.set(f"Predicted %dry after 24h in incubator: {prediction:.3f}\n" +
                                f"Confidence: {self._confidence_text(confidence_score)}\n" +
                                f"{self._recommendation(prediction)}")

        except Exception as e:
            # UI boundary: surface the problem to the user instead of crashing.
            messagebox.showerror("Prediction Error", f"Error making prediction: {str(e)}")
            import traceback
            traceback.print_exc()  # Print the full error for debugging

# Run the application
# Only when executed as a script (not when imported): create the Tk root
# window, build the predictor UI on it, and block in the Tk event loop.
if __name__ == "__main__":
    root = tk.Tk()
    # Construct the UI on the root window; the instance is bound to ``app``.
    app = PredictionApp(root)
    root.mainloop()