In [25]:
import time
import datetime
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from collections import Counter
# Hämta nuvarande datum och tid
now = datetime.datetime.now()
# Formatera som strängq
timestamp = now.strftime("%Y-%m-%d_%H-%M-%S")

class SurveyDataProcessor:
    """Processes survey data with ML-based imputation for missing values."""

    def __init__(self, filepath):
        """Initialize with survey data filepath."""
        self.filepath = filepath
        self.df_original = None
        self.df_clean = None
        self.models = {}
        self.scalers = {}
        self.feature_sets = {}
        self.imputation_log = []

    def load_and_prepare_data(self):
        """Load data and apply column mappings."""
        print('Loading and preparing survey data...')

        # Load data
        self.df_original = pd.read_excel(self.filepath)

        # Rename columns
        self._rename_columns()

        # Apply categorical mappings
        self._apply_categorical_mappings()

        # Create composite variables
        self._create_composite_variables()

        print(f"Data loaded: {len(self.df_original)} rows")

    def _rename_columns(self):
        """Rename columns to shorter, standardized names."""
        column_mapping = {
            'Tidstämpel': 'TIME',
            'Kön': 'GDR',
            'Åldersgrupp': 'AGE',
            'Län': 'COUNTY',
            'Vilken/vilka språkmodell(er) använder du idag i ditt arbete?': 'UB1',
            'När var första gången du använde en språkmodell i ditt arbete?': 'EXP',
            'Hur ofta har du använt språkmodeller i ditt arbete under det senaste året?': 'UB2',
            'Jag upplever att språkmodeller är i allmänhet användbara i mitt arbete som lärare.': 'PE1',
            'Språkmodeller hjälper mig att planera och förbereda min undervisning snabbare än traditionella metoder.': 'PE2',
            'Genom att använda språkmodeller kan jag öka antalet eller variationen av undervisningsmaterial jag producerar.': 'PE3',
            'Språkmodeller förbättrar kvaliteten på mitt arbete genom att hjälpa mig med administrativa och rutinmässiga arbetsuppgifter.': 'PE4',
            'Det krävs lite ansträngning för mig att bli kompetent i att använda språkmodeller för läraruppgifter.': 'EE1',
            'Språkmodeller är/skulle vara intuitiva och okomplicerade att använda i mitt dagliga arbete som lärare.': 'EE2',
            'Inlärningsprocessen för att komma igång med språkmodeller kräver/skulle kräva minimal ansträngning från min sida.': 'EE3',
            'Personer i mitt privatliv (familj och vänner) anser att jag bör använda stora språkmodeller i mitt arbete som lärare.': 'SI1',
            'Mina ämneskollegor och andra lärare på skolan stödjer/uppmuntrar till att använda språkmodeller i undervisningen.': 'SI2',
            'Skolledningen (rektor och ledningsgrupp) stödjer/uppmuntrar aktivt användningen av språkmodeller i undervisningen.': 'SI3',
            'Jag har tillgång till nödvändig teknisk utrustning (dator, internetuppkoppling, etc.) för att effektivt använda stora språkmodeller i min undervisning.': 'FC1',
            'Jag har tillräcklig kunskap om språkmodeller för att kunna använda de på ett effektivt sätt.': 'FC2',
            'Språkmodeller fungerar väl tillsammans med de digitala lärplattformar och andra verktyg jag redan använder.': 'FC3',
            'Det finns tillgång till specialiserad IT-support eller kollegor med expertkunskap som kan hjälpa mig när jag stöter på problem med språkmodeller.': 'FC4',
            'Skolledningen har uttryckligen förmedlat en förväntan om att jag ska integrera språkmodeller i min undervisningspraktik.': 'VOL1',
            'Jag har frihet att själv bestämma i vilken utsträckning och för vilka syften jag använder språkmodeller.': 'VOL2',
            'Användning av språkmodeller är ett formellt krav i min tjänst och ingår i skolans officiella riktlinjer för digitala verktyg.': 'VOL3',
            'Inom vilken tidsram förväntar du dig att börja använda/öka din användning av språkmodeller i ditt arbete?': 'BI1',
            'Baserat på mina erfarenheter hittills, räknar jag med att språkmodeller kommer bli ett regelbundet verktyg i min undervisning framöver.': 'BI2',
            'Jag ser långsiktiga möjligheter att integrera språkmodeller i min professionella utveckling som lärare och i utformningen av mina kurser.': 'BI3',
            'Tillåter du att dina elever använder språkmodeller (som ChatGPT) i sitt skolarbete?': 'STU'
        }
        self.df_original = self.df_original.rename(columns=column_mapping)

    def _apply_categorical_mappings(self):
        """Apply categorical variable mappings."""
        # County mapping
        county_mapping = {
            'Blekinge län': 0, 'Dalarnas län': 1, 'Gotlands län': 2, 'Gävleborgs län': 3,
            'Hallands län': 4, 'Jämtlands län': 5, 'Jönköpings län': 6, 'Kalmar län': 7,
            'Kronobergs län': 8, 'Norrbottens län': 9, 'Örebro län': 10, 'Östergötlands län': 11,
            'Stockholms län': 12, 'Södermanlands län': 13, 'Skåne län': 14, 'Värmlands län': 15,
            'Västerbottens län': 16, 'Västmanlands län': 17, 'Västra Götalands län': 18,
            'Uppsala län': 19, 'Västernorrlands län': 20
        }

        # Age mapping
        age_mapping = {
            '25-29 år': 0, '30-34 år': 1, '35-39 år': 2, '40-44 år': 3,
            '45-49 år': 4, '50-54 år': 5, '55-59 år': 6, '60-64 år': 7, '65 år eller äldre': 8
        }

        # Experience mapping
        exp_mapping = {
            'Jag har aldrig använt en språkmodell i mitt arbete': 0,
            'För mindre än 1 månad sen': 1, 'För 1–3 månader sen': 2, 'För 4–12 månader sen': 3,
            'För 1–2 år sen': 4, 'För mer än 2 år sen': 5
        }

        # Usage frequency mapping
        ub2_mapping = {
            'Aldrig': 0, 'Mer sällan än en gång i månaden': 1, 'Någon gång i månaden': 2,
            'Några gånger i veckan': 3, 'Dagligen': 4
        }

        # Behavioral intention mapping
        bi1_mapping = {
            'Aldrig': 0, 'Inte inom de fem kommande åren': 1,
            'Inom två till fem år': 2, 'Inom ett år': 3,
            'Inom de sex kommande månaderna': 4, 'Jag använder redan språkmodeller': 5
        }

        # Apply mappings
        self.df_original['COUNTY'] = self.df_original['COUNTY'].map(county_mapping)
        self.df_original['AGE'] = self.df_original['AGE'].map(age_mapping)
        self.df_original['EXP'] = self.df_original['EXP'].map(exp_mapping)
        self.df_original['UB2'] = self.df_original['UB2'].map(ub2_mapping)
        self.df_original['BI1'] = self.df_original['BI1'].map(bi1_mapping)
        self.df_original['STU'] = self.df_original['STU'].map({'Nej': 0, 'Ja': 1})
        self.df_original['UB1'] = self.df_original['UB1'].apply(
            lambda x: 1 if 'Använder ej' not in str(x) else 0
        )

    def _create_composite_variables(self):
        """Create composite variables from individual items."""
        # Invert coercive variables
        self.df_original['VOL1'] = 8 - self.df_original['VOL1']
        self.df_original['VOL3'] = 8 - self.df_original['VOL3']

        # Create composite scores
        self.df_original['VOL'] = self.df_original[['VOL1', 'VOL3']].mean(axis=1)
        self.df_original['PE'] = self.df_original[['PE1', 'PE2', 'PE3', 'PE4']].mean(axis=1)
        self.df_original['EE'] = self.df_original[['EE2', 'EE3']].mean(axis=1)
        self.df_original['SI'] = self.df_original[['SI1', 'SI2', 'SI3']].mean(axis=1)
        self.df_original['FC'] = self.df_original[['FC1', 'FC2', 'FC3']].mean(axis=1)
        self.df_original['BI'] = self.df_original[['BI1', 'BI2', 'BI3']].mean(axis=1)

        # Normalize BI to 0-1 scale
        bi_min, bi_max = self.df_original['BI'].min(), self.df_original['BI'].max()
        self.df_original['BI'] = (self.df_original['BI'] - bi_min) / (bi_max - bi_min)

        self.df_original['UB'] = self.df_original[['UB1', 'UB2']].mean(axis=1)

    def analyze_missing_values(self):
        """Analyze and report missing values in the dataset."""
        print("\n" + "="*60)
        print("MISSING VALUES ANALYSIS")
        print("="*60)

        # Find rows with missing values
        nan_rows = self.df_original[self.df_original.isna().any(axis=1)]
        print(f'Total rows with missing values: {len(nan_rows)}')

        if len(nan_rows) > 0:
            print('\nDetails of rows with missing values:')
            for index, row in nan_rows.iterrows():
                nan_cols = row[row.isna()].index.tolist()
                print(f"Row {index}: missing values in columns {nan_cols}")

        # Gender distribution
        gender_count = self.df_original['GDR'].value_counts(dropna=False)
        print(f'\nGender distribution:')
        for category, count in gender_count.items():
            print(f"  {category}: {count}")

    def prepare_training_data(self):
        """Prepare clean dataset for model training."""
        # Remove unnecessary columns and clean data
        self.df_clean = self.df_original.drop(['TIME', 'EE1', 'FC4', 'VOL2'], axis=1)
        self.df_clean = self.df_clean[self.df_clean['GDR'] != 'Annat']
        self.df_clean['GDR'] = self.df_clean['GDR'].map({'Kvinna': 0, 'Man': 1})
        self.df_clean = self.df_clean.dropna()
        self.df_clean.to_excel(f'{timestamp}_pure-data.xlsx', index=False)
        print(f"Clean dataset for training: {len(self.df_clean)} rows")

    def train_model(self, target_column, label_names):
        """Train a Random Forest model for the specified target."""
        print(f"\nTraining model for {target_column}...")

        # Prepare features (exclude target and composite variables)
        composite_vars = ['PE', 'EE', 'SI', 'FC', 'BI', 'UB']
        X = self.df_clean.drop(columns=[target_column] + composite_vars)
        y = self.df_clean[target_column]

        # Feature selection using Random Forest importance
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        rf_selector = RandomForestClassifier(n_estimators=100, random_state=42)
        rf_selector.fit(X_scaled, y)

        # Select top 8 features
        importance_df = pd.DataFrame({
            'Feature': X.columns,
            'Importance': rf_selector.feature_importances_
        })
        top_features = importance_df.sort_values('Importance', ascending=False).head(8)['Feature'].tolist()

        # Train final model with selected features
        X_selected = self.df_clean[top_features]
        X_selected_scaled = scaler.fit_transform(X_selected)
        X_train, X_test, y_train, y_test = train_test_split(
            X_selected_scaled, y, test_size=0.2, random_state=42
        )

        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)

        # Evaluate model
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"Model accuracy: {accuracy:.3f}")

        # Store model artifacts
        self.models[target_column] = model
        self.scalers[target_column] = scaler
        self.feature_sets[target_column] = top_features

        # Generate confusion matrix visualization
        self._create_confusion_matrix(y_test, y_pred, target_column, label_names)

        return model, scaler, top_features

    def _create_confusion_matrix(self, y_test, y_pred, target_name, label_names):
        """Create and save confusion matrix visualization."""
        cm = confusion_matrix(y_test, y_pred)

        # Calculate performance metrics
        tn, fp, fn, tp = cm.ravel() if cm.shape == (2, 2) else (cm[0,0], cm[0,1], cm[1,0], cm[1,1])
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        # Set academic formatting
        plt.rcParams.update({
            'font.family': 'serif',
            'font.serif': ['Times New Roman'],
            'font.size': 12,
            'axes.linewidth': 1.0,
            'axes.edgecolor': 'black'
        })

        # Create confusion matrix table
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.axis('tight')
        ax.axis('off')

        table_data = [
            ['', 'Predikterad klass', ''],
            ['', label_names[0], label_names[1]],
            [f'Verklig klass', '', ''],
            [label_names[0], str(cm[0,0]), str(cm[0,1])],
            [label_names[1], str(cm[1,0]), str(cm[1,1])]
        ]

        table = ax.table(cellText=table_data, loc='center', cellLoc='center')
        table.auto_set_font_size(False)
        table.set_fontsize(12)
        table.scale(1.2, 2)

        # Format table cells
        for i in range(len(table_data)):
            for j in range(len(table_data[0])):
                cell = table[(i, j)]
                if i == 0 or (i == 2 and j == 0):
                    cell.set_text_props(weight='bold', style='italic')
                elif i == 1 or (i > 2 and j == 0):
                    cell.set_text_props(weight='bold')

                if i == 0:
                    cell.set_linewidth(2)
                elif i == 2:
                    cell.set_linewidth(1.5)
                else:
                    cell.set_linewidth(1)

        # Add title and metrics
        target_display = 'STU-klassificering' if target_name == 'STU' else 'Könsklassificering'
        table_num = 1 if target_name == 'STU' else 2
        plt.suptitle(f'Tabell {table_num}. Konfusionsmatris för {target_display}',
                     fontsize=12, fontweight='bold', y=0.85)

        metrics_text = f"""Klassificeringsprestanda:
Noggrannhet: {accuracy:.3f} ({tp + tn}/{tp + tn + fp + fn})
Precision: {precision:.3f} ({tp}/{tp + fp})
Återkallelse: {recall:.3f} ({tp}/{tp + fn})
F1-Poäng: {f1:.3f}
Specificitet: {specificity:.3f} ({tn}/{tn + fp})

Not: Värdena representerar antalet prover klassificerade i varje kategori
(n = {len(y_test)} testprover)."""

        plt.figtext(0.1, 0.3, metrics_text, fontsize=10, verticalalignment='top',
                    bbox=dict(boxstyle="round,pad=0.3", facecolor="lightgray", alpha=0.3))

        plt.tight_layout()
        plt.subplots_adjust(bottom=0.4)

        filename = f'Tabell_{table_num}_Konfusionsmatris_{target_name}.png'
        plt.savefig(filename, dpi=300, bbox_inches='tight', facecolor='white')
        plt.close()

        print(f"Confusion matrix saved as: {filename}")

    def perform_imputation(self):
        """Perform detailed imputation tracking for missing values."""
        print("\n" + "="*60)
        print("DETAILED IMPUTATION TRACKING")
        print("="*60)

        df_imputation = self.df_original.copy()
        self.imputation_log = []

        # Initialize imputation tracking columns
        self._initialize_imputation_columns(df_imputation)

        # STU imputation
        self._impute_stu_values(df_imputation)

        # GDR imputation
        self._impute_gdr_values(df_imputation)

        # Save results
        self._save_imputation_results(df_imputation)

        return df_imputation

    def _initialize_imputation_columns(self, df):
        """Initialize columns to track which values were imputed."""
        # Create imputation tracking columns for each variable that might be imputed
        df['STU_IMPUTED'] = 0  # 0 = original, 1 = imputed
        df['GDR_IMPUTED'] = 0  # 0 = original, 1 = imputed

        # Mark rows where original data was missing or 'Annat'
        df.loc[df['STU'].isnull(), 'STU_IMPUTED'] = 1
        df.loc[df['GDR'] == 'Annat', 'GDR_IMPUTED'] = 1

        print(f"Initialized imputation tracking columns:")
        print(f"  STU values to be imputed: {df['STU_IMPUTED'].sum()}")
        print(f"  GDR values to be imputed: {df['GDR_IMPUTED'].sum()}")

    def _impute_stu_values(self, df):
        """Impute missing STU values using trained model."""
        print(f"\nSTU IMPUTATION:")
        print("-" * 40)

        stu_missing_mask = df['STU'].isnull()
        stu_missing_indices = df[stu_missing_mask].index.tolist()

        print(f"Rows with missing STU values: {len(stu_missing_indices)}")

        if len(stu_missing_indices) == 0:
            print("No STU values need imputation")
            return

        if 'STU' not in self.models:
            print("STU model not trained. Skipping STU imputation.")
            return

        try:
            df_missing = df[stu_missing_mask]
            features = self.feature_sets['STU']
            X_missing = df_missing[features]

            # Check for NaN values in required features
            if X_missing.isnull().any().any():
                print("Warning: NaN values in required features for STU prediction")
                valid_mask = ~X_missing.isnull().any(axis=1)
                X_valid = X_missing[valid_mask]
                valid_indices = X_valid.index.tolist()

                if len(valid_indices) > 0:
                    X_scaled = self.scalers['STU'].transform(X_valid)
                    predictions = self.models['STU'].predict(X_scaled)

                    for i, idx in enumerate(valid_indices):
                        old_value = df.loc[idx, 'STU']
                        new_value = predictions[i]
                        df.loc[idx, 'STU'] = new_value

                        self._log_imputation(idx, 'STU', old_value, new_value, 'Model prediction')
                        print(f"  Row {idx}: STU imputed from {old_value} to {new_value}")
            else:
                X_scaled = self.scalers['STU'].transform(X_missing)
                predictions = self.models['STU'].predict(X_scaled)

                for i, idx in enumerate(stu_missing_indices):
                    old_value = df.loc[idx, 'STU']
                    new_value = predictions[i]
                    df.loc[idx, 'STU'] = new_value

                    self._log_imputation(idx, 'STU', old_value, new_value, 'Model prediction')
                    print(f"  Row {idx}: STU imputed from {old_value} to {new_value}")

        except Exception as e:
            print(f"Error in STU imputation: {e}")

    def _impute_gdr_values(self, df):
        """Impute GDR values marked as 'Annat' using trained model."""
        print(f"\n\nGDR IMPUTATION WITH RANDOM FOREST:")
        print("-" * 40)

        gdr_annat_mask = df['GDR'] == 'Annat'
        gdr_annat_indices = df[gdr_annat_mask].index.tolist()

        print(f"Rows with GDR = 'Annat': {len(gdr_annat_indices)}")

        if len(gdr_annat_indices) == 0:
            print("No GDR values marked as 'Annat' to impute")
            return

        if 'GDR' not in self.models:
            print("GDR model not trained. Using fallback method.")
            self._fallback_gdr_imputation(df, gdr_annat_indices)
            return

        try:
            df_missing = df[gdr_annat_mask].copy()
            features = self.feature_sets['GDR']
            X_missing = df_missing[features]

            if X_missing.isnull().any().any():
                print("Warning: NaN values in required features for GDR prediction")
                valid_mask = ~X_missing.isnull().any(axis=1)
                X_valid = X_missing[valid_mask]
                valid_indices = X_valid.index.tolist()

                if len(valid_indices) > 0:
                    X_scaled = self.scalers['GDR'].transform(X_valid)
                    predictions = self.models['GDR'].predict(X_scaled)

                    for i, idx in enumerate(valid_indices):
                        old_value = df.loc[idx, 'GDR']
                        new_value = 'Kvinna' if predictions[i] == 0 else 'Man'
                        df.loc[idx, 'GDR'] = new_value

                        self._log_imputation(idx, 'GDR', old_value, new_value, 'Random Forest prediction')
                        print(f"  Row {idx}: GDR imputed from '{old_value}' to '{new_value}'")

                # Use fallback for remaining rows
                remaining_indices = [idx for idx in gdr_annat_indices if idx not in valid_indices]
                if remaining_indices:
                    self._fallback_gdr_imputation(df, remaining_indices)
            else:
                X_scaled = self.scalers['GDR'].transform(X_missing)
                predictions = self.models['GDR'].predict(X_scaled)

                for i, idx in enumerate(gdr_annat_indices):
                    old_value = df.loc[idx, 'GDR']
                    new_value = 'Kvinna' if predictions[i] == 0 else 'Man'
                    df.loc[idx, 'GDR'] = new_value

                    self._log_imputation(idx, 'GDR', old_value, new_value, 'Random Forest prediction')
                    print(f"  Row {idx}: GDR imputed from '{old_value}' to '{new_value}'")

        except Exception as e:
            print(f"Error in GDR Random Forest imputation: {e}")
            print("Using fallback method...")
            self._fallback_gdr_imputation(df, gdr_annat_indices)

    def _fallback_gdr_imputation(self, df, indices):
        """Fallback method for GDR imputation using majority class."""
        print(f"Using fallback (majority class) for {len(indices)} rows:")

        for idx in indices:
            old_value = df.loc[idx, 'GDR']
            new_value = 'Kvinna'  # Majority class
            df.loc[idx, 'GDR'] = new_value

            self._log_imputation(idx, 'GDR', old_value, new_value, 'Majority class (fallback)')
            print(f"  Row {idx}: GDR changed from '{old_value}' to '{new_value}' (fallback)")

    def _log_imputation(self, row_idx, column, old_value, new_value, method):
        """Log an imputation change."""
        self.imputation_log.append({
            'Row': row_idx,
            'Column': column,
            'Before': old_value,
            'After': new_value,
            'Method': method
        })

    def _save_imputation_results(self, df_imputation):
        """Save imputation results and generate summary."""
        print(f"\n\n" + "="*60)
        print("IMPUTATION SUMMARY")
        print("="*60)

        if self.imputation_log:
            print(f"Total imputed cells: {len(self.imputation_log)}")

            # Summary by column
            column_counts = Counter([change['Column'] for change in self.imputation_log])
            print(f"\nSummary by column:")
            for col, count in column_counts.items():
                print(f"  {col}: {count} imputations")

            # Save imputation log
            imputation_df = pd.DataFrame(self.imputation_log)
            imputation_df.to_excel(f'{timestamp}_imputation-log.xlsx', index=False)
            print(f"\nImputation log saved as: imputation-log.xlsx")

            # Detailed log
            print(f"\nDetailed imputation log:")
            for i, change in enumerate(self.imputation_log, 1):
                print(f"{i:2d}. Row {change['Row']:3d}, Column {change['Column']:3s}: "
                      f"{str(change['Before']):10s} → {str(change['After']):10s} "
                      f"({change['Method']})")
        else:
            print("No cells were imputed")

        # Summary of imputation tracking columns
        print(f"\nImputation tracking summary:")
        print(f"  STU_IMPUTED column: {df_imputation['STU_IMPUTED'].sum()} rows marked as imputed")
        print(f"  GDR_IMPUTED column: {df_imputation['GDR_IMPUTED'].sum()} rows marked as imputed")

        # Convert GDR to numeric before saving (but keep tracking columns as is)
        df_final = df_imputation.copy()
        df_final['GDR'] = df_final['GDR'].map({'Kvinna': 0, 'Man': 1})

        # Save final dataset with imputation tracking
        output_filename = f'{timestamp}_survey-teacher-data-with-imputation.xlsx'
        df_final.to_excel(output_filename, index=False)
        print(f"\nDataset with imputation tracking saved as: {output_filename}")


def main():
    """Main execution function."""
    start_time = time.time()

    # Initialize processor
    processor = SurveyDataProcessor(
        'DataPreprocessor/Enkät om gymnasielärares syn på språkmodeller i undervisningen (Svar).xlsx'
    )

    # Load and prepare data
    processor.load_and_prepare_data()

    # Analyze missing values
    processor.analyze_missing_values()

    # Prepare training data
    processor.prepare_training_data()

    # Train models
    print(f"\n" + "="*60)
    print("MODEL TRAINING")
    print("="*60)

    stu_model, stu_scaler, stu_features = processor.train_model('STU', ['Nej', 'Ja'])
    gdr_model, gdr_scaler, gdr_features = processor.train_model('GDR', ['Kvinna', 'Man'])

    print(f"STU important features: {stu_features}")
    print(f"GDR important features: {gdr_features}")

    # Perform imputation
    final_df = processor.perform_imputation()

    # Final summary
    end_time = time.time()
    print(f"\nAnalysis completed!")
    print(f"Execution time: {round(end_time - start_time, 2)} seconds")
    print(f"Clean dataset size (for training): {len(processor.df_clean)} rows")
    print(f"Final dataset size (with imputation): {len(final_df)} rows")
    print(f"Number of imputed cells: {len(processor.imputation_log)}")


if __name__ == "__main__":
    main()

Loading and preparing survey data...
Data loaded: 223 rows

MISSING VALUES ANALYSIS
Total rows with missing values: 3

Details of rows with missing values:
Row 0: missing values in columns ['STU']
Row 1: missing values in columns ['STU']
Row 135: missing values in columns ['STU']

Gender distribution:
  Man: 121
  Kvinna: 101
  Annat: 1
Clean dataset for training: 219 rows

MODEL TRAINING

Training model for STU...
Model accuracy: 0.750
Confusion matrix saved as: Tabell_1_Konfusionsmatris_STU.png

Training model for GDR...
Model accuracy: 0.591
Confusion matrix saved as: Tabell_2_Konfusionsmatris_GDR.png
STU important features: ['BI2', 'BI3', 'BI1', 'FC2', 'SI2', 'COUNTY', 'AGE', 'PE1']
GDR important features: ['SI1', 'COUNTY', 'AGE', 'SI3', 'EE3', 'SI2', 'FC3', 'EE2']

DETAILED IMPUTATION TRACKING
Initialized imputation tracking columns:
  STU values to be imputed: 3
  GDR values to be imputed: 1

STU IMPUTATION:
----------------------------------------
Rows with missing STU values: 3