In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
import joblib
from openpyxl import Workbook
from openpyxl.styles import PatternFill

# Load dataset
def load_data(filepath):
    """Read a CSV file and return a cleaned DataFrame.

    Cleaning steps, in order:
      1. strip surrounding whitespace from the column names;
      2. strip surrounding whitespace from every text cell;
      3. turn empty / whitespace-only cells into NaN;
      4. drop every row that contains at least one NaN.

    Args:
        filepath: Path (or any source ``pd.read_csv`` accepts) of the CSV file.

    Returns:
        The cleaned ``pandas.DataFrame``. Surviving rows keep the index
        labels they had right after reading; the index is not reset.
    """
    data = pd.read_csv(filepath)
    data.columns = data.columns.str.strip()

    # "string" is listed next to "object" so text columns are still found
    # when pandas stores them with its dedicated string dtype.
    text_columns = data.select_dtypes(include=["object", "string"]).columns
    for col in text_columns:
        data[col] = data[col].str.strip()

    # The clean-up and the return live outside the loop on purpose: they
    # must run exactly once, and also when the file has no text columns.
    data = data.replace(r'^\s*$', np.nan, regex=True)
    return data.dropna()

# Preprocess data
def preprocess_data(data):
    """Encode, round and scale the dataset for model training.

    The three categorical columns ('Gender', 'Marital_Status',
    'Schizophrenia') are label-encoded, the five symptom columns are rounded
    to two decimals, and the seven feature columns are standardized.

    The input frame is NOT modified: all work happens on a copy. Encoding in
    place would replace the original labels with integers, so calling this
    function a second time on the same frame would fit the encoders on
    integers and ``inverse_transform`` could no longer return the labels.

    Args:
        data: DataFrame containing at least the columns 'Gender',
            'Marital_Status', 'Schizophrenia', 'Fatigue', 'Slowing', 'Pain',
            'Hygiene' and 'Movement'.

    Returns:
        A tuple ``(X_scaled, y, le_gender, le_marital_status,
        le_schizophrenia, scaler, feature_columns)`` where ``X_scaled`` is
        the output of ``StandardScaler.fit_transform`` on the feature
        columns, ``y`` is the encoded 'Schizophrenia' column, the three
        ``le_*`` objects and ``scaler`` are the fitted transformers, and
        ``feature_columns`` is the ordered list of feature names.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Work on a copy so the caller's frame keeps its original labels.
    data = data.copy()

    # Encode categorical variables
    le_gender = LabelEncoder()
    le_marital_status = LabelEncoder()
    le_schizophrenia = LabelEncoder()

    data['Gender'] = le_gender.fit_transform(data['Gender'])
    data['Marital_Status'] = le_marital_status.fit_transform(data['Marital_Status'])
    data['Schizophrenia'] = le_schizophrenia.fit_transform(data['Schizophrenia'])

    for column in ['Fatigue', 'Slowing', 'Pain', 'Hygiene', 'Movement']:
        data[column] = data[column].round(2)

    # Select only the features for the model
    feature_columns = ['Gender', 'Marital_Status', 'Fatigue', 'Slowing', 'Pain', 'Hygiene', 'Movement']
    X = data[feature_columns]
    y = data['Schizophrenia']

    # Standardize the features
    # NOTE(review): the scaler is fitted on every row passed in; if the
    # caller splits into train/test afterwards, test rows have influenced
    # the scaling statistics.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    return X_scaled, y, le_gender, le_marital_status, le_schizophrenia, scaler, feature_columns

def save_model(model, scaler, le_gender, le_marital_status, le_schizophrenia, feature_columns, filename="model.pkl"):
    """Persist the model and its preprocessing objects in one file.

    Everything is bundled into a single dictionary and written with
    ``joblib.dump``. A confirmation line is printed afterwards.

    Args:
        model: The trained estimator.
        scaler: The scaler object to store alongside the model.
        le_gender: Encoder stored under the key "le_gender".
        le_marital_status: Encoder stored under the key "le_marital_status".
        le_schizophrenia: Encoder stored under the key "le_schizophrenia".
        feature_columns: Feature names stored under "feature_columns".
        filename: Destination path of the dump (default "model.pkl").
    """
    bundle = dict(
        model=model,
        scaler=scaler,
        le_gender=le_gender,
        le_marital_status=le_marital_status,
        le_schizophrenia=le_schizophrenia,
        feature_columns=feature_columns,
    )
    joblib.dump(bundle, filename)
    print(f"Model and preprocessing objects saved to {filename}")

# Train the selected model (fixed hyperparameters; no tuning is performed)
def train_model(X_train, y_train, algorithm="random_forest"):
    """Fit and return a classifier of the requested kind.

    Args:
        X_train: Training feature matrix, passed straight to ``fit``.
        y_train: Training labels, passed straight to ``fit``.
        algorithm: One of "random_forest", "svm" or "xgboost".

    Returns:
        The fitted estimator: a ``RandomForestClassifier``, an ``SVC`` or an
        ``XGBClassifier`` depending on ``algorithm``.

    Raises:
        ValueError: If ``algorithm`` is not one of the supported names.
    """
    if algorithm == "random_forest":
        model = RandomForestClassifier(random_state=42)
    elif algorithm == "svm":
        # probability=True makes predict_proba available on the fitted SVC.
        model = SVC(kernel="rbf", C=10, gamma=0.1, probability=True, class_weight="balanced")
    elif algorithm == "xgboost":
        model = XGBClassifier(random_state=42, eval_metric='mlogloss')
    else:
        # The message lists every supported option, including 'xgboost'.
        raise ValueError(
            f"Unsupported algorithm {algorithm!r}. "
            "Choose 'random_forest', 'svm' or 'xgboost'."
        )

    model.fit(X_train, y_train)
    return model

# Save data to Excel with colored rows based on schizophrenia levels
def save_test_data_to_excel(test_data, test_labels, le_schizophrenia, filepath="test_data.xlsx"):
    """Write the test rows to an Excel sheet, coloring each row by its label.

    A 'Schizophrenia' column holding the decoded labels is appended to a copy
    of ``test_data`` (the caller's frame is left untouched), the result is
    written to the sheet "Test Data", and every cell of each data row is
    filled with the color assigned to that row's encoded label.

    Args:
        test_data: DataFrame with one row per test sample.
        test_labels: Encoded (integer) labels, one per row of ``test_data``
            and in the same order.
        le_schizophrenia: Fitted encoder used to decode ``test_labels``.
        filepath: Destination ``.xlsx`` path (default "test_data.xlsx").
    """
    # Colors are keyed by the ENCODED label, so the lookup below must use the
    # integer codes from ``test_labels`` and not the decoded text.
    # NOTE(review): the level names in these comments come from the original
    # author; confirm them against ``le_schizophrenia.classes_``.
    color_map = {
        0: "FFDDC1",  # Elevated Proneness (light red)
        1: "C1E1C1",  # Very High Proneness (light green)
        2: "D1E1FF",  # High Proneness (light blue)
        3: "FFDDFF",  # Low Proneness (light pink)
        4: "FFF5C1"   # Moderate Proneness (light yellow)
    }

    export = test_data.copy()
    export['Schizophrenia'] = le_schizophrenia.inverse_transform(test_labels)
    column_count = export.shape[1]

    # Create an Excel writer
    with pd.ExcelWriter(filepath, engine='openpyxl') as writer:
        export.to_excel(writer, index=False, sheet_name="Test Data")
        worksheet = writer.sheets["Test Data"]

        # Sheet row 1 is the header, so data rows start at 2. Rows are
        # addressed by position, not by DataFrame index label, because the
        # index may be non-contiguous after a train/test split.
        for sheet_row, code in enumerate(test_labels, start=2):
            color = color_map.get(code, "FFFFFF")  # Default white if not found
            fill = PatternFill(start_color=color, end_color=color, fill_type="solid")
            for sheet_col in range(1, column_count + 1):
                worksheet.cell(row=sheet_row, column=sheet_col).fill = fill

    print(f"Test data saved to {filepath}")

# Main workflow
def main():
    """Run the interactive train / evaluate / export workflow.

    Steps: load and preprocess the CSV, ask the user for the algorithm and
    the test-set fraction, split the data, oversample the training part with
    SMOTE, train the model, print its accuracy on the test part, save the
    model bundle, and export the test rows to a colored Excel file.
    """
    filepath = "SchizophreniaSymptomnsData.csv"

    # Load and preprocess data
    data = load_data(filepath)
    X, y, le_gender, le_marital_status, le_schizophrenia, scaler, feature_columns = preprocess_data(data)

    # Ask user for algorithm choice
    algorithm = input("Which algorithm do you want to use? (random_forest/svm/xgboost): ").strip().lower()

    # Ask for test size
    test_size = float(input("Enter test size (e.g., 0.2 for 20% of data): ").strip())

    # Split dataset into training and testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=39)

    # Show the rows used for testing
    print(f"Rows used for testing (test_size={test_size}):")
    print(X_test)

    # Handle class imbalance using SMOTE (training part only, so the test
    # rows stay untouched by the oversampling)
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # Train the model
    model = train_model(X_train_resampled, y_train_resampled, algorithm=algorithm)

    # Test the model
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy: {accuracy * 100:.2f}%")

    save_model(model, scaler, le_gender, le_marital_status, le_schizophrenia, feature_columns)

    # Save the test data with schizophrenia level coloring to an Excel file.
    # X_test is a plain array, so the feature names are re-attached here;
    # otherwise the sheet headers would just be 0..6.
    test_frame = pd.DataFrame(X_test, columns=feature_columns)
    save_test_data_to_excel(test_frame, y_test, le_schizophrenia, "test_data_colored.xlsx")

# Start the interactive workflow only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


ModuleNotFoundError: No module named 'xgboost'