In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
import joblib
# Load dataset
def load_data(filepath):
    """Load a CSV file and return a whitespace-cleaned DataFrame.

    Cleaning steps, in order:
      1. Strip leading/trailing whitespace from the column names.
      2. Strip leading/trailing whitespace from every string cell.
      3. Treat empty or whitespace-only cells as missing values.
      4. Drop every row that contains at least one missing value.

    Args:
        filepath: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: The cleaned data. Rows keep their original index
        labels, so the index can have gaps where rows were dropped.
    """
    def _strip_cell(value):
        # Only real strings are stripped; numbers and NaN pass through as-is.
        return value.strip() if isinstance(value, str) else value

    data = pd.read_csv(filepath)
    data.columns = data.columns.str.strip()

    # A single element-wise pass over the non-numeric columns. Unlike the
    # ``.str`` accessor, this does not turn non-string values that share a
    # column with strings into NaN (which dropna() would then discard).
    data = data.apply(
        lambda col: col if pd.api.types.is_numeric_dtype(col) else col.map(_strip_cell)
    )

    # Cells that are empty after stripping count as missing.
    data = data.replace(r'^\s*$', np.nan, regex=True)
    return data.dropna()



# Preprocess data
def preprocess_data(data):
    """Encode, round and standardize the dataset for model training.

    The categorical columns ``Gender``, ``Marital_Status`` and the target
    ``Schizophrenia`` are label-encoded, the five symptom scores are rounded
    to two decimals, and the seven feature columns are standardized.

    The input frame is not modified; all work happens on a copy.

    NOTE(review): the scaler is fitted on every row passed in. ``main`` calls
    this before ``train_test_split``, so test rows influence the scaling
    statistics. Fixing that requires returning unscaled features, which would
    change this function's return contract.

    Args:
        data: DataFrame containing the columns ``Gender``, ``Marital_Status``,
            ``Schizophrenia``, ``Fatigue``, ``Slowing``, ``Pain``,
            ``Hygiene`` and ``Movement``.

    Returns:
        tuple: ``(X_scaled, y, le_gender, le_marital_status,
        le_schizophrenia, scaler, feature_columns)`` where ``X_scaled`` is the
        standardized feature array, ``y`` the encoded target Series, the three
        ``le_*`` objects are the fitted label encoders, ``scaler`` is the
        fitted ``StandardScaler`` and ``feature_columns`` lists the feature
        names in the column order of ``X_scaled``.
    """
    # Work on a copy so the caller's DataFrame keeps its original values.
    data = data.copy()

    le_gender = LabelEncoder()
    le_marital_status = LabelEncoder()
    le_schizophrenia = LabelEncoder()

    data['Gender'] = le_gender.fit_transform(data['Gender'])
    data['Marital_Status'] = le_marital_status.fit_transform(data['Marital_Status'])
    data['Schizophrenia'] = le_schizophrenia.fit_transform(data['Schizophrenia'])

    symptom_columns = ['Fatigue', 'Slowing', 'Pain', 'Hygiene', 'Movement']
    data[symptom_columns] = data[symptom_columns].round(2)

    # Column order here defines the feature order the model is trained on.
    feature_columns = ['Gender', 'Marital_Status', *symptom_columns]
    X = data[feature_columns]
    y = data['Schizophrenia']

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    return X_scaled, y, le_gender, le_marital_status, le_schizophrenia, scaler, feature_columns

def save_model(model, scaler, le_gender, le_marital_status, le_schizophrenia, feature_columns, filename="model.pkl"):
    """Persist the model and its preprocessing objects in a single file.

    Everything is bundled into one dictionary and written with
    ``joblib.dump``, then a confirmation line is printed.

    Args:
        model: Trained estimator.
        scaler: Fitted feature scaler.
        le_gender: Fitted label encoder for the gender column.
        le_marital_status: Fitted label encoder for the marital-status column.
        le_schizophrenia: Fitted label encoder for the target column.
        feature_columns: Feature names in training order.
        filename: Destination path of the dump (default ``"model.pkl"``).
    """
    bundle = dict(
        model=model,
        scaler=scaler,
        le_gender=le_gender,
        le_marital_status=le_marital_status,
        le_schizophrenia=le_schizophrenia,
        feature_columns=feature_columns,
    )
    joblib.dump(bundle, filename)
    print(f"Model and preprocessing objects saved to {filename}")




# Train and tune model
# Train model with specified algorithm
def train_model(X_train, y_train, algorithm="random_forest", *, tune_svm=False):
    """Train a classifier with the requested algorithm.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        algorithm: ``"random_forest"`` runs a randomized hyperparameter
            search (100 candidates, 5-fold CV) over a random forest and
            returns the best estimator. ``"svm"`` trains an RBF-kernel SVC.
        tune_svm: Only used when ``algorithm == "svm"``. When ``False``
            (default) the SVC is fitted once with fixed ``C=10`` and
            ``gamma=0.1``. When ``True`` a 3-fold grid search over ``C``,
            ``gamma`` and the kernel is run and the best estimator returned.

    Returns:
        The fitted estimator.

    Raises:
        ValueError: If ``algorithm`` is neither ``"random_forest"`` nor
            ``"svm"``.
    """
    if algorithm == "random_forest":
        rf = RandomForestClassifier(random_state=42)

        # Search space sampled by the randomized search below.
        param_dist = {
            'n_estimators': [100, 200, 300, 500, 1000],
            'max_depth': [10, 20, 30, 50, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['sqrt', 'log2', None],
            'bootstrap': [True, False]
        }

        rf_random = RandomizedSearchCV(
            estimator=rf,
            param_distributions=param_dist,
            n_iter=100,  # Number of combinations to try
            cv=5,        # Cross-validation folds
            verbose=2,
            random_state=42,
            n_jobs=-1    # Use all available cores
        )

        rf_random.fit(X_train, y_train)
        print(f"Best Parameters: {rf_random.best_params_}")
        return rf_random.best_estimator_

    if algorithm == "svm":
        model = SVC(
            kernel="rbf",
            C=10,
            gamma=0.1,
            probability=True,
            class_weight="balanced"
        )

        if not tune_svm:
            # Default path: one fit with the fixed hyperparameters above.
            model.fit(X_train, y_train)
            return model

        # Opt-in grid search; each candidate overrides C/gamma/kernel on a
        # clone of the SVC configured above.
        param_grid = {
            'C': [0.1, 1, 10, 100],     # Regularization parameter
            'gamma': [1, 0.1, 0.01, 0.001],  # Kernel coefficient
            'kernel': ['rbf', 'linear']  # Kernels to try
        }
        grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
        grid_search.fit(X_train, y_train)

        print("Best Parameters:", grid_search.best_params_)
        return grid_search.best_estimator_

    raise ValueError("Unsupported algorithm. Choose 'random_forest' or 'svm'.")

# Predict Schizophrenia stage
def predict_stage(model, scaler, le_schizophrenia, user_input):
    """Return the decoded class label predicted for one sample.

    Args:
        model: Fitted estimator exposing ``predict``.
        scaler: Fitted scaler exposing ``transform``.
        le_schizophrenia: Fitted target encoder exposing
            ``inverse_transform``.
        user_input: Feature values of a single sample, already encoded and
            in training column order.

    Returns:
        The first (and only) decoded label for the sample.
    """
    # Wrap the single sample so the scaler receives a one-row batch.
    single_row = [user_input]
    encoded_label = model.predict(scaler.transform(single_row))
    decoded = le_schizophrenia.inverse_transform(encoded_label)
    return decoded[0]

# Main workflow
def main(filepath="C:/myproject/SchizophreniaSymptomnsData3.csv"):
    """Train, evaluate, save and interactively query the SVM classifier.

    Workflow:
      1. Load and preprocess the CSV at ``filepath``.
      2. Hold out 16% of the rows as a test set.
      3. Oversample the training split with SMOTE.
      4. Train the SVM and print its accuracy on the test set.
      5. Write the test rows with actual and predicted labels to
         ``predictions_with_actual_and_predicted.csv``.
      6. Save the model and preprocessing objects via ``save_model``.
      7. Prompt for patient details in a loop and print the prediction.

    Args:
        filepath: Location of the training CSV. Defaults to the path that
            was previously hard-coded; replace with your actual file path.
    """
    # Load and preprocess data
    data = load_data(filepath)
    X, y, le_gender, le_marital_status, le_schizophrenia, scaler, feature_columns = preprocess_data(data)

    # Split dataset into training and testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.16, random_state=42)

    # Oversample the training split only, so the test set keeps the
    # original class distribution.
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    print("Class distribution after SMOTE:")
    print(pd.Series(y_train_resampled).value_counts())

    # Train the model using SVM
    model = train_model(X_train_resampled, y_train_resampled, algorithm="svm")

    # Test the model
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy: {accuracy * 100:.2f}%")

    # np.asarray replaces Series.ravel(), which pandas has deprecated.
    # Decode each label array once and reuse the result below.
    actual_labels = le_schizophrenia.inverse_transform(np.asarray(y_test))
    predicted_labels = le_schizophrenia.inverse_transform(np.asarray(y_pred))

    print("\nPredictions on the test set:")
    predicted_df = pd.DataFrame({
        'Actual_Schizophrenia': actual_labels,
        'Predicted_Schizophrenia': predicted_labels,
    })
    print(predicted_df.head())

    # The feature values written out are the standardized ones.
    test_data_with_predictions = pd.DataFrame(X_test, columns=feature_columns)
    test_data_with_predictions['Actual_Schizophrenia'] = actual_labels
    test_data_with_predictions['Predicted_Schizophrenia'] = predicted_labels
    test_data_with_predictions.to_csv('predictions_with_actual_and_predicted.csv', index=False)

    print("Predictions saved to 'predictions_with_actual_and_predicted.csv'")

    print("\nTest set with predictions saved to CSV:")
    print(test_data_with_predictions.head())

    save_model(model, scaler, le_gender, le_marital_status, le_schizophrenia, feature_columns)

    # User input for prediction and validation
    while True:
        print("\nEnter details for prediction (or type 'exit' to quit):")
        name = input("Name: ").strip()  # Name is only for display, not used in features
        if name.lower() == "exit":
            # Honour the 'exit' option announced in the prompt above.
            break

        try:
            # Age is still asked for and validated as a number, but it is
            # not one of the model's features.
            _age = float(input("Age: "))
            gender = input("Gender (Male/Female): ").strip()
            marital_status = input("Marital Status (Single/Married/Widowed/Divorced): ").strip()
            fatigue = float(input("Fatigue (value between -1 and 1): "))
            slowing = float(input("Slowing (value between -1 and 1): "))
            pain = float(input("Pain (value between -1 and 1): "))
            hygiene = float(input("Hygiene (value between -1 and 1): "))
            movement = float(input("Movement (value between -1 and 1): "))

            # Order must match the training feature columns.
            features = [
                le_gender.transform([gender])[0],
                le_marital_status.transform([marital_status])[0],
                round(fatigue, 2),
                round(slowing, 2),
                round(pain, 2),
                round(hygiene, 2),
                round(movement, 2),
            ]
            predicted_stage = predict_stage(model, scaler, le_schizophrenia, features)
        except ValueError as e:
            # Raised for non-numeric input and for categories the label
            # encoders did not see during training.
            print(f"Error: {e}")
        else:
            print(f"\n{name} is predicted to have: {predicted_stage}")

        if input("Do you want to continue? (yes/no): ").strip().lower() != "yes":
            break
            

# Run the workflow only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()


Class distribution after SMOTE:
Schizophrenia
3    1360
0    1360
1    1360
2    1360
4    1360
Name: count, dtype: int64


  y_test = y_test.ravel()  # Convert y_test to 1D if it's not already


Model Accuracy: 96.17%

Predictions on the test set:
  Actual_Schizophrenia Predicted_Schizophrenia
0   Elevated Proneness      Moderate Proneness
1       High Proneness          High Proneness
2   Moderate Proneness      Elevated Proneness
3       High Proneness          High Proneness
4   Elevated Proneness      Elevated Proneness
Predictions saved to 'predictions_with_actual_and_predicted.csv'

Test set with predictions saved to CSV:
     Gender  Marital_Status   Fatigue   Slowing      Pain   Hygiene  Movement  \
0  1.012346       -0.422095 -1.359709  0.317104  1.268401 -0.988235  0.050673   
1 -0.987804        1.351997  0.891291  0.612163  1.102967  1.022764  0.712673   
2 -0.987804        0.464951 -1.338848  1.378839 -0.410982 -0.988235 -0.053853   
3  1.012346        1.351997 -0.009444  1.833880 -1.241143  1.227272  1.095936   
4  1.012346        1.351997  1.093266 -0.220532 -1.462616 -0.238371  0.886884   

  Actual_Schizophrenia Predicted_Schizophrenia  
0   Elevated Proneness 

Name:  njbd
Age:  55
Gender (Male/Female):  Female
Marital Status (Single/Married/Widowed/Divorced):  Widowed
Fatigue (value between -1 and 1):  0.0898
Slowing (value between -1 and 1):  0.0658
Pain (value between -1 and 1):  0.0666
Hygiene (value between -1 and 1):  0.01235
Movement (value between -1 and 1):  0.03333





njbd is predicted to have: Low Proneness
