In [None]:
%pip install pandas scikit-learn imbalanced-learn matplotlib seaborn


In [None]:
%pip install numpy plotly psutil


In [None]:
# First, ensure all libraries are installed and compatible in this notebook session.
!pip install imbalanced-learn==0.10.1

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# --- Part 1: Data Preparation ---

# 1. Define the file path and the specific columns for analysis.
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Female complete Data CSV.csv"

columns_for_analysis = [
    'Principal Diagnosis Class',
    'Alcohol Related Disorder',
    'Drug Substance Disorder',
    'Opioid Related Disorder',
    'Serious Mental Illness',
    'Alcohol 12m Service',
    'Opioid 12m Service',
    'Drug/Substance 12m Service'
]

# 2. Load the dataset and select the specified columns.
try:
    df = pd.read_csv(file_path)
    df_selected = df[columns_for_analysis]
    print("Columns loaded successfully.")
except KeyError as e:
    print(f"Error: A column name was not found. Please check your spelling.")
    print(f"Details: {e}")
    raise
except FileNotFoundError:
    print("Error: File not found. Please check your file path.")
    raise

# 3. Handle non-numeric data using one-hot encoding.
df_processed = pd.get_dummies(df_selected, dummy_na=False)
df_processed = df_processed.fillna(df_processed.mean(numeric_only=True))

# 4. Separate features (X) and target (y).
target_variable = 'Principal Diagnosis Class'
y_labels = df_selected[target_variable].unique().tolist()
y_labels = [cls for cls in y_labels if pd.notna(cls) and isinstance(cls, str)]

X = df_processed.drop(columns=[f'{target_variable}_{cls}' for cls in y_labels])
y = df_processed[[f'{target_variable}_{cls}' for cls in y_labels]]
y = y.idxmax(axis=1).str.replace(f'{target_variable}_', '', regex=False)

# 5. Split data into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# --- Part 2: Hyperparameter Tuning ---
print("\n--- Hyperparameter Tuning: Finding the best n_neighbors ---")
best_k = 0
best_accuracy = 0

for k in range(1, 15, 2):
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(X_train, y_train)
    y_pred_k = knn_model.predict(X_test)
    accuracy_k = accuracy_score(y_test, y_pred_k)
    print(f"Accuracy for n_neighbors = {k}: {accuracy_k:.4f}")
    if accuracy_k > best_accuracy:
        best_accuracy = accuracy_k
        best_k = k

print(f"\nOptimal n_neighbors found: {best_k} with accuracy of {best_accuracy:.4f}")

# --- Part 3: Data Balancing with SMOTE ---
print("\n--- Balancing the Dataset with SMOTE ---")

# 1. Scale the data. Scaling is crucial before applying SMOTE.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 2. Apply SMOTE to the training data.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

print(f"Original training set size: {X_train.shape[0]}")
print(f"Resampled training set size: {X_train_resampled.shape[0]}")

# --- Part 4: Rerunning the Model with Optimized Parameters ---
print("\n--- Model Evaluation on Resampled Data with Optimal 'k' ---")

# 1. Train the KNN classifier with the best 'k' and resampled data.
knn_model_balanced = KNeighborsClassifier(n_neighbors=best_k)
knn_model_balanced.fit(X_train_resampled, y_train_resampled)

# 2. Make predictions on the original, un-resampled test data.
y_pred_balanced = knn_model_balanced.predict(X_test_scaled)

# 3. Evaluate the model.
accuracy_balanced = accuracy_score(y_test, y_pred_balanced)
print(f"Accuracy on original test data: {accuracy_balanced:.4f}\n")

report_balanced = classification_report(y_test, y_pred_balanced)
print("Classification Report on Resampled Data:\n", report_balanced)

In [None]:
# --- Library Setup ---
# Run this in a separate cell if needed:
# !pip install scikit-learn==1.3.2
# !pip install imbalanced-learn==0.13.0

import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

warnings.filterwarnings('ignore')

# --- Configuration ---
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Female complete Data CSV.csv"
columns_for_analysis = [
    'Principal Diagnosis Class',
    'Alcohol Related Disorder',
    'Drug Substance Disorder',
    'Opioid Related Disorder',
    'Serious Mental Illness',
    'Alcohol 12m Service',
    'Opioid 12m Service',
    'Drug/Substance 12m Service'
]
target_variable = 'Principal Diagnosis Class'

# --- Load and Prepare Data ---
try:
    df = pd.read_csv(file_path)
    df_selected = df[columns_for_analysis]
    print("Columns loaded successfully.")
except KeyError as e:
    print("Error: Column name not found.")
    print(f"Details: {e}")
    raise
except FileNotFoundError:
    print("Error: File not found.")
    raise

# One-hot encode categorical variables
df_encoded = pd.get_dummies(df_selected, dummy_na=False)
df_encoded.fillna(df_encoded.mean(numeric_only=True), inplace=True)

# Extract target labels
y_classes = [cls for cls in df_selected[target_variable].unique() if pd.notna(cls) and isinstance(cls, str)]
target_columns = [f'{target_variable}_{cls}' for cls in y_classes]

# Separate features and target
X = df_encoded.drop(columns=target_columns)
y = df_encoded[target_columns].idxmax(axis=1).str.replace(f'{target_variable}_', '', regex=False)

# --- Train-Test Split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# --- Hyperparameter Tuning ---
print("\nHyperparameter Tuning: Finding the best n_neighbors")
best_k = 0
best_accuracy = 0

for k in range(1, 15, 2):
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"n_neighbors = {k}: Accuracy = {acc:.4f}")
    if acc > best_accuracy:
        best_accuracy = acc
        best_k = k

print(f"\nOptimal n_neighbors: {best_k} with accuracy: {best_accuracy:.4f}")

# --- Data Balancing with SMOTE ---
print("\nBalancing the Dataset with SMOTE")

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

print(f"Original training size: {len(X_train)}")
print(f"Resampled training size: {len(X_train_resampled)}")

# --- Final Model Evaluation ---
print("\nModel Evaluation on Resampled Data")

model_balanced = KNeighborsClassifier(n_neighbors=best_k)
model_balanced.fit(X_train_resampled, y_train_resampled)
y_pred_balanced = model_balanced.predict(X_test_scaled)

final_accuracy = accuracy_score(y_test, y_pred_balanced)
print(f"Accuracy on test data: {final_accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_balanced))

In [None]:
pip uninstall scikit-learn imbalanced-learn -y

In [None]:
pip install scikit-learn==1.3.2 imbalanced-learn==0.10.1

In [None]:
# --- Library Imports ---
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

warnings.filterwarnings('ignore')

# --- Configuration ---
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Female complete Data CSV.csv"
columns_for_analysis = [
    'Principal Diagnosis Class',
    'Alcohol Related Disorder',
    'Drug Substance Disorder',
    'Opioid Related Disorder',
    'Serious Mental Illness',
    'Alcohol 12m Service',
    'Opioid 12m Service',
    'Drug/Substance 12m Service'
]
target_variable = 'Principal Diagnosis Class'

# --- Load and Prepare Data ---
try:
    df = pd.read_csv(file_path)
    df_selected = df[columns_for_analysis]
    print("Columns loaded successfully.")
except KeyError as e:
    print("Column name not found.")
    print(f"Details: {e}")
    raise
except FileNotFoundError:
    print("File not found.")
    raise

# One-hot encode categorical variables
df_encoded = pd.get_dummies(df_selected, dummy_na=False)
df_encoded.fillna(df_encoded.mean(numeric_only=True), inplace=True)

# Extract target labels
y_classes = [cls for cls in df_selected[target_variable].unique() if pd.notna(cls) and isinstance(cls, str)]
target_columns = [f'{target_variable}_{cls}' for cls in y_classes]

# Separate features and target
X = df_encoded.drop(columns=target_columns)
y = df_encoded[target_columns].idxmax(axis=1).str.replace(f'{target_variable}_', '', regex=False)

# --- Train-Test Split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# --- Hyperparameter Tuning ---
print("\nHyperparameter Tuning: Finding the best n_neighbors")
best_k = 0
best_accuracy = 0

for k in range(1, 15, 2):
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"n_neighbors = {k}: Accuracy = {acc:.4f}")
    if acc > best_accuracy:
        best_accuracy = acc
        best_k = k

print(f"\nOptimal n_neighbors: {best_k} with accuracy: {best_accuracy:.4f}")

# --- Data Balancing with SMOTE ---
print("\nBalancing the Dataset with SMOTE")

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

print(f"Original training size: {len(X_train)}")
print(f"Resampled training size: {len(X_train_resampled)}")

# --- Final Model Evaluation ---
print("\nModel Evaluation on Resampled Data")

model_balanced = KNeighborsClassifier(n_neighbors=best_k)
model_balanced.fit(X_train_resampled, y_train_resampled)
y_pred_balanced = model_balanced.predict(X_test_scaled)

final_accuracy = accuracy_score(y_test, y_pred_balanced)
print(f"Accuracy on test data: {final_accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_balanced))

In [None]:
pip uninstall scikit-learn imbalanced-learn -y

In [None]:
pip install scikit-learn==1.3.2 imbalanced-learn==0.10.1

In [1]:
from imblearn.over_sampling import SMOTE
print("SMOTE imported successfully.")

ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (C:\Users\arunc\AppData\Roaming\Python\Python312\site-packages\sklearn\utils\_param_validation.py)

In [2]:
pip install scikit-learn==1.2.2 imbalanced-learn==0.10.1 --force-reinstall

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn==1.2.2
  Downloading scikit-learn-1.2.2.tar.gz (7.3 MB)
     ---------------------------------------- 0.0/7.3 MB ? eta -:--:--
     - -------------------------------------- 0.3/7.3 MB ? eta -:--:--
     ---- ----------------------------------- 0.8/7.3 MB 1.8 MB/s eta 0:00:04
     ------- -------------------------------- 1.3/7.3 MB 2.2 MB/s eta 0:00:03
     ----------- ---------------------------- 2.1/7.3 MB 2.6 MB/s eta 0:00:02
     --------------- ------------------------ 2.9/7.3 MB 2.8 MB/s eta 0:00:02
     -------------------- ------------------- 3.7/7.3 MB 3.1 MB/s eta 0:00:02
     ------------------------- -------------- 4.7/7.3 MB 3.3 MB/s eta 0:00:01
     ------------------------------- -------- 5.8/7.3 MB 3.6 MB/s eta 0:00:01
     -------------------------------------- - 7.1/7.3 MB 3.9 MB/s eta 0:00:01
     ---------------------------------------- 7.3/7.3 MB 3.7 MB/s eta 0:00:0

  error: subprocess-exited-with-error
  
  Preparing metadata (pyproject.toml) did not run successfully.
  exit code: 1
  
  [33 lines of output]
  Partial import of sklearn during the build process.
  Traceback (most recent call last):
    File "C:\ProgramData\anaconda3\Lib\site-packages\pip\_vendor\pyproject_hooks\_in_process\_in_process.py", line 353, in <module>
      main()
    File "C:\ProgramData\anaconda3\Lib\site-packages\pip\_vendor\pyproject_hooks\_in_process\_in_process.py", line 335, in main
      json_out['return_val'] = hook(**hook_input['kwargs'])
                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "C:\ProgramData\anaconda3\Lib\site-packages\pip\_vendor\pyproject_hooks\_in_process\_in_process.py", line 149, in prepare_metadata_for_build_wheel
      return hook(metadata_directory, config_settings)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "C:\Users\arunc\AppData\Local\Temp\pip-build-env-7r_umrr9\overlay\Lib\site-packages\setuptools\

In [3]:
pip install scikit-learn==1.2.2 --only-binary :all:

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement scikit-learn==1.2.2 (from versions: 1.3.1, 1.3.2, 1.4.0rc1, 1.4.0, 1.4.1.post1, 1.4.2, 1.5.0rc1, 1.5.0, 1.5.1, 1.5.2, 1.6.0rc1, 1.6.0, 1.6.1, 1.7.0rc1, 1.7.0, 1.7.1)
ERROR: No matching distribution found for scikit-learn==1.2.2


In [1]:
pip install scikit-learn==1.3.1


Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn==1.3.1
  Downloading scikit_learn-1.3.1-cp312-cp312-win_amd64.whl.metadata (11 kB)
Downloading scikit_learn-1.3.1-cp312-cp312-win_amd64.whl (9.1 MB)
   ---------------------------------------- 0.0/9.1 MB ? eta -:--:--
   --- ------------------------------------ 0.8/9.1 MB 4.2 MB/s eta 0:00:02
   -------- ------------------------------- 1.8/9.1 MB 4.6 MB/s eta 0:00:02
   ----------- ---------------------------- 2.6/9.1 MB 4.3 MB/s eta 0:00:02
   ------------- -------------------------- 3.1/9.1 MB 4.3 MB/s eta 0:00:02
   ----------------- ---------------------- 3.9/9.1 MB 3.9 MB/s eta 0:00:02
   -------------------- ------------------- 4.7/9.1 MB 3.8 MB/s eta 0:00:02
   ------------------------ --------------- 5.5/9.1 MB 3.8 MB/s eta 0:00:01
   --------------------------- ------------ 6.3/9.1 MB 3.8 MB/s eta 0:00:01
   -------------------------------- ------- 7.3/9.1 MB 3.9 MB/s eta 0:00

In [None]:
pip install scikit-learn --upgrade


In [1]:
import sys
print(sys.executable)


C:\ProgramData\anaconda3\python.exe


In [2]:
conda install scikit-learn=1.3.1


error: bad escape \P at position 28

In [3]:
!conda install scikit-learn=1.3.1 -y


Channels:
 - defaults
Platform: win-64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... failed



PackagesNotFoundError: The following packages are not available from current channels:

  - scikit-learn=1.3.1*

Current channels:

  - defaults
  - https://repo.anaconda.com/pkgs/main
  - https://repo.anaconda.com/pkgs/r
  - https://repo.anaconda.com/pkgs/msys2

To search for alternate channels that may provide the conda package you're
looking for, navigate to

    https://anaconda.org

and use the search bar at the top of the page.




In [4]:
conda install -c conda-forge scikit-learn=1.3.1


error: bad escape \P at position 28

In [5]:
!conda install -c conda-forge scikit-learn=1.3.1 -y


Channels:
 - conda-forge
 - defaults
Platform: win-64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\ProgramData\anaconda3

  added / updated specs:
    - scikit-learn=1.3.1


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2025.8.3   |       h4c7d964_0         151 KB  conda-forge
    certifi-2025.8.3           |     pyhd8ed1ab_0         155 KB  conda-forge
    conda-24.11.3              |  py312h2e8e312_0         1.1 MB  conda-forge
    libexpat-2.6.3             |       he0c23c2_0         136 KB  conda-forge
    libsqlite-3.50.4           |       hf5d6505_0         1.2 MB  conda-forge
    libzlib-1.2.13             |       h2466b09_6          55 KB  conda-forge
    openssl-3.5.2              |       h725018a_0         8.8 MB  conda-forge
    python-3.12.3            


EnvironmentNotWritableError: The current user does not have write permissions to the target environment.
  environment location: C:\ProgramData\anaconda3




In [6]:
%pip install scikit-learn==1.3.1

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn==1.3.1
  Using cached scikit_learn-1.3.1-cp312-cp312-win_amd64.whl.metadata (11 kB)
Using cached scikit_learn-1.3.1-cp312-cp312-win_amd64.whl (9.1 MB)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.7.1
    Uninstalling scikit-learn-1.7.1:
      Successfully uninstalled scikit-learn-1.7.1
Successfully installed scikit-learn-1.3.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
import sklearn
print(sklearn.__version__)


1.3.1


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Female complete Data CSV.csv"
df = pd.read_csv(file_path)

# Selected columns for analysis
selected_columns = [
    "Sexual Orientation",
    "Living Situation",
    "Household Composition",
    "Religious Preference",
    "Employment Status",
    "Number Of Hours Worked Each Week",
    "Education Status",
    "Mental Illness"
]

# Filter the dataset to include only selected columns
df_selected = df[selected_columns].copy()

# Drop rows with missing values
df_selected.dropna(inplace=True)

# Encode categorical variables
label_encoders = {}
for column in selected_columns:
    le = LabelEncoder()
    df_selected[column] = le.fit_transform(df_selected[column].astype(str))
    label_encoders[column] = le

# Split features and target
X = df_selected.drop("Mental Illness", axis=1)
y = df_selected["Mental Illness"]

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Predict and evaluate
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Output results
print(f"Accuracy: {accuracy:.4f}\n")
print("Classification Report:")
print(report)


Accuracy: 0.9638

Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.01      0.03       495
           1       0.34      0.05      0.09       218
           2       0.96      1.00      0.98     19136

    accuracy                           0.96     19849
   macro avg       0.65      0.35      0.37     19849
weighted avg       0.95      0.96      0.95     19849



In [3]:
import pandas as pd

# Load your dataset
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Female complete Data CSV.csv"
df = pd.read_csv(file_path)

# Display unique values in the 'Mental Illness' column
print("Unique values in 'Mental Illness' column:")
print(df["Mental Illness"].value_counts())


Unique values in 'Mental Illness' column:
Mental Illness
YES        95693
NO          2563
UNKNOWN      988
Name: count, dtype: int64


In [4]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# Load the dataset
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Female complete Data CSV.csv"
df = pd.read_csv(file_path)

# Selected columns for analysis
selected_columns = [
    "Sexual Orientation",
    "Living Situation",
    "Household Composition",
    "Religious Preference",
    "Employment Status",
    "Number Of Hours Worked Each Week",
    "Education Status",
    "Mental Illness"
]

# Filter and clean the dataset
df_selected = df[selected_columns].copy()
df_selected.dropna(inplace=True)

# Encode categorical variables
label_encoders = {}
for column in selected_columns:
    le = LabelEncoder()
    df_selected[column] = le.fit_transform(df_selected[column].astype(str))
    label_encoders[column] = le

# Split features and target
X = df_selected.drop("Mental Illness", axis=1)
y = df_selected["Mental Illness"]

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_resampled)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_resampled, test_size=0.2, random_state=42)

# Train KNN model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train

SyntaxError: incomplete input (231720427.py, line 53)

In [5]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# Load the dataset
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Female complete Data CSV.csv"
df = pd.read_csv(file_path)

# Selected columns for analysis
selected_columns = [
    "Sexual Orientation",
    "Living Situation",
    "Household Composition",
    "Religious Preference",
    "Employment Status",
    "Number Of Hours Worked Each Week",
    "Education Status",
    "Mental Illness"
]

# Filter and clean the dataset
df_selected = df[selected_columns].copy()
df_selected.dropna(inplace=True)

# Encode categorical variables
label_encoders = {}
for column in selected_columns:
    le = LabelEncoder()
    df_selected[column] = le.fit_transform(df_selected[column].astype(str))
    label_encoders[column] = le

# Split features and target
X = df_selected.drop("Mental Illness", axis=1)
y = df_selected["Mental Illness"]

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_resampled)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_resampled, test_size=0.2, random_state=42)

# Train KNN model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)  #  Fixed the missing parenthesis here

# Predict and evaluate
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Output results
print(f"Accuracy after SMOTE: {accuracy:.4f}\n")
print("Classification Report after SMOTE:")
print(report)


ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (C:\Users\arunc\AppData\Roaming\Python\Python312\site-packages\sklearn\utils\_param_validation.py)

In [6]:
# Upgrade scikit-learn and imbalanced-learn
!pip install --upgrade scikit-learn imbalanced-learn


Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Using cached scikit_learn-1.7.1-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting imbalanced-learn
  Using cached imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp312-cp312-win_amd64.whl.metadata (15 kB)
Using cached imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
Using cached scikit_learn-1.6.1-cp312-cp312-win_amd64.whl (11.1 MB)
Installing collected packages: scikit-learn, imbalanced-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.3.1
    Uninstalling scikit-learn-1.3.1:
      Successfully uninstalled scikit-learn-1.3.1
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanced-learn 0.10.1
    Uninstalling imbalanced-learn-0.10.1:
      Successfully uninstalled imbalanced-learn-0.10.1
Successfully installed imbalanced-learn-0.13.0 scikit-

  You can safely remove it manually.


In [7]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# Load the dataset with encoding fallback
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Female complete Data CSV.csv"
try:
    df = pd.read_csv(file_path)
except UnicodeDecodeError:
    df = pd.read_csv(file_path, encoding='ISO-8859-1')

# Selected columns
selected_columns = [
    "Sexual Orientation",
    "Living Situation",
    "Household Composition",
    "Religious Preference",
    "Employment Status",
    "Number Of Hours Worked Each Week",
    "Education Status",
    "Mental Illness"
]

# Filter and clean
df_selected = df[selected_columns].copy()
df_selected.dropna(inplace=True)

# Encode categorical variables
label_encoders = {}
for column in selected_columns:
    le = LabelEncoder()
    df_selected[column] = le.fit_transform(df_selected[column].astype(str))
    label_encoders[column] = le

# Split features and target
X = df_selected.drop("Mental Illness", axis=1)
y = df_selected["Mental Illness"]

# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_resampled)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_resampled, test_size=0.2, random_state=42)

# Train KNN
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Predict and evaluate
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Output results
print(f"Accuracy after SMOTE: {accuracy:.4f}\n")
print("Classification Report after SMOTE:")
print(report)


ImportError: cannot import name 'tarfile_extractall' from 'sklearn.utils.fixes' (C:\Users\arunc\AppData\Roaming\Python\Python312\site-packages\sklearn\utils\fixes.py)

In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

# Load the dataset
file_path = r"C:\Users\arunc\OneDrive\Desktop\Python Project\Female data consolidated\Female complete Data CSV.csv"
df = pd.read_csv(file_path)

# Select relevant columns
selected_columns = [
    'Sexual Orientation',
    'Living Situation',
    'Household Composition',
    'Religious Preference',
    'Employment Status',
    'Number Of Hours Worked Each Week',
    'Education Status',
    'Mental Illness'
]
df_selected = df[selected_columns].copy()

# Encode categorical variables
label_encoders = {}
for column in df_selected.columns:
    if df_selected[column].dtype == 'object':
        le = LabelEncoder()
        df_selected[column] = le.fit_transform(df_selected[column].astype(str))
        label_encoders[column] = le

# Separate features and target
X = df_selected.drop('Mental Illness', axis=1)
y = df_selected['Mental Illness']

# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Combine resampled data into a DataFrame
df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
df_resampled['Mental Illness'] = y_resampled

# Save the resampled dataset (optional)
df_resampled.to_csv("resampled_female_data.csv", index=False)

print("SMOTE applied successfully. Resampled data saved to 'resampled_female_data.csv'.")


SMOTE applied successfully. Resampled data saved to 'resampled_female_data.csv'.
