# BACKWARD FEATURE ELIMINATION USING SVM MODEL.

Importing important libraries

In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from itertools import combinations

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report

import os
# Print the full path of every file found anywhere under /kaggle/input.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

Importing the dataset

In [35]:
# Read the maternal health CSV (path is relative to the working directory).
df = pd.read_csv('Maternal Health Risk Data Set.csv')

# Preview the first five rows; head() defaults to n=5.
df.head()

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel
0,25,130,80,15.0,98.0,86,high risk
1,35,140,90,13.0,98.0,70,high risk
2,29,90,70,8.0,100.0,80,high risk
3,30,140,85,7.0,98.0,70,high risk
4,35,120,60,6.1,98.0,76,low risk


Checking for duplicate rows and removing them.

In [36]:
# Count rows that exactly repeat an earlier row
n_duplicates = df.duplicated().sum()
print(f"Total number of duplicates: {n_duplicates}")

# Gather every member of each duplicate group (first occurrences included,
# hence keep=False) and sort on all columns so identical rows sit together
is_repeated = df.duplicated(keep=False)
duplicates = df[is_repeated].sort_values(by=list(df.columns))
print(duplicates)

# Keep only the first occurrence of each distinct row
df = df.drop_duplicates()

# Confirm that no repeated rows remain
n_remaining = df.duplicated().sum()
print(f"Total number of duplicates after removal: {n_remaining}")

Total number of duplicates: 562
     Age  SystolicBP  DiastolicBP    BS  BodyTemp  HeartRate  RiskLevel
670   10         100           50   6.0      99.0         70   mid risk
849   10         100           50   6.0      99.0         70   mid risk
552   12          90           60   7.5     102.0         60   low risk
940   12          90           60   7.5     102.0         60   low risk
543   12          90           60   7.5     102.0         66   low risk
..   ...         ...          ...   ...       ...        ...        ...
553   60         120           85  15.0      98.0         60   mid risk
772   60         120           85  15.0      98.0         60   mid risk
818   60         120           85  15.0      98.0         60   mid risk
114   63         140           90  15.0      98.0         90  high risk
502   63         140           90  15.0      98.0         90  high risk

[866 rows x 7 columns]
Total number of duplicates after removal: 0


Standardizing the features (zero mean, unit variance, using `StandardScaler`)

In [37]:
# Target column and feature matrix (everything except the target)
y = df['RiskLevel']
X = df.drop(columns='RiskLevel')

# Standardize each feature column; the statistics here are computed over
# every row of X
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# fit_transform returns a plain array, so restore the column labels
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# X_scaled_df has a fresh 0..n-1 index, so the target's index is reset to
# match before the two are placed side by side
target_aligned = y.reset_index(drop=True)
df_scaled = pd.concat([X_scaled_df, target_aligned], axis=1)

# Show the first few standardized rows
print(df_scaled.head())

        Age  SystolicBP  DiastolicBP        BS  BodyTemp  HeartRate  RiskLevel
0 -0.305021    1.089310     0.333484  2.354439 -0.491351   1.479009  high risk
1  0.422139    1.649455     1.061321  1.646744 -0.491351  -0.484676  high risk
2 -0.014157   -1.151273    -0.394352 -0.122492  0.927758   0.742627  high risk
3  0.058559    1.649455     0.697402 -0.476340 -0.491351  -0.484676  high risk
4  0.422139    0.529164    -1.122188 -0.794802 -0.491351   0.251706   low risk


# SVM Model Analysis

### With all six features

In [38]:
# Baseline: linear SVM trained on all features.

# Print the features being considered
features = X.columns.tolist()
print("Features being considered for the model:")
print(np.array(features))

# Split the UNSCALED features first. Splitting X_scaled (which was
# standardized using every row) would let the test rows influence the
# preprocessing statistics, i.e. leak test information into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply those same
# statistics to the held-out rows
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Initialize the SVM model
svm_model = SVC(kernel='linear', random_state=42)

# Train the SVM model
svm_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = svm_model.predict(X_test)

# Evaluate the model (weighted F1 averages per-class F1 by class support)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"\nAccuracy: {accuracy}")
print(f"F1 Score: {f1}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Features being considered for the model:
['Age' 'SystolicBP' 'DiastolicBP' 'BS' 'BodyTemp' 'HeartRate']

Accuracy: 0.6483516483516484
F1 Score: 0.5434223457330004

Classification Report:
              precision    recall  f1-score   support

   high risk       0.65      0.83      0.73        18
    low risk       0.66      0.94      0.77        47
    mid risk       0.00      0.00      0.00        26

    accuracy                           0.65        91
   macro avg       0.44      0.59      0.50        91
weighted avg       0.47      0.65      0.54        91



### With five features each time

In [39]:
# Leave-one-feature-out evaluation: train a linear SVM on each 5-feature subset.

# Dictionary to store results, keyed by the feature that was left out
results = {}

# Iterate through each feature to leave one out at a time
for feature in X.columns:

    # Create a new DataFrame without the current feature
    X_temp = X.drop(columns=[feature])

    # Split the unscaled data first so the test rows stay unseen while the
    # scaling statistics are computed
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_temp, y, test_size=0.2, random_state=42)

    # Fit the scaler on the training rows only and reuse those statistics for
    # the test rows; fitting on all rows would leak test information
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Initialize the SVM model
    svm_model = SVC(kernel='linear', random_state=42)

    # Train the SVM model
    svm_model.fit(X_train, y_train)

    # Make predictions on the testing set
    y_pred = svm_model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Store the results
    results[feature] = {
        'Features': X_temp.columns.tolist(),
        'Accuracy': accuracy,
        'F1 Score': f1
    }

# Output all results
print("\nSummary of results:")
for feature, result in results.items():
    print(f"\nLeaving out feature: {feature}")
    print(f"Features considered: {result['Features']}")
    print(f"Accuracy: {result['Accuracy']}")
    print(f"F1 Score: {result['F1 Score']}")


Summary of results:

Leaving out feature: Age
Features considered: ['SystolicBP', 'DiastolicBP', 'BS', 'BodyTemp', 'HeartRate']
Accuracy: 0.6483516483516484
F1 Score: 0.5434223457330004

Leaving out feature: SystolicBP
Features considered: ['Age', 'DiastolicBP', 'BS', 'BodyTemp', 'HeartRate']
Accuracy: 0.6373626373626373
F1 Score: 0.5336837075967511

Leaving out feature: DiastolicBP
Features considered: ['Age', 'SystolicBP', 'BS', 'BodyTemp', 'HeartRate']
Accuracy: 0.6483516483516484
F1 Score: 0.5434223457330004

Leaving out feature: BS
Features considered: ['Age', 'SystolicBP', 'DiastolicBP', 'BodyTemp', 'HeartRate']
Accuracy: 0.5934065934065934
F1 Score: 0.4917844060701203

Leaving out feature: BodyTemp
Features considered: ['Age', 'SystolicBP', 'DiastolicBP', 'BS', 'HeartRate']
Accuracy: 0.6373626373626373
F1 Score: 0.5287713576745834

Leaving out feature: HeartRate
Features considered: ['Age', 'SystolicBP', 'DiastolicBP', 'BS', 'BodyTemp']
Accuracy: 0.6483516483516484
F1 Score: 0.

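Summary: Dropping HeartRate, DiastolicBP, or Age leaves the accuracy at its highest value (about 0.648), the same as with all six features, so none of these three features appears to be needed on its own.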

### With four features

In [40]:
# Leave-two-features-out evaluation: train a linear SVM on each 4-feature subset.

# Dictionary to store results, keyed by the comma-joined names of the dropped features
results = {}

# Iterate through each pair of features to leave out at a time
for feature_pair in combinations(X.columns, 2):

    # Create a new DataFrame without the current feature pair
    features_to_drop = list(feature_pair)
    X_temp = X.drop(columns=features_to_drop)

    # Split the unscaled data first so the test rows stay unseen while the
    # scaling statistics are computed
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_temp, y, test_size=0.2, random_state=42)

    # Fit the scaler on the training rows only and reuse those statistics for
    # the test rows; fitting on all rows would leak test information
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Initialize the SVM model
    svm_model = SVC(kernel='linear', random_state=42)

    # Train the SVM model
    svm_model.fit(X_train, y_train)

    # Make predictions on the testing set
    y_pred = svm_model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Store the results
    results[", ".join(features_to_drop)] = {
        'Features': X_temp.columns.tolist(),
        'Accuracy': accuracy,
        'F1 Score': f1
    }

# Output all results
print("\nSummary of results:")
for feature_pair, result in results.items():
    print(f"\nLeaving out features: {feature_pair}")
    print(f"Features considered: {result['Features']}")
    print(f"Accuracy: {result['Accuracy']}")
    print(f"F1 Score: {result['F1 Score']}")


Summary of results:

Leaving out features: Age, SystolicBP
Features considered: ['DiastolicBP', 'BS', 'BodyTemp', 'HeartRate']
Accuracy: 0.6373626373626373
F1 Score: 0.5336837075967511

Leaving out features: Age, DiastolicBP
Features considered: ['SystolicBP', 'BS', 'BodyTemp', 'HeartRate']
Accuracy: 0.6483516483516484
F1 Score: 0.5434223457330004

Leaving out features: Age, BS
Features considered: ['SystolicBP', 'DiastolicBP', 'BodyTemp', 'HeartRate']
Accuracy: 0.6043956043956044
F1 Score: 0.5028165112198726

Leaving out features: Age, BodyTemp
Features considered: ['SystolicBP', 'DiastolicBP', 'BS', 'HeartRate']
Accuracy: 0.6373626373626373
F1 Score: 0.5287713576745834

Leaving out features: Age, HeartRate
Features considered: ['SystolicBP', 'DiastolicBP', 'BS', 'BodyTemp']
Accuracy: 0.6483516483516484
F1 Score: 0.5604052980850412

Leaving out features: SystolicBP, DiastolicBP
Features considered: ['Age', 'BS', 'BodyTemp', 'HeartRate']
Accuracy: 0.6483516483516484
F1 Score: 0.539955

Summary: When we drop (Age, DiastolicBP) or (Age, HeartRate) or (SystolicBP, DiastolicBP) or (DiastolicBP, HeartRate), we get the highest accuracy.

### With three features

In [41]:
# Leave-three-features-out evaluation: train a linear SVM on each 3-feature subset.

# Dictionary to store results, keyed by the comma-joined names of the dropped features
results = {}

# Iterate through each combination of three features to leave out at a time
for feature_combination in combinations(X.columns, 3):

    # Create a new DataFrame without the current combination of features
    features_to_drop = list(feature_combination)
    X_temp = X.drop(columns=features_to_drop)

    # Split the unscaled data first so the test rows stay unseen while the
    # scaling statistics are computed
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_temp, y, test_size=0.2, random_state=42)

    # Fit the scaler on the training rows only and reuse those statistics for
    # the test rows; fitting on all rows would leak test information
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Initialize the SVM model
    svm_model = SVC(kernel='linear', random_state=42)

    # Train the SVM model
    svm_model.fit(X_train, y_train)

    # Make predictions on the testing set
    y_pred = svm_model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Store the results
    results[", ".join(features_to_drop)] = {
        'Features': X_temp.columns.tolist(),
        'Accuracy': accuracy,
        'F1 Score': f1
    }

# Output all results
print("\nSummary of results:")
for feature_combination, result in results.items():
    print(f"\nLeaving out features: {feature_combination}")
    print(f"Features considered: {result['Features']}")
    print(f"Accuracy: {result['Accuracy']}")
    print(f"F1 Score: {result['F1 Score']}")


Summary of results:

Leaving out features: Age, SystolicBP, DiastolicBP
Features considered: ['BS', 'BodyTemp', 'HeartRate']
Accuracy: 0.6483516483516484
F1 Score: 0.5399554846002355

Leaving out features: Age, SystolicBP, BS
Features considered: ['DiastolicBP', 'BodyTemp', 'HeartRate']
Accuracy: 0.5384615384615384
F1 Score: 0.4384615384615385

Leaving out features: Age, SystolicBP, BodyTemp
Features considered: ['DiastolicBP', 'BS', 'HeartRate']
Accuracy: 0.6373626373626373
F1 Score: 0.5287713576745834

Leaving out features: Age, SystolicBP, HeartRate
Features considered: ['DiastolicBP', 'BS', 'BodyTemp']
Accuracy: 0.6263736263736264
F1 Score: 0.5270903010033445

Leaving out features: Age, DiastolicBP, BS
Features considered: ['SystolicBP', 'BodyTemp', 'HeartRate']
Accuracy: 0.6043956043956044
F1 Score: 0.5029832965055637

Leaving out features: Age, DiastolicBP, BodyTemp
Features considered: ['SystolicBP', 'BS', 'HeartRate']
Accuracy: 0.6373626373626373
F1 Score: 0.5287713576745834



Summary: When we drop (Age, SystolicBP, DiastolicBP) or (Age, DiastolicBP, HeartRate) or (SystolicBP, DiastolicBP, HeartRate) we get the highest accuracy.

### With two features

In [42]:
# Leave-four-features-out evaluation: train a linear SVM on each 2-feature subset.

# Dictionary to store results, keyed by the comma-joined names of the dropped features
results = {}

# Iterate through each combination of four features to leave out at a time
for feature_combination in combinations(X.columns, 4):

    # Create a new DataFrame without the current combination of features
    features_to_drop = list(feature_combination)
    X_temp = X.drop(columns=features_to_drop)

    # Split the unscaled data first so the test rows stay unseen while the
    # scaling statistics are computed
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_temp, y, test_size=0.2, random_state=42)

    # Fit the scaler on the training rows only and reuse those statistics for
    # the test rows; fitting on all rows would leak test information
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Initialize the SVM model
    svm_model = SVC(kernel='linear', random_state=42)

    # Train the SVM model
    svm_model.fit(X_train, y_train)

    # Make predictions on the testing set
    y_pred = svm_model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Store the results
    results[", ".join(features_to_drop)] = {
        'Features': X_temp.columns.tolist(),
        'Accuracy': accuracy,
        'F1 Score': f1
    }

# Output all results
print("\nSummary of results:")
for feature_combination, result in results.items():
    print(f"\nLeaving out features: {feature_combination}")
    print(f"Features considered: {result['Features']}")
    print(f"Accuracy: {result['Accuracy']}")
    print(f"F1 Score: {result['F1 Score']}")


Summary of results:

Leaving out features: Age, SystolicBP, DiastolicBP, BS
Features considered: ['BodyTemp', 'HeartRate']
Accuracy: 0.5274725274725275
F1 Score: 0.4115973312401883

Leaving out features: Age, SystolicBP, DiastolicBP, BodyTemp
Features considered: ['BS', 'HeartRate']
Accuracy: 0.6373626373626373
F1 Score: 0.5287713576745834

Leaving out features: Age, SystolicBP, DiastolicBP, HeartRate
Features considered: ['BS', 'BodyTemp']
Accuracy: 0.6483516483516484
F1 Score: 0.5399554846002355

Leaving out features: Age, SystolicBP, BS, BodyTemp
Features considered: ['DiastolicBP', 'HeartRate']
Accuracy: 0.5494505494505495
F1 Score: 0.4398395255538113

Leaving out features: Age, SystolicBP, BS, HeartRate
Features considered: ['DiastolicBP', 'BodyTemp']
Accuracy: 0.5164835164835165
F1 Score: 0.40310483167626016

Leaving out features: Age, SystolicBP, BodyTemp, HeartRate
Features considered: ['DiastolicBP', 'BS']
Accuracy: 0.6373626373626373
F1 Score: 0.5287713576745834

Leaving out

Summary: When we drop (Age, SystolicBP, DiastolicBP, HeartRate) we get the highest accuracy.

### With one feature

In [43]:
# Leave-five-features-out evaluation: train a linear SVM on each single feature.

# Dictionary to store results, keyed by the comma-joined names of the dropped features
results = {}

# Iterate through each combination of five features to leave out at a time
for feature_combination in combinations(X.columns, 5):

    # Create a new DataFrame without the current combination of features
    features_to_drop = list(feature_combination)
    X_temp = X.drop(columns=features_to_drop)

    # Split the unscaled data first so the test rows stay unseen while the
    # scaling statistics are computed
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_temp, y, test_size=0.2, random_state=42)

    # Fit the scaler on the training rows only and reuse those statistics for
    # the test rows; fitting on all rows would leak test information
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Initialize the SVM model
    svm_model = SVC(kernel='linear', random_state=42)

    # Train the SVM model
    svm_model.fit(X_train, y_train)

    # Make predictions on the testing set
    y_pred = svm_model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Store the results
    results[", ".join(features_to_drop)] = {
        'Features': X_temp.columns.tolist(),
        'Accuracy': accuracy,
        'F1 Score': f1
    }

# Output all results
print("\nSummary of results:")
for feature_combination, result in results.items():
    print(f"\nLeaving out features: {feature_combination}")
    print(f"Features considered: {result['Features']}")
    print(f"Accuracy: {result['Accuracy']}")
    print(f"F1 Score: {result['F1 Score']}")


Summary of results:

Leaving out features: Age, SystolicBP, DiastolicBP, BS, BodyTemp
Features considered: ['HeartRate']
Accuracy: 0.5164835164835165
F1 Score: 0.35180761267717797

Leaving out features: Age, SystolicBP, DiastolicBP, BS, HeartRate
Features considered: ['BodyTemp']
Accuracy: 0.5274725274725275
F1 Score: 0.4124444656619515

Leaving out features: Age, SystolicBP, DiastolicBP, BodyTemp, HeartRate
Features considered: ['BS']
Accuracy: 0.6373626373626373
F1 Score: 0.5287713576745834

Leaving out features: Age, SystolicBP, BS, BodyTemp, HeartRate
Features considered: ['DiastolicBP']
Accuracy: 0.5164835164835165
F1 Score: 0.35180761267717797

Leaving out features: Age, DiastolicBP, BS, BodyTemp, HeartRate
Features considered: ['SystolicBP']
Accuracy: 0.5164835164835165
F1 Score: 0.35180761267717797

Leaving out features: SystolicBP, DiastolicBP, BS, BodyTemp, HeartRate
Features considered: ['Age']
Accuracy: 0.5164835164835165
F1 Score: 0.35180761267717797


Summary: Here, using Blood Sugar (BS) alone gives the highest accuracy (about 0.637); every other single feature scores about 0.52.

#### Overall Summary

1. The highest accuracy obtained here is 0.6483516483516484.
2. Backward elimination shows that two features, Blood Sugar and Body Temperature, affect the result (target) much more than the other features: when 4 of the 6 features are dropped, only this pair still reaches the highest accuracy. The other evaluations are consistent with this.
3. Therefore, we conclude that Blood Sugar and Body Temperature are the two features that most affect the result according to the linear SVM, when using an 80% training set and a 20% test set.