In [2]:
import pandas as pd
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load the dataset
file_path = 'DataSource.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Step 2: Data Preprocessing
# Handle missing values
data['Smoker'].fillna(data['Smoker'].mean(), inplace=True)
data['Minor'].fillna(data['Minor'].mode()[0], inplace=True)
data['Major'].fillna(data['Major'].mode()[0], inplace=True)

# Encode categorical columns
label_encoder_minor = LabelEncoder()
label_encoder_major = LabelEncoder()
data['Minor'] = label_encoder_minor.fit_transform(data['Minor'])
data['Major'] = label_encoder_major.fit_transform(data['Major'])

# Separate features (X) and target (y)
X = data.drop(columns=['Complication', 'ID'])  # Exclude 'Complication' and 'ID'
y = data['Complication']

# Normalize the features
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Train the Gaussian Mixture Model
n_classes = len(y.unique())  # Number of unique classes in 'Complication'
gmm = GaussianMixture(n_components=n_classes, random_state=42)
gmm.fit(X_train)

# Step 4: Predict and Evaluate
# Assign clusters to classes based on the highest posterior probability
y_train_pred = gmm.predict(X_train)
y_test_pred = gmm.predict(X_test)

# Evaluation metrics
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
classification_report_output = classification_report(y_test, y_test_pred)

# Print the results
print(f"Train Accuracy: {train_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")
print("Classification Report:")
print(classification_report_output)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Smoker'].fillna(data['Smoker'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Minor'].fillna(data['Minor'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on wh

Train Accuracy: 7.83%
Test Accuracy: 6.25%
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.08      0.14       103
           1       0.03      0.67      0.06         3
           2       0.00      0.00      0.00        10
           3       0.00      0.00      0.00         2
           4       0.50      0.02      0.04        45
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         7
           7       0.00      0.00      0.00         4
           8       0.00      0.00      0.00         2

    accuracy                           0.06       176
   macro avg       0.16      0.09      0.03       176
weighted avg       0.65      0.06      0.10       176



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [4]:
import pandas as pd
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load the dataset
file_path = 'DataSource.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Step 2: Data Preprocessing
# Handle missing values
data['Smoker'].fillna(data['Smoker'].mean(), inplace=True)
data['Minor'].fillna(data['Minor'].mode()[0], inplace=True)
data['Major'].fillna(data['Major'].mode()[0], inplace=True)

# Encode categorical columns
label_encoder_minor = LabelEncoder()
label_encoder_major = LabelEncoder()
data['Minor'] = label_encoder_minor.fit_transform(data['Minor'])
data['Major'] = label_encoder_major.fit_transform(data['Major'])

# Separate features (X) and target (y)
X = data.drop(columns=['Complication', 'ID'])  # Exclude 'Complication' and 'ID'
y = data['Complication']

# Normalize the features
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Train the Gaussian Mixture Model
n_classes = len(y.unique())  # Number of unique classes in 'Complication'
gmm = GaussianMixture(n_components=n_classes, random_state=42)
gmm.fit(X_train)

# Step 4: Predict and Evaluate
# Assign clusters to classes based on the highest posterior probability
y_train_pred = gmm.predict(X_train)
y_test_pred = gmm.predict(X_test)

# Evaluation metrics
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
classification_report_output = classification_report(y_test, y_test_pred)

# Print the results
print(f"Train Accuracy: {train_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")
print("Classification Report:")
print(classification_report_output)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Smoker'].fillna(data['Smoker'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Minor'].fillna(data['Minor'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on wh

Train Accuracy: 7.83%
Test Accuracy: 6.25%
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.08      0.14       103
           1       0.03      0.67      0.06         3
           2       0.00      0.00      0.00        10
           3       0.00      0.00      0.00         2
           4       0.50      0.02      0.04        45
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         7
           7       0.00      0.00      0.00         4
           8       0.00      0.00      0.00         2

    accuracy                           0.06       176
   macro avg       0.16      0.09      0.03       176
weighted avg       0.65      0.06      0.10       176



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [5]:
import pandas as pd
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load the dataset
file_path = 'DataSource.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Step 2: Data Preprocessing
# Handle missing values
data['Smoker'].fillna(data['Smoker'].mean(), inplace=True)
data['Minor'].fillna(data['Minor'].mode()[0], inplace=True)
data['Major'].fillna(data['Major'].mode()[0], inplace=True)

# Encode categorical columns
label_encoder_minor = LabelEncoder()
label_encoder_major = LabelEncoder()
data['Minor'] = label_encoder_minor.fit_transform(data['Minor'])
data['Major'] = label_encoder_major.fit_transform(data['Major'])


data.drop(columns=["Minor", "Major"], inplace=True)

# Separate features (X) and target (y)
X = data.drop(columns=['Complication', 'ID'])  # Exclude 'Complication' and 'ID'
y = data['Complication']

# Normalize the features
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Train the Gaussian Mixture Model
n_classes = len(y.unique())  # Number of unique classes in 'Complication'
gmm = GaussianMixture(n_components=n_classes, random_state=42)
gmm.fit(X_train)

# Step 4: Predict and Evaluate
# Assign clusters to classes based on the highest posterior probability
y_train_pred = gmm.predict(X_train)
y_test_pred = gmm.predict(X_test)

# Evaluation metrics
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
classification_report_output = classification_report(y_test, y_test_pred)

# Print the results
print(f"Train Accuracy: {train_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")
print("Classification Report:")
print(classification_report_output)


Train Accuracy: 10.54%
Test Accuracy: 10.23%
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.05      0.09       103
           1       0.00      0.00      0.00         3
           2       0.00      0.00      0.00        10
           3       0.00      0.00      0.00         2
           4       0.19      0.24      0.21        45
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         7
           7       0.12      0.50      0.19         4
           8       0.00      0.00      0.00         2

    accuracy                           0.10       176
   macro avg       0.13      0.09      0.06       176
weighted avg       0.54      0.10      0.11       176



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Smoker'].fillna(data['Smoker'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Minor'].fillna(data['Minor'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on wh