In [3]:
import pandas as pd

# Load the uploaded data to inspect its structure and content
file_path = 'DataSource.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset and its summary
data.head(), data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ID             878 non-null    int64  
 1   age            878 non-null    int64  
 2   Pedicle        878 non-null    int64  
 3   Breast Wieght  878 non-null    int64  
 4   SNN            878 non-null    int64  
 5   PMH            878 non-null    int64  
 6   Smoker         876 non-null    float64
 7   Minor          876 non-null    object 
 8   Major          876 non-null    object 
 9   BMI            878 non-null    int64  
 10  Complication   878 non-null    int64  
dtypes: float64(1), int64(8), object(2)
memory usage: 75.6+ KB


(   ID  age  Pedicle  Breast Wieght  SNN  PMH  Smoker Minor Major  BMI  \
 0   1    2        4              1    1    0     0.0     0     0    1   
 1   2    2        1              2    1    1     1.0     1     0    1   
 2   3    1        1              2    2    0     0.0     1     0    2   
 3   4    1        1              2    2    2     0.0     1     0    3   
 4   5    1        1              2    1    0     0.0     0     0    2   
 
    Complication  
 0             0  
 1             4  
 2             3  
 3             1  
 4             0  ,
 None)

In [5]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Drop the 'Complication' column as it's the result column
data_features = data.drop(columns=['Complication'])

# Handle missing values: Fill numerical column 'Smoker' with its mean
data_features['Smoker'].fillna(data_features['Smoker'].mean(), inplace=True)

# Fill categorical columns 'Minor' and 'Major' with their mode
data_features['Minor'].fillna(data_features['Minor'].mode()[0], inplace=True)
data_features['Major'].fillna(data_features['Major'].mode()[0], inplace=True)

# Encode categorical columns 'Minor' and 'Major'
label_encoder = LabelEncoder()
data_features['Minor'] = label_encoder.fit_transform(data_features['Minor'])
data_features['Major'] = label_encoder.fit_transform(data_features['Major'])

# Normalize the data (excluding the 'ID' column, which is just an identifier)
scaler = StandardScaler()
columns_to_scale = data_features.drop(columns=['ID']).columns
data_features[columns_to_scale] = scaler.fit_transform(data_features[columns_to_scale])

# Display the preprocessed data
data_features.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_features['Smoker'].fillna(data_features['Smoker'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_features['Minor'].fillna(data_features['Minor'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work b

Unnamed: 0,ID,age,Pedicle,Breast Wieght,SNN,PMH,Smoker,Minor,Major,BMI
0,1,0.799065,3.137604,-1.349841,-1.363345,-0.402925,-0.153029,-0.705996,-0.130613,-1.304547
1,2,0.799065,-0.47448,-0.076844,-1.363345,0.215551,6.549635,1.38814,-0.130613,-1.304547
2,3,-1.009128,-0.47448,-0.076844,-0.057983,-0.402925,-0.153029,1.38814,-0.130613,0.09912
3,4,-1.009128,-0.47448,-0.076844,-0.057983,0.834026,-0.153029,1.38814,-0.130613,1.502787
4,5,-1.009128,-0.47448,-0.076844,-1.363345,-0.402925,-0.153029,-0.705996,-0.130613,0.09912


In [7]:
from sklearn.cluster import KMeans

# Set the number of clusters to 9 and fit the K-means model
kmeans = KMeans(n_clusters=9, random_state=42)
data_features['Cluster'] = kmeans.fit_predict(data_features.drop(columns=['ID']))

# Add the cluster assignments to the dataset and display the cluster counts
cluster_counts = data_features['Cluster'].value_counts()

data_features[['ID', 'Cluster']].head(), cluster_counts

(   ID  Cluster
 0   1        7
 1   2        5
 2   3        3
 3   4        3
 4   5        4,
 Cluster
 4    181
 1    175
 3    165
 8    101
 2     99
 0     63
 7     56
 5     20
 6     18
 Name: count, dtype: int64)

In [12]:
def predict_clusters(new_data, model, scaler, label_encoder_minor, label_encoder_major):
    """
    Preprocesses and predicts the cluster for new data points.
    
    Parameters:
        new_data (DataFrame): The new data to be clustered.
        model (KMeans): The trained k-means model.
        scaler (StandardScaler): The scaler used to normalize data.
        label_encoder_minor (LabelEncoder): Encoder for 'Minor' column.
        label_encoder_major (LabelEncoder): Encoder for 'Major' column.
    
    Returns:
        DataFrame: New data with predicted clusters.
    """
    # Handle missing values
    new_data['Smoker'].fillna(data_features['Smoker'].mean(), inplace=True)
    new_data['Minor'].fillna(data_features['Minor'].mode()[0], inplace=True)
    new_data['Major'].fillna(data_features['Major'].mode()[0], inplace=True)
    
    # Encode categorical columns
    new_data['Minor'] = label_encoder_minor.transform(new_data['Minor'])
    new_data['Major'] = label_encoder_major.transform(new_data['Major'])
    
    # Normalize numerical columns (excluding 'ID')
    columns_to_scale = data_features.drop(columns=['ID', 'Cluster']).columns  # Use the original columns from training
    new_data[columns_to_scale] = scaler.transform(new_data[columns_to_scale])
    
    # Predict clusters
    new_data['Complication'] = model.predict(new_data[columns_to_scale])  # Match column names precisely
    return new_data

# Example: Predicting clusters for the first 5 rows of the original dataset as test data
test_data = data.iloc[:5].drop(columns=['Complication'])
predicted_clusters = predict_clusters(test_data, kmeans, scaler, label_encoder, label_encoder)
print(predicted_clusters)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  new_data['Smoker'].fillna(data_features['Smoker'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  new_data['Minor'].fillna(data_features['Minor'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- Cluster
