In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset and discard the two columns that are not used as features.
data = pd.read_csv('data.csv').drop(columns=['id', 'Unnamed: 32'])

# Encode the target variable. LabelEncoder numbers the classes in sorted
# order, so with the labels 'B' and 'M' the result is B=0 and M=1.
label_encoder = LabelEncoder()
data['diagnosis'] = label_encoder.fit_transform(data['diagnosis'])  # M=1, B=0

# Guard the M=1 / B=0 assumption: any other set of labels (different case,
# a third category, ...) would silently change what class 1 means in every
# metric and prediction below.
if list(label_encoder.classes_) != ['B', 'M']:
    raise ValueError(
        "Expected 'diagnosis' to contain exactly the labels 'B' and 'M', "
        f"found {list(label_encoder.classes_)}"
    )

# Separate the feature matrix from the encoded target.
X = data.drop(columns='diagnosis')
y = data['diagnosis']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified; passing stratify=y would keep the
# class ratio equal in both partitions, but would change the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a Random Forest with default hyper-parameters and a fixed seed.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on the held-out rows.
y_pred = model.predict(X_test)

# Evaluate the predictions against the held-out labels.
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Display results
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_rep)
print("\nConfusion Matrix:\n", conf_matrix)


Accuracy: 0.9649122807017544

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.97        71
           1       0.98      0.93      0.95        43

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114


Confusion Matrix:
 [[70  1]
 [ 3 40]]


In [7]:
def predict_new_data(model, new_data):
    """
    Make predictions on new data with an already trained model.

    When the model records the feature names it was fitted with (a
    ``feature_names_in_`` attribute) and ``new_data`` has a ``columns``
    attribute, the columns are selected and reordered to match those names
    before predicting. Columns the model was not fitted on are left out.
    Any other input is passed to the model unchanged.

    Parameters:
    model (sklearn model): The trained model; must provide ``predict``.
    new_data (pd.DataFrame): The new data to predict, with the same feature columns as the training data.

    Returns:
    numpy.ndarray: The prediction(s) for the new data.

    Raises:
    ValueError: If ``new_data`` lacks a feature column the model was fitted on.
    """
    # `is not None` rather than truthiness: the attribute may be an array,
    # whose truth value is ambiguous.
    expected_columns = getattr(model, "feature_names_in_", None)

    if expected_columns is not None and hasattr(new_data, "columns"):
        missing = [name for name in expected_columns if name not in new_data.columns]
        if missing:
            raise ValueError(f"new_data is missing feature columns: {missing}")
        # Select by name so the column order of the caller's frame does not matter.
        new_data = new_data[list(expected_columns)]

    return model.predict(new_data)

In [9]:
import pandas as pd

# Feature values for the one sample to classify. Key order is unchanged:
# the ten "_mean" measurements, then the ten "_se" measurements, then the
# ten "_worst" measurements.
# NOTE(review): these numbers may coincide with a row already present in
# data.csv; if so, the prediction below is not an out-of-sample check - confirm.
new_sample = {
    # --- *_mean ---
    'radius_mean': 20.57, 'texture_mean': 17.77,
    'perimeter_mean': 132.9, 'area_mean': 1326,
    'smoothness_mean': 0.08474, 'compactness_mean': 0.07864,
    'concavity_mean': 0.0869, 'concave points_mean': 0.07017,
    'symmetry_mean': 0.1812, 'fractal_dimension_mean': 0.05667,
    # --- *_se ---
    'radius_se': 0.5435, 'texture_se': 0.7339,
    'perimeter_se': 3.398, 'area_se': 74.08,
    'smoothness_se': 0.005225, 'compactness_se': 0.01308,
    'concavity_se': 0.0186, 'concave points_se': 0.0134,
    'symmetry_se': 0.01389, 'fractal_dimension_se': 0.003532,
    # --- *_worst ---
    'radius_worst': 24.99, 'texture_worst': 23.41,
    'perimeter_worst': 158.8, 'area_worst': 1956,
    'smoothness_worst': 0.1238, 'compactness_worst': 0.1866,
    'concavity_worst': 0.2416, 'concave points_worst': 0.186,
    'symmetry_worst': 0.275, 'fractal_dimension_worst': 0.08902,
}

# Wrapping the mapping in a list makes pandas build a one-row frame with one
# column per key.
new_data_df = pd.DataFrame([new_sample])

# Run the sample through the fitted model via the helper, then show the
# first (and only) predicted label.
new_data_prediction = predict_new_data(model, new_data_df)
print("Prediction for New Data:", new_data_prediction[0])


Prediction for New Data: 1
