# ML: Breast Cancer

## Part 0: Pre-requisites 

### Imports

In [None]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import GridSearchCV

### Function to plot the learning curve 

In [None]:
def plot_learning_curves(model, x_train, y_train, x_val, y_val):
    """Plot training and cross-validation accuracy against training-set size.

    Calls ``learning_curve`` on the training data with 5-fold
    cross-validation and accuracy scoring, at ten evenly spaced training-set
    fractions from 0.1 to 1.0. For each curve the mean score across folds is
    drawn as a line, with a shaded band of one standard deviation either side.

    Args:
        model: Estimator handed to ``learning_curve``.
        x_train: Training features.
        y_train: Training labels.
        x_val: Accepted for call compatibility; not used by this function.
        y_val: Accepted for call compatibility; not used by this function.
    """
    sizes, fit_scores, cv_scores = learning_curve(
        model,
        x_train,
        y_train,
        cv=5,
        scoring='accuracy',
        train_sizes=np.linspace(0.1, 1.0, 10),
    )

    # One entry per curve: (legend label, colour, mean per size, std per size).
    # axis=1 aggregates over the cross-validation folds.
    curves = [
        (
            "Training score",
            "r",
            np.mean(fit_scores, axis=1),
            np.std(fit_scores, axis=1),
        ),
        (
            "Cross-validation score",
            "g",
            np.mean(cv_scores, axis=1),
            np.std(cv_scores, axis=1),
        ),
    ]

    plt.figure(figsize=(10, 6))
    plt.title("Learning Curves")
    plt.xlabel("Training Examples")
    plt.ylabel("Score")
    plt.grid()

    # Shaded +/- one standard deviation bands first, then the mean lines.
    for _, colour, mean, std in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for label, colour, mean, _ in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    plt.show()

### Function to split the data 

In [None]:
def train_val_test_split(x, y, val_size=0.1, test_size=0.1, random_state=3):
    """Split features and labels into train, validation and test sets.

    The data is first split into a training set and a holdout set, and the
    holdout set is then split into validation and test sets. Both splits
    shuffle the data and use the same ``random_state``. With the default
    arguments the result is an 80% / 10% / 10% split.

    Args:
        x: Features to split.
        y: Labels to split, aligned with ``x``.
        val_size: Fraction of the whole dataset used for validation.
        test_size: Fraction of the whole dataset used for testing.
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        A tuple ``(x_train, y_train, x_val, y_val, x_test, y_test)``.

    Raises:
        ValueError: If either fraction is not positive, or if together they
            leave nothing for the training set.
    """
    holdout_size = val_size + test_size
    if val_size <= 0 or test_size <= 0 or holdout_size >= 1:
        raise ValueError(
            "val_size and test_size must be positive fractions that sum to "
            f"less than 1; got val_size={val_size}, test_size={test_size}"
        )

    # First cut: training set versus everything held out for evaluation.
    x_train, x_holdout, y_train, y_holdout = train_test_split(
        x,
        y,
        test_size=holdout_size,
        shuffle=True,
        random_state=random_state,
    )
    # Second cut: divide the holdout set; the test share is expressed
    # relative to the holdout set, not the whole dataset.
    x_val, x_test, y_val, y_test = train_test_split(
        x_holdout,
        y_holdout,
        test_size=test_size / holdout_size,
        shuffle=True,
        random_state=random_state,
    )
    return x_train, y_train, x_val, y_val, x_test, y_test

## Part 1: Data 


### Load the data 

In [None]:
# Read the raw dataset from disk and preview its first rows.
csv_path = "data.csv"
data = pd.read_csv(csv_path, encoding="utf-8")
data.head()

### Clean the data

#### Remove missing and duplicate values from dataset

In [None]:
# Report the number of missing values per column, then drop every row that
# contains at least one. The count is printed explicitly because a bare
# expression that is not the last line of a cell is never displayed.
print(data.isna().sum())
data.dropna(axis=0, inplace=True)
# drop=True discards the old index instead of inserting it as a new column.
# Such a column would make every row unique (so the duplicate check could
# never find anything) and would end up among the model features.
data.reset_index(drop=True, inplace=True)

In [None]:
# Report how many fully duplicated rows exist, then drop them in place.
# The count is printed explicitly because a bare expression that is not the
# last line of a cell is never displayed.
print(data.duplicated().sum())
data.drop_duplicates(inplace=True)

#### Split the data into train, validation, and test sets 

Given the dataset size (around 1,500 samples), either approach works: splitting the data manually or applying cross-validation.

Split the dataset into train (80%), validation (10%) and test (10%) sets

In [9]:
# Separate the label column from the feature columns, then split both into
# train / validation / test sets.
target_column = "cancer_type"
x = data.drop(columns=target_column)
y = data[target_column]

splits = train_val_test_split(x, y)
x_train, y_train, x_val, y_val, x_test, y_test = splits

## Part 2: Dimensionality Reduction 

## Part 3: Models

### Logistic Regression

#### Hyperparameter Tuning 

### Random Forest

#### Hyperparameter Tuning 

### SVM

#### Hyperparameter Tuning 

### Gradient Boosting 

#### Hyperparameter Tuning 

### Neural Network

#### Hyperparameter Tuning

## Part 4: Feature Selection (Check if it should be done before or after model training)


## Part 5: Choosing the best model 

## Part 6: Testing the best model on the test data