# Machine Learning Analysis & Hyperparameter Tuning
Ovaj notebook uključuje kompletnu obradu oba dataseta: Wisconsin Breast Cancer i Heart Disease.

## 1. Wisconsin Breast Cancer Dataset - Učitavanje i osnovna analiza

In [None]:

import pandas as pd

# Read the Wisconsin Breast Cancer CSV into a DataFrame and preview its first rows.
wisc_csv_path = "wisc_bc_data.csv"
wisc_df = pd.read_csv(wisc_csv_path)
wisc_df.head()


In [None]:
# Check for missing values: count them per column, then add the counts up
# to get one total for the whole DataFrame.
wisc_missing_per_column = wisc_df.isnull().sum()
wisc_missing_per_column.sum()

In [None]:
# Distribution of the target variable: number of rows per diagnosis value.
diagnosis_column = wisc_df['diagnosis']
diagnosis_column.value_counts()

## 2. Priprema podataka

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:

# Encode the target variable (B -> 0, M -> 1) WITHOUT overwriting the column in
# wisc_df. Mapping in place makes the cell non-idempotent: a second run, or any
# later cell that maps 'B'/'M' again, would turn every label into NaN.
diagnosis_codes = {'B': 0, 'M': 1}
X_wisc = wisc_df.drop(['id', 'diagnosis'], axis=1)
y_wisc = wisc_df['diagnosis'].map(diagnosis_codes)

# Train/test split on the raw features. Same test_size and random_state as
# before, so the same rows end up in each subset.
X_train_raw_w, X_test_raw_w, y_train_w, y_test_w = train_test_split(
    X_wisc, y_wisc, test_size=0.2, random_state=42)

# Scaling: fit the scaler on the training rows only, then apply it to the test
# rows. Fitting on the full dataset before splitting leaks test-set statistics
# (mean / std) into training.
scaler = StandardScaler()
X_train_w = scaler.fit_transform(X_train_raw_w)
X_test_w = scaler.transform(X_test_raw_w)

# Kept so that code expecting a scaled copy of the full feature matrix still
# finds it; it is scaled with the training-set statistics.
X_wisc_scaled = scaler.transform(X_wisc)


## 3. Modeliranje i Hyperparameter Tuning - Random Forest

In [None]:

# Grid search (cv=5) for a Random Forest on the Wisconsin training split:
# every combination of the listed n_estimators and max_depth values is tried.
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
)

rf_estimator = RandomForestClassifier(random_state=42)
grid_rf = GridSearchCV(estimator=rf_estimator, param_grid=param_grid_rf, cv=5)
grid_rf.fit(X_train_w, y_train_w)
print("Najbolji parametri:", grid_rf.best_params_)

# Evaluate the search result on the held-out test split.
y_pred_rf = grid_rf.predict(X_test_w)
rf_report = classification_report(y_test_w, y_pred_rf)
print(rf_report)


## 4. Modeliranje i Hyperparameter Tuning - SVM

In [None]:

# Grid search (cv=5) for an SVM on the Wisconsin training split:
# every combination of the listed C values and kernels is tried.
param_grid_svm = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf'],
)

svm_estimator = SVC()
grid_svm = GridSearchCV(estimator=svm_estimator, param_grid=param_grid_svm, cv=5)
grid_svm.fit(X_train_w, y_train_w)
print("Najbolji parametri:", grid_svm.best_params_)

# Evaluate the search result on the held-out test split.
y_pred_svm = grid_svm.predict(X_test_w)
svm_report = classification_report(y_test_w, y_pred_svm)
print(svm_report)


In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Data first: raw (unscaled) features, because scaling happens inside the pipeline.
X = wisc_df.drop(['id', 'diagnosis'], axis=1)

# The diagnosis column may already hold 0/1 codes if an earlier cell encoded it
# in place. Mapping 'B'/'M' onto numeric codes would turn every label into NaN
# and make the fit below fail, so only map while the column is still text.
y = wisc_df['diagnosis']
if not pd.api.types.is_numeric_dtype(y):
    y = y.map({'B': 0, 'M': 1})

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline: the scaler is refit on each set of training folds during the
# search, so no held-out statistics leak into training.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', SVC()),  # placeholder only; replaced by the grid below
])

# Parameters for both models: each dict swaps in one classifier together with
# that classifier's own hyperparameter values.
param_grid = [
    {
        'classifier': [SVC()],
        'classifier__C': [0.1, 1, 10],
        'classifier__kernel': ['linear', 'rbf'],
    },
    {
        'classifier': [RandomForestClassifier(random_state=42)],
        'classifier__n_estimators': [50, 100],
        'classifier__max_depth': [None, 5, 10],
    },
]

# GridSearchCV over the whole pipeline
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(X_train, y_train)

# Results
print("Najbolji model:", grid.best_estimator_)
print("Najbolji parametri:", grid.best_params_)
print("Najbolji score:", grid.best_score_)

## 5. Modeliranje i Hyperparameter Tuning - KNN

In [None]:

# Grid search (cv=5) for k-nearest neighbours on the Wisconsin training split:
# every combination of the listed n_neighbors and weights values is tried.
param_grid_knn = dict(
    n_neighbors=[3, 5, 7],
    weights=['uniform', 'distance'],
)

knn_estimator = KNeighborsClassifier()
grid_knn = GridSearchCV(estimator=knn_estimator, param_grid=param_grid_knn, cv=5)
grid_knn.fit(X_train_w, y_train_w)
print("Najbolji parametri:", grid_knn.best_params_)

# Evaluate the search result on the held-out test split.
y_pred_knn = grid_knn.predict(X_test_w)
knn_report = classification_report(y_test_w, y_pred_knn)
print(knn_report)


## 6. Heart Disease Dataset - Učitavanje i osnovna analiza

In [None]:
import pandas as pd

# Read the Heart Disease CSV into a DataFrame and preview its first rows.
heart_csv_path = "heart_disease_uci.csv"
heart_df = pd.read_csv(heart_csv_path)
heart_df.head()


In [None]:
# Missing values: flag every empty cell, then count the flags per column.
heart_missing_mask = heart_df.isnull()
heart_missing_mask.sum()

## 7. Čišćenje i priprema podataka

In [None]:

# Drop columns with many missing values. errors='ignore' keeps this cell
# re-runnable: heart_df is reassigned here, so on a second run the columns
# are already gone and a plain drop would raise KeyError.
heart_df = heart_df.drop(['ca', 'thal', 'slope'], axis=1, errors='ignore')
from sklearn.impute import SimpleImputer

# Float feature columns that receive median imputation. The identifier and the
# target are excluded because they are not part of the feature matrix.
num_cols = heart_df.select_dtypes(include=['float64']).columns.drop(['id', 'num'], errors='ignore')

# Label-encode the categorical variables.
cat_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang']
# Any other non-numeric feature column would reach StandardScaler as text and
# make it fail, so such columns are encoded too.
# NOTE(review): some exports of this dataset carry an extra text column (e.g. a
# study-site label) - confirm whether it should be encoded or dropped instead.
extra_cat_cols = [
    col for col in heart_df.select_dtypes(exclude=['number', 'bool']).columns
    if col not in cat_cols and col not in ('id', 'num')
]
for col in cat_cols + extra_cat_cols:
    heart_df[col] = LabelEncoder().fit_transform(heart_df[col])

X_heart = heart_df.drop(['id', 'num'], axis=1)
y_heart = (heart_df['num'] > 0).astype(int)  # Binarization: 1 when num > 0, else 0

# Train/test split BEFORE fitting the imputer and the scaler, so medians, means
# and standard deviations are learned from training rows only (no leakage).
# Same test_size and random_state as before, so the same rows land in each subset.
X_train_raw_h, X_test_raw_h, y_train_h, y_test_h = train_test_split(
    X_heart, y_heart, test_size=0.2, random_state=42)
# Explicit copies so the column assignments below act on independent frames.
X_train_raw_h = X_train_raw_h.copy()
X_test_raw_h = X_test_raw_h.copy()

# Imputation of numeric values with the training-set median.
imputer = SimpleImputer(strategy='median')
if len(num_cols) > 0:
    X_train_raw_h[num_cols] = imputer.fit_transform(X_train_raw_h[num_cols])
    X_test_raw_h[num_cols] = imputer.transform(X_test_raw_h[num_cols])
    # Keep X_heart / heart_df free of missing numeric values, as this cell has
    # always left them, but filled with the training medians.
    X_heart[num_cols] = imputer.transform(X_heart[num_cols])
    heart_df[num_cols] = X_heart[num_cols]

# Scaling: fit on the training rows, apply to the test rows.
scaler = StandardScaler()
X_train_h = scaler.fit_transform(X_train_raw_h)
X_test_h = scaler.transform(X_test_raw_h)

# Kept so that code expecting a scaled copy of the full feature matrix still
# finds it; it is scaled with the training-set statistics.
X_heart_scaled = scaler.transform(X_heart)


## 8. Random Forest - Heart Disease

In [None]:

# Grid search (cv=5) for a Random Forest on the heart-disease training split:
# every combination of the listed n_estimators and max_depth values is tried.
param_grid_rf_h = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
)

rf_estimator_h = RandomForestClassifier(random_state=42)
grid_rf_h = GridSearchCV(estimator=rf_estimator_h, param_grid=param_grid_rf_h, cv=5)
grid_rf_h.fit(X_train_h, y_train_h)
print("Najbolji parametri:", grid_rf_h.best_params_)

# Evaluate the search result on the held-out test split.
y_pred_rf_h = grid_rf_h.predict(X_test_h)
rf_report_h = classification_report(y_test_h, y_pred_rf_h)
print(rf_report_h)


## 9. Logistic Regression - Heart Disease

In [None]:

# Grid search (cv=5) for Logistic Regression on the heart-disease training
# split: the listed C values are tried with the l2 penalty.
param_grid_lr = dict(
    C=[0.1, 1, 10],
    penalty=['l2'],
)

lr_estimator = LogisticRegression(max_iter=500)
grid_lr = GridSearchCV(estimator=lr_estimator, param_grid=param_grid_lr, cv=5)
grid_lr.fit(X_train_h, y_train_h)
print("Najbolji parametri:", grid_lr.best_params_)

# Evaluate the search result on the held-out test split.
y_pred_lr = grid_lr.predict(X_test_h)
lr_report = classification_report(y_test_h, y_pred_lr)
print(lr_report)
