<a href="https://colab.research.google.com/github/arielwendichansky/DI_Bootcamp/blob/master/Week8/Day1/Exercises_XP/intro_to_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification

from sklearn.datasets import load_breast_cancer
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.svm import SVC

# Extracting info from Kaggle

In [None]:
! pip install -k kaggle
from google.colab import files
files.upload()

In [None]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [34]:
! kaggle datasets download -d utkarshx27/heart-disease-diagnosis-dataset

Dataset URL: https://www.kaggle.com/datasets/utkarshx27/heart-disease-diagnosis-dataset
License(s): CC0-1.0
Downloading heart-disease-diagnosis-dataset.zip to /content
  0% 0.00/3.29k [00:00<?, ?B/s]
100% 3.29k/3.29k [00:00<00:00, 5.09MB/s]


In [35]:
! unzip 'heart-disease-diagnosis-dataset.zip'

Archive:  heart-disease-diagnosis-dataset.zip
  inflating: dataset_heart.csv       


In [37]:
df = pd.read_csv('dataset_heart.csv')

# Data inspection


In [38]:
df.head()

Unnamed: 0,age,sex,chest pain type,resting blood pressure,serum cholestoral,fasting blood sugar,resting electrocardiographic results,max heart rate,exercise induced angina,oldpeak,ST segment,major vessels,thal,heart disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,2
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,1
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,2
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,1
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,1


In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 14 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   age                                   270 non-null    int64  
 1   sex                                   270 non-null    int64  
 2   chest pain type                       270 non-null    int64  
 3   resting blood pressure                270 non-null    int64  
 4   serum cholestoral                     270 non-null    int64  
 5   fasting blood sugar                   270 non-null    int64  
 6   resting electrocardiographic results  270 non-null    int64  
 7   max heart rate                        270 non-null    int64  
 8   exercise induced angina               270 non-null    int64  
 9   oldpeak                               270 non-null    float64
 10  ST segment                            270 non-null    int64  
 11  major vessels      

In [41]:
df.describe()

Unnamed: 0,age,sex,chest pain type,resting blood pressure,serum cholestoral,fasting blood sugar,resting electrocardiographic results,max heart rate,exercise induced angina,oldpeak,ST segment,major vessels,thal,heart disease
count,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0
mean,54.433333,0.677778,3.174074,131.344444,249.659259,0.148148,1.022222,149.677778,0.32963,1.05,1.585185,0.67037,4.696296,1.444444
std,9.109067,0.468195,0.95009,17.861608,51.686237,0.355906,0.997891,23.165717,0.470952,1.14521,0.61439,0.943896,1.940659,0.497827
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0,1.0
25%,48.0,0.0,3.0,120.0,213.0,0.0,0.0,133.0,0.0,0.0,1.0,0.0,3.0,1.0
50%,55.0,1.0,3.0,130.0,245.0,0.0,2.0,153.5,0.0,0.8,2.0,0.0,3.0,1.0
75%,61.0,1.0,4.0,140.0,280.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0,2.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0,2.0


In [42]:
df.duplicated().sum()

0

# Train Test split

In [77]:
cdata = df.copy()
y=cdata['heart disease']
X = cdata.drop(columns='heart disease')

In [78]:
y

0      2
1      1
2      2
3      1
4      1
      ..
265    1
266    1
267    1
268    1
269    2
Name: heart disease, Length: 270, dtype: int64

In [79]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Here 80% of our data is trained and 20% is tested (test_size=0.2).

In [80]:
print("Dataset length:", len(df))
print("Train set length:", len(X_train))
print("Test set length:", len(X_test))

Dataset length: 270
Train set length: 216
Test set length: 54


# Logistic regression and testing accuracy

In [81]:
scaler = StandardScaler() # From Sklearn we imported StandardScaler
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(solver='sag', max_iter=2000)

# Training the logistic regression model

# we use .ravel() to transform the y_train data into a 1D array because LogisticRegression()
# expects the target variable (in this case, y_train) to be a 1D array rather than a column vector or a 2D array.
model.fit(X_train_scaled, y_train.values.ravel())


In [82]:
# Measuring the accuracy from the model
train_accuracy = model.score(X_train_scaled, y_train)
test_accuracy = model.score(X_test_scaled, y_test)


print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)


Training Accuracy: 0.8379629629629629
Test Accuracy: 0.9074074074074074


#  Logistic Regression With Grid Search

In [83]:
# Creating the hyperparameter grid
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space}

# Instantiating logistic regression classifier
logreg = LogisticRegression(max_iter=2000)

# Instantiating the GridSearchCV object
logreg_cv = GridSearchCV(logreg, param_grid, cv=5)

# Fit the GridSearchCV object to the data
logreg_cv.fit(X, y)

# Print the tuned parameters and score
print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_))
print("Best score is {}".format(logreg_cv.best_score_))

Tuned Logistic Regression Parameters: {'C': 0.4393970560760795}
Best score is 0.8407407407407408


# SVM Without Grid Search

**Parameters of SVM model: **
- C (default = 1.0) :
- kernel (default = 'rbf') : 'linear' , 'poly' , ' rbf' , 'sigmoid'
- gamma (default = 'scale') :
- degree (default = 3 (relevant only for 'poly' kernel)) :
- coef0 (default = 0.0 (relevant only for 'poly' and 'sigmoid' kernels)) :
- shrinking (default = True) :
- probability (default = False) :
- class_weight (default = None (all classes are supposed to have weight one)) :
- decision_function_shape (default = 'ovr' that means: one-vs-rest) :

In [65]:
#Build the model
svm = SVC(kernel="rbf", gamma=0.5)
# Trained the model
svm.fit(X_train_scaled, y_train.values.ravel())

#Measuring the model accuracy
train_accuracy = svm.score(X_train_scaled, y_train)
test_accuracy = svm.score(X_test_scaled, y_test)


print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)


Training Accuracy: 0.9953703703703703
Test Accuracy: 0.7222222222222222


# SVM With Grid Search

In [68]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'kernel': ['linear', 'rbf', 'poly'],  # Different kernel functions to try
    'gamma': [0.1, 0.5, 1.0],  # Different gamma values
    'C': [0.001, 0.01, 0.1,0.44, 1, 10, 100]  # Different values of C
}

# Initialize the SVM classifier
svm = SVC()

# Instantiate the GridSearchCV object
svm_cv = GridSearchCV(svm, param_grid, cv=5)

# Fit the GridSearchCV object to the data
svm_cv.fit(X_train_scaled, y_train.values.ravel())

# Print the best parameters and best score
print("Tuned SVM Parameters: {}".format(svm_cv.best_params_))
print("Best score is {}".format(svm_cv.best_score_))

Tuned SVM Parameters: {'C': 0.01, 'gamma': 0.1, 'kernel': 'linear'}
Best score is 0.8381606765327696


# XGBoost Without Grid Search

In [91]:
from sklearn.metrics import accuracy_score
import xgboost as xgb


In [85]:
y_train = y_train.replace(1, 0)
y_train = y_train.replace(2, 1)


In [86]:
#create DMatrix objects, which are the internal data structures used by XGBoost.
xgb_train = xgb.DMatrix(X_train, y_train, enable_categorical=True)
xgb_test = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [87]:
n=50 #Number of boosting rounds or trees to train.
params = {
	'objective': 'binary:logistic',
	'max_depth': 3,
	'learning_rate': 0.1,
}

model = xgb.train(params=params,dtrain=xgb_train,num_boost_round=n)


In [90]:
# Make predictions on the test data
y_pred = model.predict(xgb_test)

# Convert the predicted probabilities to binary predictions
# For binary classification, round the probabilities to the nearest integer (0 or 1)
y_pred_binary = [round(pred_prob) for pred_prob in y_pred]

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred_binary)

print("Accuracy:", accuracy)

Accuracy: 0.018518518518518517


# XGBoost With Grid Search

In [92]:
# Define the parameter grid
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'gamma': [0.1, 0.2, 0.3],
}

# Initialize the XGBoost classifier
xgb_classifier = xgb.XGBClassifier()

# Instantiate the GridSearchCV object
grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best parameters:", best_params)
print("Best score:", best_score)

# Make predictions on the test data using the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Best parameters: {'gamma': 0.1, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}
Best score: 0.8199788583509513
Accuracy: 0.037037037037037035
