DECISION TREE


In [9]:
# IMPORTING LIBRARIES
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.impute import SimpleImputer

# IMPORTING DATASET
# Assuming the CSV file is named 'your_dataset.csv'
dataset = pd.read_csv("/content/data.csv")

# ENCODING CATEGORICAL VARIABLES
# Encode the target variable 'diagnosis'
le = LabelEncoder()
dataset['diagnosis'] = le.fit_transform(dataset['diagnosis'])

# Separating features and target variable
x = dataset.iloc[:, 2:].values  # Assuming your features start from the third column
y = dataset['diagnosis'].values

# HANDLE MISSING VALUES
imputer = SimpleImputer(strategy='mean')
x = imputer.fit_transform(x)

# SPLITTING THE DATASET
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=10)

# FEATURE SCALING
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# CREATING A MODEL
classifier = DecisionTreeClassifier(criterion='entropy', random_state=10)
classifier.fit(x_train, y_train)

# PREDICTING THE RESULT
y_pred = classifier.predict(x_test)
print(y_pred)

# CONFUSION MATRIX
cm = confusion_matrix(y_test, y_pred)
print(cm)

# ACCURACY
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

[1 0 0 1 0 0 0 0 1 1 0 0 1 0 1 1 0 1 0 1 0 1 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 1 0 1 1 1 1 1 0 1 1 0 0 1 1 0 1 0 1 0 1 1 0 1 0 0 0 1 0 0 0 0
 0 1 0 0 1 1 0 0 1 0 0 0 1 1 0 1 0 0 1 0 0 0 1 0 1 0 1 1 0 0 0 0 1 1 0 0 0
 0 1 1 1 0 1 1 1 0 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1]
[[82  9]
 [ 3 49]]
Accuracy: 0.916083916083916


SUPPORT VECTOR MACHINE


In [14]:
# IMPORTING LIBRARIES
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.impute import SimpleImputer

# IMPORTING DATASET
# Assuming the CSV file is named 'your_dataset.csv'
dataset = pd.read_csv("/content/data.csv")

# ENCODING CATEGORICAL VARIABLES
# Encode the target variable 'diagnosis'
le = LabelEncoder()
dataset['diagnosis'] = le.fit_transform(dataset['diagnosis'])

# Separating features and target variable
x = dataset.iloc[:, 2:].values  # Assuming your features start from the third column
y = dataset['diagnosis'].values

# HANDLE MISSING VALUES
imputer = SimpleImputer(strategy='mean')
x = imputer.fit_transform(x)

# SPLITTING THE DATASET
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=10)

# FEATURE SCALING
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)


# SVM with Cross-Validation
from sklearn.model_selection import cross_val_score

svm_classifier = SVC(kernel='linear', random_state=10)
svm_cv_scores = cross_val_score(svm_classifier, x, y, cv=5)  # You can adjust the number of folds (cv)
print("SVM Cross-Validation Scores:", svm_cv_scores)
print("SVM Average Accuracy:", np.mean(svm_cv_scores))

svm_classifier.fit(x_train, y_train)
svm_y_pred = svm_classifier.predict(x_test)

# ACCURACY
svm_accuracy = accuracy_score(y_test, svm_y_pred)
print("SVM Accuracy:", svm_accuracy)

# CONFUSION MATRIX
print("Confusion Matrix:")
print(confusion_matrix(y_test, svm_y_pred))

# ACCURACY
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


SVM Cross-Validation Scores: [0.94736842 0.92982456 0.97368421 0.92105263 0.95575221]
SVM Average Accuracy: 0.9455364073901569
SVM Accuracy: 0.9370629370629371
Confusion Matrix:
[[85  6]
 [ 3 49]]
Accuracy: 0.916083916083916


K - NEAREST NEIGHBORS

In [12]:
# IMPORTING LIBRARIES
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.impute import SimpleImputer

# IMPORTING DATASET
# Assuming the CSV file is named 'your_dataset.csv'
dataset = pd.read_csv("/content/data.csv")

# ENCODING CATEGORICAL VARIABLES
# Encode the target variable 'diagnosis'
le = LabelEncoder()
dataset['diagnosis'] = le.fit_transform(dataset['diagnosis'])

# Separating features and target variable
x = dataset.iloc[:, 2:].values  # Assuming your features start from the third column
y = dataset['diagnosis'].values

# HANDLE MISSING VALUES
imputer = SimpleImputer(strategy='mean')
x = imputer.fit_transform(x)

# SPLITTING THE DATASET
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=10)

# FEATURE SCALING
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# KNN
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(x_train, y_train)
knn_y_pred = knn_classifier.predict(x_test)

# ACCURACY
knn_accuracy = accuracy_score(y_test, knn_y_pred)
print("KNN Accuracy:", knn_accuracy)

# CONFUSION MATRIX
print("Confusion Matrix:")
print(confusion_matrix(y_test, knn_y_pred))

# ACCURACY
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


KNN Accuracy: 0.986013986013986
Confusion Matrix:
[[91  0]
 [ 2 50]]
Accuracy: 0.916083916083916


LOGISTIC REGRESSION

In [15]:
# IMPORTING LIBRARIES
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.impute import SimpleImputer

# IMPORTING DATASET
# Assuming the CSV file is named 'your_dataset.csv'
dataset = pd.read_csv("/content/data.csv")

# ENCODING CATEGORICAL VARIABLES
# Encode the target variable 'diagnosis'
le = LabelEncoder()
dataset['diagnosis'] = le.fit_transform(dataset['diagnosis'])

# Separating features and target variable
x = dataset.iloc[:, 2:].values  # Assuming your features start from the third column
y = dataset['diagnosis'].values

# HANDLE MISSING VALUES
imputer = SimpleImputer(strategy='mean')
x = imputer.fit_transform(x)

# SPLITTING THE DATASET
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=10)

# FEATURE SCALING
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# Logistic Regression
logreg_classifier = LogisticRegression(random_state=10)
logreg_classifier.fit(x_train, y_train)
logreg_y_pred = logreg_classifier.predict(x_test)

# ACCURACY
logreg_accuracy = accuracy_score(y_test, logreg_y_pred)
print("Logistic Regression Accuracy:", logreg_accuracy)

# CONFUSION MATRIX
print("Confusion Matrix:")
print(confusion_matrix(y_test, logreg_y_pred))

# ACCURACY
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Logistic Regression Accuracy: 0.958041958041958
Confusion Matrix:
[[87  4]
 [ 2 50]]
Accuracy: 0.916083916083916
