# Import Library

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load Dataset

In [None]:
# Read the CSV directly from its raw GitHub URL into a DataFrame.
dataset_url = r"https://raw.githubusercontent.com/azaz6216/dataset/refs/heads/main/data%20-%20Copy.csv"
df = pd.read_csv(dataset_url)

# EDA (Exploratory Data Analysis)

### a. Understanding the Dataset

- Head of the dataset
- Shape of the data set
- Types of columns
- Information about data set
- Summary of the data set

In [None]:
# Preview the first rows of the loaded dataset.
df.head()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Column labels of the dataset.
df.columns

In [None]:
# Data type of every column.
df.dtypes

In [None]:
# Print a concise summary: columns, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Descriptive statistics of the dataset's columns.
df.describe()

### b. Cleaning the Dataset

- Dropping duplicate values
- Checking NULL values
- Checking for 0 values and replacing them

In [None]:
# Remove rows that are exact duplicates of an earlier row.
df = df.drop_duplicates()


In [None]:
# Number of missing values in each column.
df.isnull().sum()

### Checking for occurrences of a specified value (0) in each feature

In [None]:
# Count, per column, how many cells are exactly zero.
x = df.eq(0).sum()
x

### Replace the zeros with the column mean

In [None]:
# Treat zeros in the non-object columns as missing values and impute them
# with the mean of the remaining (non-zero) entries of the same column.
# `np.nan` is used instead of the `np.NaN` alias, which NumPy 2.0 removed
# (accessing it raises AttributeError there).
for col in df.columns:
    if df[col].dtype != 'object' and col != 'diagnosis':
        # Mark zeros as missing so they do not take part in the mean.
        df[col] = df[col].replace(0, np.nan)
        mean_val = df[col].mean(skipna=True)
        # A column with no non-zero value has a NaN mean and is left as NaN.
        df[col] = df[col].replace(np.nan, mean_val)

In [None]:
# Re-count the zeros per column to confirm the imputation replaced them.
y = df.eq(0).sum()
y

In [None]:
# Missing values that remain in each column after the imputation.
v = df.isna().sum()
v

### Drop unnecessary columns

In [None]:
# The record identifier is not a feature; remove that column.
df = df.drop(columns=["id"])
df

In [None]:
# Discard every column that still contains missing values.
df = df.dropna(axis="columns")
df

### Convert `diagnosis` from object type to int type

In [None]:
# Number of rows per diagnosis label (class balance).
df['diagnosis'].value_counts()

In [None]:
# Bar chart of the number of rows per diagnosis label.
sns.countplot(x="diagnosis",data=df)

In [None]:
# Work on an independent copy: a plain `data = df` only creates a second name
# for the same object, so encoding `data['diagnosis']` would also overwrite
# the labels in `df`.
data = df.copy()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encoder used to turn the diagnosis labels into integer codes.
lb=LabelEncoder()

In [None]:
# Fit the encoder on the diagnosis labels and overwrite the column with the
# resulting integer codes (presumably two classes -> 0/1; confirm with the
# value counts that follow).
data['diagnosis']=lb.fit_transform(data["diagnosis"])

In [None]:
# Display the frame with the encoded diagnosis column.
data

In [None]:
# Print the column summary again to check the dtype of `diagnosis` after encoding.
data.info()

In [None]:
# Number of rows per encoded diagnosis value.
data['diagnosis'].value_counts()

In [None]:
# Bar chart of the number of rows per encoded diagnosis value.
sns.countplot(x="diagnosis",data=data)

# Data Visualization

In [None]:
# One histogram (31 bins) per column of `data`.
data.hist(bins=31,figsize=(20,20))
# `plt.show` without parentheses only references the function; call it.
plt.show()

In [None]:
# Box plot of every feature in `data`, laid out on a 6x5 grid.
# The thirty copy-pasted subplot/boxplot pairs are replaced by a loop over
# the same column names in the same order.
plt.figure(figsize=(20,24))
sns.set_style(style='whitegrid')

# Columns are named "<measurement>_<suffix>"; the grid is filled suffix by
# suffix (all *_mean columns first, then *_se, then *_worst).
measurements = [
    'radius', 'texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave points', 'symmetry',
    'fractal_dimension',
]
suffixes = ['mean', 'se', 'worst']
boxplot_columns = [f'{m}_{s}' for s in suffixes for m in measurements]

for position, column in enumerate(boxplot_columns, start=1):
    plt.subplot(6, 5, position)
    sns.boxplot(x=column, data=data)

plt.show()


In [None]:
from pandas.plotting import scatter_matrix
# Pairwise scatter plots of all columns of `data` on one 50x50-inch figure.
scatter_matrix(data,figsize=(50,50),color='red')

# Feature Selection

Pearson's correlation coefficient measures the strength and direction of the linear relationship between two variables. Its value ranges from -1 to +1: +1 indicates a perfect positive linear correlation, -1 a perfect negative linear correlation, and 0 no linear correlation.

A heat map is a two-dimensional representation of information with the help of colors. Heat maps can help the user visualize simple or complex information.

In [None]:
# Pairwise correlation between all columns of `data`.
data.corr()

In [None]:
# Annotated correlation heat map of the first ten columns of `data`.
plt.figure(figsize=(10,10))
leading_corr = data.iloc[:, :10].corr()
sns.heatmap(leading_corr, annot=True)
plt.show()

### Measure Importance of Feature using KBest

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Score every feature against the target with the chi-squared statistic.
X = data.drop(['diagnosis'],axis = 1)
y = data['diagnosis']
# k='all' scores every column whatever their number; the previous hard-coded
# k=30 raised an error as soon as X had fewer than 30 columns.
bestfeatures = SelectKBest(score_func=chi2, k='all')
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']
# Print all features, highest score first.
print(featureScores.nlargest(len(featureScores),'Score'))


In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Rank the features by the impurity-based importances of an extra-trees
# ensemble. The seed (the same value the notebook uses for its train/test
# split) makes the ranking reproducible between runs.
model = ExtraTreesClassifier(random_state=42)
model.fit(X,y)
print(model.feature_importances_)
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
# Horizontal bar chart of the (up to) 30 most important features.
feat_importances.nlargest(30).plot(kind='barh')
plt.show()

# Handling Outliers

### Reducing the influence of outliers (quantile transformation)

In [None]:
from sklearn.preprocessing import QuantileTransformer

# Map every feature through its empirical quantiles, which compresses
# extreme values instead of deleting any rows.
x=data.drop(['diagnosis'],axis=1)
# One quantile per row that is actually present: the former hard-coded 569
# exceeds the sample count once duplicate rows have been dropped.
quantile  = QuantileTransformer(n_quantiles=len(x))
# NOTE(review): the transformer is fitted on all rows, i.e. before the
# train/test split made later in the notebook, so the held-out rows
# influence the mapping - consider fitting on the training rows only.
w=quantile.fit_transform(x)
data_new=pd.DataFrame(w,columns=x.columns)
# `.values` drops the index so labels are assigned by position.
data_new['diagnosis'] = data['diagnosis'].values

data_new.head()

In [None]:
# Box plot of every feature in `data_new` (the quantile-transformed frame),
# laid out on a 6x5 grid. The thirty copy-pasted subplot/boxplot pairs are
# replaced by a loop over the same column names in the same order.
plt.figure(figsize=(20,24))
sns.set_style(style='whitegrid')

# Columns are named "<measurement>_<suffix>"; the grid is filled suffix by
# suffix (all *_mean columns first, then *_se, then *_worst).
transformed_measurements = [
    'radius', 'texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave points', 'symmetry',
    'fractal_dimension',
]
transformed_suffixes = ['mean', 'se', 'worst']
transformed_columns = [
    f'{m}_{s}' for s in transformed_suffixes for m in transformed_measurements
]

for position, column in enumerate(transformed_columns, start=1):
    plt.subplot(6, 5, position)
    sns.boxplot(x=column, data=data_new)

plt.show()


In [None]:
# Pairwise plots of all columns of `data_new`, coloured by diagnosis.
sns.pairplot(data_new,hue="diagnosis")

# Split Dataset into Independent Features and Dependent Target

In [None]:
# Target vector and feature matrix taken from the transformed frame.
y = data_new['diagnosis']
X = data_new.drop(columns=['diagnosis'])

In [None]:
# Preview the feature matrix.
X.head()

In [None]:
# Preview the target vector.
y.head()

# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Standard Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# same scaling to both splits.
st = StandardScaler().fit(X_train)
X_train = st.transform(X_train)
X_test = st.transform(X_test)

# Cross Validation

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Candidate classifiers, all with their default settings.
models = {
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'KNN': KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(),
}

# Seven shuffled folds with a fixed seed, shared by every model.
k_fold = KFold(n_splits=7, shuffle=True, random_state=42)
separator = "--------------------------------------------------------"

# Evaluate each model on the full X / y and report per-fold and mean scores.
for model_name, model in models.items():
    cross_score = cross_val_score(model, X, y, cv=k_fold)
    print(separator)
    print(model_name + ":\n")
    print("Cross Validation Score for each fold:" + str(cross_score) + "\n")
    print("Average Cross Validation Score: " + str(cross_score.mean()) + "\n")
    print(separator)

# Classification Algorithm

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with probability estimates enabled, fitted on the
# training split.
sv = SVC(probability=True).fit(X_train, y_train)
sv

In [None]:
from sklearn.metrics import classification_report

In [None]:
# classification_report takes (y_true, y_pred); with the arguments reversed
# the precision and recall columns of the report are swapped.
print(classification_report(y_test, sv.predict(X_test)))

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Tune KNN with an exhaustive grid search, scored by F1 under repeated
# stratified 10-fold cross-validation.
knn= KNeighborsClassifier()
n_neighbors = list(range(15,25))
p=[1,2]
weights = ['uniform', 'distance']
# Minkowski distance with p=1 is the Manhattan distance and with p=2 the
# Euclidean distance, so also listing 'euclidean' and 'manhattan' (which
# ignore p) only refitted identical candidates.
metric = ['minkowski']
hyperparameters = dict(n_neighbors=n_neighbors, p=p,weights=weights,metric=metric)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=knn, param_grid=hyperparameters, n_jobs=-1, cv=cv, scoring='f1',error_score=0)
best_model = grid_search.fit(X_train,y_train)

# Best value of every hyperparameter that was actually searched
# (leaf_size was never part of the grid, so it is no longer reported).
print('Best n_neighbors:', best_model.best_params_['n_neighbors'])
print('Best p:', best_model.best_params_['p'])
print('Best weights:', best_model.best_params_['weights'])
print('Best metric:', best_model.best_params_['metric'])

In [None]:
# Evaluate the tuned KNN model on the held-out test split.
knn_pred = best_model.predict(X_test)
print("Classification Report is:\n",classification_report(y_test,knn_pred))
print("\n F1:\n",f1_score(y_test,knn_pred))
print("\n Precision score is:\n",precision_score(y_test,knn_pred))
print("\n Recall score is:\n",recall_score(y_test,knn_pred))
print("\n Confusion Matrix:\n")
# Write the counts into the cells; colour alone does not show the numbers.
sns.heatmap(confusion_matrix(y_test,knn_pred), annot=True, fmt='d')

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, fitted on the training split.
lg = LogisticRegression().fit(X_train, y_train)
lg

In [None]:
# classification_report takes (y_true, y_pred); with the arguments reversed
# the precision and recall columns of the report are swapped.
print(classification_report(y_test, lg.predict(X_test)))

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seeded decision tree, fitted on the training split.
tr = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
tr

In [None]:
# classification_report takes (y_true, y_pred); with the arguments reversed
# the precision and recall columns of the report are swapped.
print(classification_report(y_test, tr.predict(X_test)))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest (same value as the decision tree in this notebook) so that
# its scores, and any search that reuses this estimator, are reproducible.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train,y_train)

In [None]:
# classification_report takes (y_true, y_pred); with the arguments reversed
# the precision and recall columns of the report are swapped.
print(classification_report(y_test, rfc.predict(X_test)))

# Hyperparameter Tuning

### Random forest

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV

# Search space for the random forest.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive 5-fold search on the training split, using all cores.
grid_search = GridSearchCV(rfc, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
print(f"Best parameters found: {grid_search.best_params_}")

# Score the refitted best forest on the held-out test split.
best_rfc_model = grid_search.best_estimator_
y_pred_rfc = best_rfc_model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred_rfc)}")
print(confusion_matrix(y_test, y_pred_rfc))
print(classification_report(y_test, y_pred_rfc))


### SVM

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# One sub-grid per kernel, listing only the parameters that kernel uses:
# `gamma` is ignored by the linear kernel and `degree` by everything except
# the polynomial kernel, so a single Cartesian grid refitted the same model
# many times over.
svm_c_values = [0.1, 1, 10, 100, 1000]
svm_gamma_values = [0.0001, 0.001, 0.01, 0.1, 1]
param_grid = [
    {'kernel': ['linear'], 'C': svm_c_values},
    {'kernel': ['rbf'], 'C': svm_c_values, 'gamma': svm_gamma_values},
    {'kernel': ['poly'], 'C': svm_c_values, 'gamma': svm_gamma_values,
     'degree': [3, 4, 5]},
]
grid_search = GridSearchCV(estimator=sv, param_grid=param_grid,
                           cv=5, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

print(f"Best parameters found: {grid_search.best_params_}")

# Score the refitted best SVM on the held-out test split.
best_svm_model = grid_search.best_estimator_

y_pred_svm = best_svm_model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test,y_pred_svm)}")

print(confusion_matrix(y_test, y_pred_svm))

print(classification_report(y_test, y_pred_svm))


### Decision Tree

In [None]:
# Search space for the decision tree.
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 10, 20, 30, 40],
    min_samples_split=[2, 10, 20],
    min_samples_leaf=[1, 5, 10],
    max_features=[None, 'sqrt', 'log2'],
    class_weight=[None, 'balanced'],
)

# Exhaustive 5-fold search on the training split, using all cores.
grid_search = GridSearchCV(tr, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
print(f"Best parameters found: {grid_search.best_params_}")

# Score the refitted best tree on the held-out test split.
best_dt_model = grid_search.best_estimator_
y_pred_dt = best_dt_model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred_dt)}")
print(confusion_matrix(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))

### Logistic Regression

In [None]:

# Only valid penalty/solver pairs are searched: liblinear does not support
# the elastic-net penalty (those fits failed), and `l1_ratio` is only used by
# the elastic-net penalty (it was ignored, with a warning, for l1/l2).
logreg_c_values = [0.01, 0.1, 1, 10, 100]
logreg_class_weights = [None, 'balanced']
param_grid = [
    {
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga'],
        'C': logreg_c_values,
        'class_weight': logreg_class_weights,
    },
    {
        'penalty': ['elasticnet'],
        'solver': ['saga'],
        'l1_ratio': [0, 0.5, 1],
        'C': logreg_c_values,
        'class_weight': logreg_class_weights,
    },
]

grid_search = GridSearchCV(estimator=lg, param_grid=param_grid,
                           cv=5, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

print(f"Best parameters found: {grid_search.best_params_}")

# Score the refitted best model on the held-out test split.
best_logreg_model = grid_search.best_estimator_

y_pred_lr = best_logreg_model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred_lr)}")
print(confusion_matrix(y_test,y_pred_lr))
print(classification_report(y_test, y_pred_lr))


### Bar chart comparing the algorithms

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

# Each model label paired with that model's test-set predictions.
predictions_by_model = [
    ('SVM', y_pred_svm),
    ('Logistic Regression', y_pred_lr),
    ('Decision Tree', y_pred_dt),
    ('KNN', knn_pred),
    ('Random Forest', y_pred_rfc),
]
model_names = [label for label, _ in predictions_by_model]
accuracy_scores = [
    accuracy_score(y_test, predicted) for _, predicted in predictions_by_model
]

# One coloured bar per model.
plt.figure(figsize=(10, 6))
bars = plt.bar(
    model_names,
    accuracy_scores,
    color=['blue', 'green', 'orange', 'red', 'purple'],
)

plt.title('Accuracy Scores of Different Models', fontsize=14)
plt.xlabel('Models', fontsize=12)
plt.ylabel('Accuracy Score', fontsize=12)

# Write the accuracy, as a percentage, just above each bar.
for bar in bars:
    height = bar.get_height()
    x_center = bar.get_x() + bar.get_width() / 2.0
    plt.text(x_center, height, f'{height:.2%}',
             ha='center', va='bottom', fontsize=12)

plt.show()
