# Library

In [1]:
# standard library
import os

# data processing
import pandas as pd

# linear algebra
import numpy as np

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Algorithms
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Data Loading

# Location of the CSV file. It can be overridden with the BCANCER_DATA_PATH
# environment variable so the notebook also runs on other machines; when the
# variable is not set, the original local path is used unchanged.
DATA_PATH = os.environ.get("BCANCER_DATA_PATH", "C:/Users/yazhini/Bcancer_data.csv")
BC_data = pd.read_csv(DATA_PATH)

# Data Exploration

In [None]:
# Dimensions of the dataset as (number of rows, number of columns)
BC_data.shape

In [None]:
# Preview the first five rows of the dataset (five is the default for head())
BC_data.head()

# Data Cleaning

In [None]:
# Count the missing (null) values in each column
BC_data.isnull().sum()

In [None]:
# Drop the columns that contain no data at all.
# NOTE(review): the 'unnamed' column this cell targets is presumably entirely
# empty - confirm against the null counts shown earlier.
# how="all" keeps a feature column that merely has a few gaps, so a single
# missing value can no longer silently remove a whole feature.
BC_data = BC_data.dropna(axis=1, how="all")

In [None]:
# Dimensions of the dataset (rows, columns) after dropping columns
BC_data.shape

**Split the column names based on Mean, Standard error or worst**

In [None]:
# Subset of the data: the diagnosis label plus the "*_mean" feature columns
BC_data_mean = BC_data[[
    "diagnosis",
    "radius_mean",
    "texture_mean",
    "perimeter_mean",
    "area_mean",
    "smoothness_mean",
    "compactness_mean",
    "concavity_mean",
    "symmetry_mean",
    "fractal_dimension_mean",
    "concave points_mean",
]]

In [None]:
# Display the frame holding the diagnosis and the "*_mean" columns
BC_data_mean

In [None]:
# Subset of the data: the diagnosis label plus the standard-error ("*_se") columns
BC_data_se = BC_data[[
    "diagnosis",
    "radius_se",
    "texture_se",
    "perimeter_se",
    "area_se",
    "smoothness_se",
    "compactness_se",
    "concavity_se",
    "concave points_se",
    "symmetry_se",
    "fractal_dimension_se",
]]

In [None]:
# Display the frame holding the diagnosis and the "*_se" columns
BC_data_se

In [None]:
# Subset of the data: the diagnosis label plus the columns whose name ends with "worst"
BC_data_worst = BC_data[[
    "diagnosis",
    "radius_worst",
    "texture_worst",
    "perimeter_worst",
    "area_worst",
    "smoothness_worst",
    "compactness_worst",
    "concavity_worst",
    "concave points_worst",
    "symmetry_worst",
    "fractal_dimension_worst",
]]

In [None]:
# Display the frame holding the diagnosis and the "*_worst" columns
BC_data_worst

# Data Visualization/Data Exploration

In [None]:
# Number of rows for each distinct value of the diagnosis column
# (the malignant and benign counts)
BC_data["diagnosis"].value_counts()

In [None]:
# Visualise the number of samples per value of the diagnosis column.
# The column is passed by keyword: recent seaborn releases treat the first
# positional argument of countplot as `data`, not as the variable to count.
# The former label='count' argument was dropped; no legend is drawn in this
# cell, so it never appeared in the figure.
sns.countplot(x="diagnosis", data=BC_data)

In [None]:
# Feature names in three lists; each list starts with the "diagnosis" label
# and continues with the ten measurements carrying the group's suffix.
list_worst = ["diagnosis"] + [
    measure + "_worst"
    for measure in ("radius", "texture", "perimeter", "area", "smoothness",
                    "compactness", "concavity", "concave points", "symmetry",
                    "fractal_dimension")
]
list_se = ["diagnosis"] + [
    measure + "_se"
    for measure in ("radius", "texture", "perimeter", "area", "smoothness",
                    "compactness", "concavity", "concave points", "symmetry",
                    "fractal_dimension")
]
# The mean list has its own ordering: "concave points" comes last here
list_mean = ["diagnosis"] + [
    measure + "_mean"
    for measure in ("radius", "texture", "perimeter", "area", "smoothness",
                    "compactness", "concavity", "symmetry", "fractal_dimension",
                    "concave points")
]

In [None]:
# Histogram of every 'worst' feature, one figure per feature, coloured by diagnosis
for feature in list_worst[1:11]:  # position 0 holds the "diagnosis" label itself
    grid = sns.FacetGrid(BC_data_worst, hue='diagnosis')
    grid.map(plt.hist, feature, alpha=.5, bins=10)
    grid.add_legend()
plt.show()

In [None]:
# Histogram of every 'mean' feature, one figure per feature, coloured by diagnosis
for feature in list_mean[1:11]:  # position 0 holds the "diagnosis" label itself
    grid = sns.FacetGrid(BC_data_mean, hue='diagnosis')
    grid.map(plt.hist, feature, alpha=.5, bins=10)
    grid.add_legend()

In [None]:
# Histogram of every 'se' feature, one figure per feature, coloured by diagnosis
for feature in list_se[1:11]:  # position 0 holds the "diagnosis" label itself
    grid = sns.FacetGrid(BC_data_se, hue='diagnosis')
    grid.map(plt.hist, feature, alpha=.5, bins=10)
    grid.add_legend()

# Correlation

In [None]:
# Pairwise correlation matrix of the columns at positions 2..31, i.e. every
# column after the first two (noted by the author as id and target)
correlation = BC_data.iloc[:,2:32].corr()
correlation

In [None]:
# Heatmap of the correlation matrix; every cell is annotated with its value
# formatted as a whole-number percentage
plt.figure(figsize = (20,15))
sns.heatmap(correlation, annot = True,fmt = '.0%') 

In [None]:
# Number of rows of the correlation matrix, i.e. the number of features compared
size=correlation.shape[0]
size

## Remove the variables with more than 0.9 correlation


In [None]:
# Feature columns only: positions 2..31, skipping the first two columns
# (id and diagnosis)
BC_Data1=BC_data.iloc[:,2:32]

In [None]:
# Greedy correlation filter: walk the upper triangle of the correlation matrix
# and, for every pair whose correlation is 0.9 or higher, drop the later column.
# Only the signed value is compared, so strong negative correlations are kept.
total_column = np.ones(size, dtype=bool)
for x in range(size):
    for y in range(x + 1, size):
        if not correlation.iloc[x, y] >= 0.9:
            continue
        # Report the offending pair and its correlation before discarding column y
        print(x, y)
        print(correlation.iloc[x, y])
        total_column[y] = False
filtered_columns = BC_Data1.columns[total_column]
final_data = BC_data.loc[:, filtered_columns]

In [None]:
# Display the correlation matrix of the features kept after the reduction
corr_after_reduction =final_data.corr()
corr_after_reduction

In [None]:
# Names of the features retained for the final analysis
filtered_columns

In [None]:
# Dimensions of the correlation matrix computed after the reduction
corr_after_reduction.shape

In [None]:
# Heatmap of the correlations among the retained features; every cell is
# annotated with its value formatted as a whole-number percentage
plt.figure(figsize = (15,15))
sns.heatmap(corr_after_reduction, annot = True,fmt = '.0%') 

In [None]:
# Single-column frame holding the target labels (column at position 1 of the data)
target = pd.DataFrame({'diagnosis': BC_data.iloc[:, 1]})
target['diagnosis']

In [None]:
# Visualise the distribution of every retained feature, split by diagnosis.
# Rows with diagnosis 'M' are the malignant samples and rows with 'B' the
# benign ones; the legend labels previously had this mapping reversed.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here so the figures look the same as before.
fig = plt.figure(figsize = (20, 25))
for position, column in enumerate(final_data.columns, start=1):
    # 7 x 3 grid of panels, filled in column order of final_data
    plt.subplot(7, 3, position)
    sns.distplot(final_data[column][target['diagnosis'] == 'M'], color='blue', label='Malignant')
    sns.distplot(final_data[column][target['diagnosis'] == 'B'], color='red', label='Benign')
    plt.legend(loc='best')
fig.suptitle('Feature Analysis with Diagnosis')
fig.tight_layout()
# Leave room at the top so the super-title does not overlap the first row
fig.subplots_adjust(top=0.95)
plt.show()

# Data Transformation

In [None]:
# Encode the categorical diagnosis labels as integers. LabelEncoder numbers the
# classes in sorted order, so 'B' becomes 0 and 'M' becomes 1.
# The column is replaced by name instead of being written through .iloc:
# on pandas 2.x an .iloc assignment writes into the existing object-dtype
# column, leaving integer values with dtype "object", which scikit-learn
# classifiers reject as an unknown label type.
lblencoder_d= LabelEncoder()
BC_data["diagnosis"] = lblencoder_d.fit_transform(BC_data["diagnosis"])

In [None]:
# Split the data set into independent (predictors) and dependent (target) variables
# X: values of the retained feature columns as an array
X=final_data.values
# Y: the column at position 1 of BC_data
Y=BC_data.iloc[:,1]

# Standardisation
Scale all numerical features in X using sklearn's StandardScaler class

In [None]:
# Perform standardisation: fit the scaler on X and replace X with the scaled values
datascaled = StandardScaler()
X= datascaled.fit_transform(X)

# Principal Component Analysis
PCA is used for dimensionality reduction and to see how much of the data set's variance each component explains

In [None]:
# Fit a PCA without limiting the number of components, project X onto it
# and show the shape of the projected data
pca = PCA()
out = pca.fit_transform(X)
out.shape

In [None]:
# Fraction of the total variance explained by each principal component
pca.explained_variance_ratio_

In [None]:
# Running total of the explained-variance fractions
pca.explained_variance_ratio_.cumsum()

In [None]:
# Choose the number of components based on cumulative variance
cumulative_variance = pca.explained_variance_ratio_.cumsum()
plt.figure(figsize=(15,15))
# The x positions are derived from the data instead of a hard-coded
# range(1, 21), so the plot keeps working if the number of features changes.
plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance,
         marker='o', linestyle='--')
plt.title('Explained variance by Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative')
# axhline's xmin/xmax are fractions of the axes width (0 to 1), not data
# coordinates; the defaults already span the full width, so they are omitted.
plt.axhline(y=.95, color='red')
plt.grid()
plt.show()

We have chosen 10 principal components to explain more than 0.95 of the variance.

In [None]:
# PCA restricted to 10 components
pca1 = PCA(n_components = 10)

In [None]:
# Fit the 10-component PCA on X
pca1.fit(X)

In [None]:
# Project X onto the fitted components and display the resulting array
final = pca1.transform(X)
final

In [None]:
# Wrap the projected data in a frame whose ten columns are named pc1 ... pc10
pca_df = pd.DataFrame(data=final, columns=['pc' + str(k) for k in range(1, 11)])

In [None]:
# Attach the diagnosis column of BC_data to the PCA frame under the name 'Response'
pca_df['Response'] = BC_data['diagnosis']

In [None]:
# Dimensions of the PCA frame as (rows, columns)
pca_df.shape

In [None]:
# Preview the first five rows of the PCA frame
pca_df.head()

**Scatter plot of the PC1 and PC2 components, which together explain 55% of the information**

In [None]:
# Visualise the samples on the first two principal components, coloured by response.
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the former positional form fails there.
sns.scatterplot(x='pc1', y='pc2', hue='Response', data=pca_df, palette=['g','r'])

**To visualize which variables are the most influential on the first 2 components**

# Important Feature

In [None]:
def biplot(score, coeff, labels=None, colors=None):
    """Draw a PCA biplot: rescaled sample scores plus one arrow per feature.

    Parameters
    ----------
    score : 2-D array
        Projected samples; columns 0 and 1 are plotted as the two axes.
    coeff : 2-D array
        One row per feature; columns 0 and 1 give the end point of that
        feature's arrow.
    labels : sequence of str, optional
        Text drawn next to each arrow. Defaults to "Var1", "Var2", ...
    colors : array-like, optional
        Per-sample colour values for the scatter plot. Defaults to the last
        column of the notebook-level ``pca_df`` (the previous hard-wired
        behaviour).

    The function draws on the current matplotlib figure and returns nothing.
    """
    if colors is None:
        colors = pca_df.iloc[:, -1]

    s1 = score[:, 0]
    s2 = score[:, 1]
    # Rescale every score axis by its range so points and arrows fit one plot
    scalex = 1.0 / (s1.max() - s1.min())
    scaley = 1.0 / (s2.max() - s2.min())
    plt.scatter(s1 * scalex, s2 * scaley, c=colors)

    for i in range(coeff.shape[0]):
        plt.arrow(0, 0, coeff[i, 0], coeff[i, 1], color='r', alpha=.5)
        caption = "Var" + str(i + 1) if labels is None else labels[i]
        # The text sits slightly beyond the arrow head
        plt.text(coeff[i, 0] * 1.15, coeff[i, 1] * 1.15, caption,
                 color='g', ha='center', va='center')

    # Axis formatting belongs to the plot itself; it previously sat outside the
    # function body and only took effect because it ran in the same cell.
    plt.xlim(-1, 1)
    plt.ylim(-1, 1)
    plt.xlabel("PC{}".format(1))
    plt.ylabel("PC{}".format(2))
    plt.grid()


# Biplot of the first two components: the first two score columns and the
# first two rows of the component matrix, transposed to one row per feature
biplot(final[:,0:2],np.transpose(pca1.components_[0:2, :]))
plt.show()

Now, the importance of each feature is reflected by the magnitude of the corresponding values in the eigenvectors (higher magnitude - higher importance)

In [None]:
# Fraction of the variance explained by each of the fitted components of pca1
pca1.explained_variance_ratio_

PC1 explains 42% of the information and PC2 13% of information. Together,  PC1 and PC2 only explain 55% of information.

Now, let's find the most important features.

In [None]:
# Absolute value of every entry of the component matrix, i.e. the magnitude
# of each feature's loading on each component (not a variance)
abs(pca1.components_ )

Here, pca1.components_ has shape (row=n_components, column=n_features). Thus, by looking at the PC1 (First Principal Component) which is the first row: (1.32037515e-01 8.65416182e-02 2.26589267e-01 3.19548976e-01
3.01093179e-01 2.27708080e-01 2.23447406e-01 1.69612240e-01 
5.36124163e-02 9.74162025e-02 2.83931331e-01 2.52462094e-01
2.50535394e-01 1.21786697e-01 2.29604985e-01 2.05937616e-01
2.88112018e-01 2.89154562e-01 1.92610186e-01 2.59331142e-01) we can conclude that features 4, 5 and 18 (or Var 4, 5 and 18 in the biplot) have the largest magnitudes and are therefore the most important.

In [None]:
# Number of rows of the component matrix, i.e. the number of principal components
n_pcs= pca1.components_.shape[0]
print('Dimension of pca data frame : ',n_pcs)

In [None]:
# For each principal component, the position of the feature with the largest absolute loading
most_important_index = [
    np.abs(component).argmax()
    for component in pca1.components_[:n_pcs]
]
print('Index of most important feature are :  ', most_important_index)

In [None]:
# Feature names in the same order as the columns that were fed into the PCA.
# They are read from final_data instead of a hand-typed list, so the names
# cannot drift out of sync with the result of the correlation filter.
feature_names = list(final_data.columns)


In [None]:
# Translate each winning feature position into its feature name
most_important_names = [
    feature_names[position]
    for position in most_important_index[:n_pcs]
]

In [None]:
# Map every component label to its most influential feature. Labels are
# 1-based (PC1, PC2, ...) to match the pc1..pc10 column names and the "PC1"
# axis label used elsewhere in the notebook; they previously started at PC0.
important_dic = {'PC{}'.format(i + 1): most_important_names[i] for i in range(n_pcs)}


In [None]:
# Two-column table built from the (component label, feature name) pairs
importance_df = pd.DataFrame(important_dic.items())
print('Most Important Feature in Each Component')
importance_df

In [None]:
# Split the PCA frame into independent and dependent variables
# X: the first ten columns of pca_df
X = pca_df.iloc[:,0:10]

# Y will tell us if a patient has cancer or not (last column of pca_df, as an array)
Y = pca_df.iloc[:,-1].values
pca_df

**Split the data into 70% of training data and 30% of test data for both dependent and independent variables**

In [None]:
# Shuffled, stratified 70/30 split. Without shuffling the test set is simply
# the last 30% of the rows, so the result depends on the row order of the
# file; stratifying on Y keeps the class proportions equal in both parts and
# the fixed random_state keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, shuffle=True, stratify=Y, random_state=0
)

In [None]:
# Shape of the training predictors
print('dimension of X_train : ',X_train.shape)

In [None]:
# Shape of the test predictors
print('dimension of X_test : ',X_test.shape)

In [None]:
# Shape of the training target
print('dimension of Y_train : ',Y_train.shape)

In [None]:
# Shape of the test target
print('dimension of Y_test : ',Y_test.shape)

# Build Machine Learning Models

In [None]:
#create a function for the models
def models(X_train, Y_train):
    """Fit five classifiers on the training data and print their training accuracy.

    The classifiers are, in order: logistic regression, decision tree, random
    forest, k-nearest neighbours and a support vector machine.

    Parameters
    ----------
    X_train : training predictors, passed unchanged to every ``fit``/``score`` call.
    Y_train : training target, passed unchanged to every ``fit``/``score`` call.

    Returns
    -------
    tuple
        ``(log, tree, forest, KNN, svc, list_accuracy)``: the five fitted
        estimators followed by the list of their training accuracies in the
        same order.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier

    # Model 0..4, configured exactly once each
    log = LogisticRegression(random_state=0)
    tree = DecisionTreeClassifier(criterion='entropy', random_state=0)
    forest = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    KNN = KNeighborsClassifier(n_neighbors=5, p=2, weights='distance')
    svc = SVC(gamma=0.025, C=3)
    estimators = (log, tree, forest, KNN, svc)

    # Fit every model and record its accuracy on the data it was trained on
    list_accuracy = []
    for estimator in estimators:
        estimator.fit(X_train, Y_train)
        list_accuracy.append(estimator.score(X_train, Y_train))

    # Report the recorded accuracies, one line per model
    headings = (
        '1) Logistic Regression Training Accuracy:',
        '2) DecisionTreeClaassifier Training Accuracy:',
        '3) Random Forest Classifier Training Accuracy:',
        '4) kNeighbors Training Accuracy:',
        '5) Support Vector Machine Training Accuracy:',
    )
    for heading, training_accuracy in zip(headings, list_accuracy):
        print(heading, training_accuracy)

    return log, tree, forest, KNN, svc, list_accuracy

In [None]:
# Train all models; `model` holds the five fitted estimators followed by the
# list of their training accuracies
model = models(X_train, Y_train)

In [None]:
# Display names of the models, in the order in which models() returns them
model_names = [
    'Logistic Regression',
    'DecisionTreeClaassifier',
    'Random Forest Classifier',
    'kNeighbors Training',
    'Support Vector Machine',
]

# Conclusions

In [None]:
# Evaluate every fitted model on the test data: confusion matrix,
# classification report, specificity and accuracy.
# `model` ends with the list of training accuracies, so only the leading
# entries (paired with their display names) are estimators.
for name, estimator in zip(model_names, model[:-1]):
    # Predict once and reuse the result for every metric below
    predictions = estimator.predict(X_test)

    print('*'*30, name, '*'*30)
    print('Confusion Matrix and Statistics')
    cm = confusion_matrix(Y_test, predictions)
    print(cm)
    # Rows are the true classes and columns the predicted classes, both in
    # sorted label order, so for a binary 0/1 target ravel() yields
    # TN, FP, FN, TP with class 1 as the positive class.
    # NOTE(review): assumes the target holds exactly two classes and that the
    # positive (malignant) class sorts last - confirm against the encoding step.
    TN, FP, FN, TP = cm.ravel()

    print(classification_report(Y_test, predictions))

    # Specificity = TN / (TN + FP): the share of actual negatives that were
    # predicted negative. The former TN / (FN + TN) used the wrong cells.
    actual_negatives = TN + FP
    specificity1 = TN / actual_negatives if actual_negatives else float('nan')
    print('Specificity :     ', specificity1)

    print('Testing Accuracy   ', accuracy_score(Y_test, predictions))
    print()
    print()

# Thank You