### Note:- As all the data cleaning part and its statistical evaluation is already done in previous weeks, so I won't be doing this here.

In [20]:
# Load pandas library.
import pandas as pd
import numpy as np

In [10]:
# Attempt to load cancer dataset.
dataframe = pd.read_csv("dat/cancer-dataset.csv")

In [11]:
# Create a new dataframe to preserve the original dataset, 
# this might come handy for later use.
df = pd.DataFrame(dataframe)

In [12]:
# Remove column 'id' as it will not contribute in classification, 
# as each row has a unique identifier.
df =  df.loc[:, df.columns != 'id']

In [13]:
# import label encoder from sklearn package.
from sklearn.preprocessing import LabelEncoder
# Create an object of Label Encoder.
labelencoder_Y=LabelEncoder()

# View structure of the the dataset.
print(df.head())


  diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0         M        17.99         10.38          122.80     1001.0   
1         M        20.57         17.77          132.90     1326.0   
2         M        19.69         21.25          130.00     1203.0   
3         M        11.42         20.38           77.58      386.1   
4         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   symmetry_mean  ...  radius_worst  texture_worst  perimeter_worst  \
0         0.2419  ...         25.38          17.33 

In [14]:
# As we can see our target varaible 'Diagnosis' at column index one. 
# Use Label Encoder to encode String or Text value into numerical values.
# because many algorithm does not work on text data.
df.iloc[:,0]=labelencoder_Y.fit_transform(df.iloc[:,0].values)

In [15]:
# Again see, if values of diagnosis column is labelled successfully.
print(df.head())

   diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0          1        17.99         10.38          122.80     1001.0   
1          1        20.57         17.77          132.90     1326.0   
2          1        19.69         21.25          130.00     1203.0   
3          1        11.42         20.38           77.58      386.1   
4          1        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   symmetry_mean  ...  radius_worst  texture_worst  perimeter_worst  \
0         0.2419  ...         25.38          

In [26]:
# Attempt to plot a correlation matrix.
corr_matrix = df.corr(method="pearson")
print(corr_matrix)

                diagnosis  radius_mean  texture_mean  perimeter_mean  \
diagnosis        1.000000     0.730029      0.415185        0.742636   
radius_mean      0.730029     1.000000      0.323782        0.997855   
texture_mean     0.415185     0.323782      1.000000        0.329533   
perimeter_mean   0.742636     0.997855      0.329533        1.000000   
area_mean        0.708984     0.987357      0.321086        0.986507   

                area_mean  smoothness_mean  compactness_mean  concavity_mean  \
diagnosis        0.708984         0.358560          0.596534        0.696360   
radius_mean      0.987357         0.170581          0.506124        0.676764   
texture_mean     0.321086        -0.023389          0.236702        0.302418   
perimeter_mean   0.986507         0.207278          0.556936        0.716136   
area_mean        1.000000         0.177028          0.498502        0.685983   

                concave points_mean  symmetry_mean  ...  radius_worst  \
diagnosis    

In [23]:
# Drop columns which have strong correlation between them, it will
# increase the performance of the model by decreasing its complexity.
# This will also avoid the common model 'Overfitting' problem.
absolute_corr_matrix = corr_matrix.abs()

# Create an upper triangle.
mask = np.triu(np.ones_like(absolute_corr_matrix, dtype=bool))
tri_df = absolute_corr_matrix.mask(mask)

# Find the columns to drop.
to_drop = [c for c in tri_df.columns if any(tri_df[c] > 0.95)]
print(to_drop)

['radius_mean', 'perimeter_mean', 'area_mean', 'radius_se', 'radius_worst', 'perimeter_worst']


In [24]:
# Create a new reduced dataframe.
reduced_df = df.drop(to_drop, axis=1)

In [27]:
print(reduced_df.head())

   diagnosis  texture_mean  smoothness_mean  compactness_mean  concavity_mean  \
0          1         10.38          0.11840           0.27760          0.3001   
1          1         17.77          0.08474           0.07864          0.0869   
2          1         21.25          0.10960           0.15990          0.1974   
3          1         20.38          0.14250           0.28390          0.2414   
4          1         14.34          0.10030           0.13280          0.1980   

   concave points_mean  symmetry_mean  fractal_dimension_mean  texture_se  \
0              0.14710         0.2419                 0.07871      0.9053   
1              0.07017         0.1812                 0.05667      0.7339   
2              0.12790         0.2069                 0.05999      0.7869   
3              0.10520         0.2597                 0.09744      1.1560   
4              0.10430         0.1809                 0.05883      0.7813   

   perimeter_se  ...  symmetry_se  fractal_dimensi

In [64]:
# Divide dataset into two dataframes.
# X contains independent variables.
X=reduced_df.iloc[:,1:24].values 
# Y is the dependent variable.
Y=reduced_df.iloc[:,0].values


In [65]:
from sklearn.model_selection import train_test_split

# Attempt to split dataset into training and test dataset.
X_train, X_test, Y_train, Y_test= train_test_split(X,Y, test_size=0.25, random_state=0)

In [66]:
# Standardize features by removing the mean and scaling to unit variance
# z = (x - u) / s
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.fit_transform(X_test)

In [67]:
# Create function to train a logistic regression model on the specified training dataset.
def logreg (X_train, Y_train):
    """
        Trains a logisitic regretion model.
        
        Parameters
        ------
        X_train : The training dataset of independent variables.
        Y_train : The training dataset of dependent variable.
    """
    from sklearn.linear_model import LogisticRegression
    # Create a logistic regression model.
    log=LogisticRegression (random_state=0)
    # Train the model on the specified training dataset.
    log.fit(X_train, Y_train)
    return log

In [68]:
# Create and train a logistic regression model.
logreg = logreg(X_train, Y_train)

In [69]:
def findAccuracy(Y_test, prediction):
    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(Y_test, prediction)
    # In our case 'Malignant' is a positive class.

    # The diagonis was 'Malignat' and predicted as 'Malignant'.
    TP=cm[0][0]
    # The diagonis was 'Not Malignat' and predicted as 'Not Malignant'.
    TN=cm[1][1]
    # The diagonis was 'Malignat' and predicted as 'Not Malignant'.
    FN=cm[1][0]
    # The diagonis was 'Not Malignat' and predicted as 'Malignant'.
    FP=cm[0][1]
    return (TP+TN)/(TP+TN+FN+FP)

In [70]:
# Predict the dependent variable using the previously trained logistic regression model.
logreg_prediction = logreg.predict(X_test)

In [71]:
print("Testing accuracy of logistic regression model=", findAccuracy(Y_test, logreg_prediction))

Testing accuracy of logistic regression model= 0.951048951048951


In [84]:
# Create function to train a Decision tree classified model on the specified training dataset.
def dectree (X_train, Y_train):
    """
        Trains a Decision tree classifier model.
        
        Parameters
        ------
        X_train : The training dataset of independent variables.
        Y_train : The training dataset of dependent variable.
    """
    from sklearn import tree
    # Create a Decision Tree Classification model.
    clf = tree.DecisionTreeClassifier(criterion="entropy", random_state=0)
    # Train the model on the specified training dataset.
    clf.fit(X_train, Y_train)
    return clf

In [85]:
# Create and train a Decision Tree Classifier model.
dectree = dectree(X_train, Y_train)

In [86]:
# Predict the dependent variable using the previously trained Decision Tree Classification model.
dectree_prediction = dectree.predict(X_test)

In [87]:
print("Testing accuracy of Decision Tree Classification model=", findAccuracy(Y_test, dectree_prediction))

Testing accuracy of Decision Tree Classification model= 0.9300699300699301


In [80]:
# Create function to train a Random forest classification model on the specified training dataset.
def randfor (X_train, Y_train):
    """
        Trains a Random forest classification model.
        
        Parameters
        ------
        X_train : The training dataset of independent variables.
        Y_train : The training dataset of dependent variable.
    """
    from sklearn.ensemble import RandomForestClassifier
    # Create a Random Forest Classification model.
    clf = RandomForestClassifier(random_state=0)
    # Train the model on the specified training dataset.
    clf.fit(X_train, Y_train)
    return clf

In [81]:
# Create and train a Random Forest Classifier model.
randfor = randfor(X_train, Y_train)

In [82]:
# Predict the dependent variable using the previously trained Random Forest Classification model.
randfor_prediction = randfor.predict(X_test)

In [83]:
print("Testing accuracy of Random Forest model=", findAccuracy(Y_test, randfor_prediction))

Testing accuracy of Random Forest model= 0.965034965034965
