# Feature Engineering

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Load the dataset; the relative path is resolved against the notebook's working directory.
df = pd.read_csv("../data/dataset1.csv")
# head(-1) returns every row except the last one (pandas abbreviates the rendered table).
df.head(-1)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,25.38,17.33,184.60,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.11890
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.99,23.41,158.80,1956.0,0.1238,0.1866,0.2416,0.1860,0.2750,0.08902
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.57,25.53,152.50,1709.0,0.1444,0.4245,0.4504,0.2430,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.91,26.50,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.17300
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.54,16.67,152.20,1575.0,0.1374,0.2050,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
563,926125,M,20.92,25.09,143.00,1347.0,0.10990,0.22360,0.31740,0.14740,...,24.29,29.41,179.10,1819.0,0.1407,0.4186,0.6599,0.2542,0.2929,0.09873
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.45,26.40,166.10,2027.0,0.1410,0.2113,0.4107,0.2216,0.2060,0.07115
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,23.69,38.25,155.00,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,18.98,34.12,126.70,1124.0,0.1139,0.3094,0.3403,0.1418,0.2218,0.07820


## Task 1: Mean Imputation

In [3]:
def mean_imputation(data):
    """
    Impute missing values in the input DataFrame by filling in the mean value of each column.

    Every numeric column is filled with its own mean. Means are matched to
    columns by label, not by position, so the result does not depend on where
    (or whether) non-numeric columns such as a string label appear in the frame.
    Non-numeric columns are returned unchanged.

    Parameters:
    data (pandas.DataFrame): Input data with missing values.

    Returns:
    pandas.DataFrame: A new DataFrame in which the missing values of each
                        numeric column are replaced by that column's mean.
                        The input DataFrame is not modified.
    """
    # numeric_only=True skips string columns explicitly; recent pandas versions
    # raise instead of silently dropping them when taking the mean.
    column_means = data.mean(numeric_only=True)
    # Passing a Series to fillna aligns the fill values on column labels and
    # returns a new frame, so the caller's data is left untouched.
    return data.fillna(column_means)

In [4]:
# mean_imputation returns a new frame, so df keeps its NaNs and df1 holds the imputed data.
df1 = mean_imputation(df)

In [5]:
# Compare two originally-missing cells before and after imputation, addressed by
# (row position, column position). A single .iloc[row, col] lookup replaces
# .iloc[row][col], whose positional Series[int] access is deprecated in pandas.
print("old values:", df.iloc[533, 2], df.iloc[548, 4])
print("new replaced values:", df1.iloc[533, 2], df1.iloc[548, 4])

old values: nan nan
new replaced values: 14.116125000000011 92.02346830985917


## Task 2: Normalization

In [6]:
def normalize(data):
    """
    Standardize the features of the input data (z-score normalization).

    Each feature column has its mean subtracted and is divided by its
    population standard deviation (ddof=0), so that it is centered around zero
    with unit variance. A constant feature (standard deviation of 0) is only
    centered, which maps it to 0 instead of producing NaN from a 0/0 division.

    Parameters:
    data (pandas.DataFrame): Input data to be normalized. Must contain a
        'diagnosis' column holding 'M' / 'B'; every other column is treated
        as a numeric feature.

    Returns:
    Tuple: A tuple containing the normalized features, the labels encoded as
            1 ('M') and -1 ('B'), the mean of each feature, and the standard
            deviation of each feature (the true value, including zeros).
    """
    y = data['diagnosis'].map({'M': 1, 'B': -1})
    X = data.drop(columns=['diagnosis'])
    mean = X.mean(axis=0)
    # ddof=0 keeps the population standard deviation that np.std computes
    stddev = X.std(axis=0, ddof=0)
    # Divide constant columns by 1 so they become 0 rather than NaN
    divisor = stddev.where(stddev != 0, 1.0)
    X = (X - mean) / divisor
    return X, y, mean, stddev

In [7]:
# Standardize the features and keep the per-column mean / standard deviation that were used.
# NOTE(review): this is called on the raw df, which still contains NaNs, rather than
# on the imputed df1 — confirm that this is intended.
df2, y, mean, stddev = normalize(df)
df2.head()

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,-0.236405,1.102422,-2.073335,1.268389,0.984375,1.568466,3.283515,2.650837,2.532475,2.217515,...,1.88669,-1.359293,2.303601,1.996898,1.307686,2.616665,2.107374,2.296076,2.750622,1.937015
1,-0.236403,1.836635,-0.353632,1.684639,1.908708,-0.826962,-0.487072,-0.025417,0.548144,0.001392,...,1.805927,-0.369203,1.535126,1.88631,-0.375612,-0.430444,-0.147968,1.087084,-0.24389,0.28119
2,0.431741,1.586206,0.456187,1.565122,1.558884,0.94221,1.052926,1.361666,2.037231,0.939685,...,1.51187,-0.023974,1.347475,1.452735,0.527407,1.082932,0.85334,1.955,1.152255,0.201391
3,0.432121,-0.76726,0.253732,-0.595257,-0.764464,3.283553,3.402909,1.913989,1.451707,2.867383,...,-0.281464,0.133984,-0.249939,-0.550662,3.394275,3.893397,1.987485,2.175786,6.046041,4.93501
4,0.432201,1.756953,-1.151816,1.775308,1.826229,0.280372,0.53934,1.369198,1.428493,-0.00956,...,1.298575,-1.46677,1.338539,1.217516,0.220556,-0.313395,0.611645,0.729259,-0.868353,-0.3971


## Train-Test Split

In [18]:
def train_test_split(df, train_size=0.67, shuffle=True, random_state=42):
    """
    Splits a DataFrame into training and test sets.

    Args:
        df (pandas.DataFrame): The DataFrame to split. Must contain a 'diagnosis'
            column holding 'M' / 'B'; all other columns are treated as features.
        train_size (float, optional): The fraction of the data to include in the training set. Defaults to 0.67.
        shuffle (bool, optional): Whether to shuffle the data before splitting. Defaults to True.
        random_state (int, optional): The random seed to use for shuffling the data. Defaults to 42.

    Returns:
        tuple: A tuple containing the following four elements:
            X_train (pandas.DataFrame): The feature matrix of the training set.
            X_test (pandas.DataFrame): The feature matrix of the test set.
            y_train (pandas.Series): The target series of the training set.
            y_test (pandas.Series): The target series of the test set.

    Raises:
        ValueError: If train_size is not between 0 and 1 (inclusive).

    The data is split as given; no imputation or normalization is applied here.
    The first int(len(df) * train_size) rows form the training set and the
    remaining rows the test set. If shuffle is True, the rows are randomly
    permuted before splitting (original index labels are kept), with
    random_state seeding the permutation for reproducibility. Labels are
    encoded as 1 for 'M' and -1 for 'B'.
    """
    # Enforce the documented contract; out-of-range fractions previously produced
    # silently wrong splits (e.g. negative slicing) instead of an error.
    if not 0 <= train_size <= 1:
        raise ValueError("train_size must be between 0 and 1, got {!r}".format(train_size))
    if shuffle:
        df = df.sample(frac=1, random_state=random_state)
    y = df['diagnosis'].map({'M': 1, 'B': -1})
    X = df.drop(columns=['diagnosis'])
    split_idx = int(len(df) * train_size)
    # .iloc makes the split explicitly positional, independent of the index labels
    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
    return X_train, X_test, y_train, y_test

In [19]:
# Split with the defaults (train_size=0.67, shuffle=True, random_state=42).
# NOTE(review): the input is the imputed but un-normalized df1 — confirm whether the
# normalized features were meant to be split instead.
X_train, X_test, y_train, y_test = train_test_split(df1)

In [20]:
# Training features; rows keep their original index labels after shuffling.
X_train

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
204,87930,12.47,18.60,81.09,481.9,0.09965,0.10580,0.080050,0.038210,0.1925,...,14.97,24.64,96.05,677.9,0.14260,0.23780,0.26710,0.10150,0.3014,0.08750
70,859575,18.94,21.31,123.60,1130.0,0.09009,0.10290,0.108000,0.079510,0.1582,...,24.86,26.58,165.90,1866.0,0.11930,0.23360,0.26870,0.17890,0.2551,0.06589
131,8670,15.46,19.48,101.70,748.9,0.10920,0.12230,0.146600,0.080870,0.1931,...,19.26,26.00,124.90,1156.0,0.15460,0.23940,0.37910,0.15140,0.2837,0.08019
431,907915,12.40,17.68,81.47,467.8,0.10540,0.13160,0.077410,0.027990,0.1811,...,12.88,22.91,89.61,515.8,0.14500,0.26290,0.24030,0.07370,0.2556,0.09359
540,921385,11.54,14.44,74.65,402.9,0.09984,0.11200,0.067370,0.025940,0.1818,...,12.26,19.68,78.78,457.8,0.13450,0.21180,0.17970,0.06918,0.2329,0.08134
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437,909220,14.04,15.98,89.78,611.2,0.08458,0.05895,0.035340,0.029440,0.1714,...,15.66,21.58,101.20,750.0,0.11950,0.12520,0.11170,0.07453,0.2725,0.07234
258,887181,15.66,23.20,110.20,773.5,0.11090,0.31140,0.317600,0.137700,0.2495,...,19.85,31.64,143.70,1226.0,0.15040,0.51720,0.61810,0.24620,0.3277,0.10190
232,88203002,11.22,33.81,70.79,386.8,0.07780,0.03574,0.004967,0.006434,0.1845,...,12.36,41.78,78.44,470.9,0.09994,0.06885,0.02318,0.03002,0.2911,0.07307
115,864685,11.93,21.53,76.53,438.6,0.09768,0.07849,0.033280,0.020080,0.1688,...,13.67,26.15,87.54,583.0,0.15000,0.23990,0.15030,0.07247,0.2438,0.08541


In [11]:
# Training labels, encoded as 1 for 'M' and -1 for 'B'.
y_train

204   -1
70     1
131    1
431   -1
540   -1
      ..
437   -1
258    1
232   -1
115   -1
120   -1
Name: diagnosis, Length: 381, dtype: int64

In [12]:
# Test features.
# NOTE(review): the saved output of this cell shows standardized values while the saved
# X_train output shows raw values, and the cell execution counts are out of order — the
# outputs appear to come from different runs. Restart the kernel and run all cells.
X_test

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
429,0.483528,-0.397657,-0.376903,-0.455534,-0.436823,-1.238299,-1.120430,-0.941091,-0.788500,-1.287391,...,-0.507184,-0.768171,-0.547798,-0.518046,-1.120822,-1.006469,-1.141216,-1.195078,-1.190291,-1.313123
376,-0.235933,-1.010039,0.216499,-0.902262,-0.900412,-0.400681,1.168530,1.747317,0.270601,1.374147,...,-1.122220,-0.465281,-0.915951,-0.931108,-0.792053,0.684709,1.586534,0.485634,-0.491410,1.997972
224,-0.172593,-0.241001,-0.528162,-0.308274,-0.308554,-0.847600,-1.030979,-0.670718,-0.628319,-1.152306,...,-0.233835,-0.338263,-0.250833,-0.303514,-0.209036,-0.783502,-0.450483,-0.271433,-0.638628,-0.427578
492,-0.235831,1.109089,0.295619,1.088009,1.001440,0.266139,0.465430,0.352732,0.740309,1.111279,...,1.089422,0.062333,1.076424,0.957652,-0.064377,-0.137184,-0.086661,0.522178,0.566618,-0.426470
412,-0.235899,-1.344144,0.556251,-1.331253,-1.098077,-1.186348,-0.830283,-0.647977,-1.129548,-1.962815,...,-1.305488,0.376621,-1.210830,-1.020613,-1.041918,-0.417085,-0.411125,-1.354653,-0.848940,0.449654
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,-0.236266,-1.489122,-1.082004,-1.370852,-1.168611,0.104593,0.924055,-0.035993,-0.521016,0.329977,...,-1.353531,-1.629614,-1.331463,-1.049803,-0.511503,-0.067845,-0.619435,-1.016318,-1.046309,1.355149
106,-0.236239,-0.705272,-0.223317,-0.695191,-0.689379,1.269571,-0.050051,-0.228973,-0.362899,-0.038768,...,-0.648001,0.583433,-0.647878,-0.632521,1.597003,0.074651,0.071251,0.109537,-0.153294,0.389251
270,-0.171812,0.049525,-0.574704,-0.071092,-0.063392,-2.282296,-1.470464,-1.026149,-1.100607,-1.108494,...,-0.281464,-0.818652,-0.381891,-0.346069,-2.047074,-1.297121,-1.122161,-1.237560,-0.716282,-1.260478
435,-0.235875,-0.038772,0.076875,-0.037267,-0.157532,0.686015,0.169787,0.297451,0.405245,-0.520693,...,0.159621,0.834212,0.197742,-0.021282,1.268234,0.652266,0.645302,1.036837,0.450138,1.194443


In [13]:
# Test labels, encoded as 1 for 'M' and -1 for 'B'.
y_test

429   -1
376   -1
224   -1
492    1
412   -1
      ..
71    -1
106   -1
270   -1
435    1
102   -1
Name: diagnosis, Length: 188, dtype: int64

We can see that the data has been shuffled and split in a roughly 2:1 ratio (381 training rows and 188 test rows).

## Evaluate Function

In [14]:
def evaluate(y_true, y_pred):
    """
    Computes and prints the confusion matrix, accuracy, precision, and recall
    for a given set of true and predicted target values.

    Labels are expected to be 1 (positive) and -1 (negative). Pairs containing
    any other value are not counted in the confusion matrix, but still count
    towards the accuracy denominator, which is len(y_true).

    A metric whose denominator is zero (no predicted positives for precision,
    no actual positives for recall, empty input for accuracy) is reported as
    0.0 instead of raising ZeroDivisionError.

    Args:
        y_true (numpy array): An array of true target values.
        y_pred (numpy array): An array of predicted target values.

    Returns:
        None

    Raises:
        None
    """
    def _safe_ratio(numerator, denominator):
        # An undefined ratio is reported as 0.0 rather than crashing the report
        return numerator / denominator if denominator else 0.0

    confusion_matrix = {"true_positive": 0, "true_negative": 0, "false_positive": 0, "false_negative": 0}

    for true_value, pred_value in zip(y_true, y_pred):
        if true_value == 1 and pred_value == 1:
            confusion_matrix["true_positive"] += 1
        elif true_value == -1 and pred_value == -1:
            confusion_matrix["true_negative"] += 1
        elif true_value == -1 and pred_value == 1:
            confusion_matrix["false_positive"] += 1
        elif true_value == 1 and pred_value == -1:
            confusion_matrix["false_negative"] += 1

    true_pos = confusion_matrix["true_positive"]
    true_neg = confusion_matrix["true_negative"]
    false_pos = confusion_matrix["false_positive"]
    false_neg = confusion_matrix["false_negative"]

    accuracy = _safe_ratio(true_pos + true_neg, len(y_true))
    # precision: share of predicted positives that are truly positive
    precision = _safe_ratio(true_pos, true_pos + false_pos)
    # recall: share of actual positives that were predicted positive
    recall = _safe_ratio(true_pos, true_pos + false_neg)

    print("Confusion Matrix: ", confusion_matrix)
    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)