<a href="https://colab.research.google.com/github/akashbilgi/DMT/blob/main/Part2_from_scratch_DMT_SVM_wisconsindata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.utils import shuffle


In [2]:
# Location of the dataset CSV. The default looks like a Google Drive mount
# inside Colab; adjust it when running the notebook elsewhere.
DATA_PATH = '/content/drive/MyDrive/data.csv'

data = pd.read_csv(DATA_PATH)
# Report success only after the file has actually been loaded.
print("data read")

data read


In [3]:
# Remove the last and the first column of the raw frame, selected by position.
# NOTE: the selection is positional and the drop is in place, so running this
# cell a second time removes two further columns.
data.drop(columns=data.columns[[-1, 0]], inplace=True)

In [4]:
# Encode the diagnosis labels as numbers: 'M' -> +1.0 and 'B' -> -1.0.
# Values absent from the mapping become NaN (Series.map semantics).
diag_map = dict(M=1.0, B=-1.0)
data['diagnosis'] = data['diagnosis'].map(diag_map)

In [5]:
# Target vector: the numeric diagnosis column.
Y = data.loc[:, 'diagnosis']
# Feature matrix: every column after the first one. An explicit copy is taken
# because later cells drop columns from X in place; doing that on a slice of
# `data` can trigger SettingWithCopyWarning and makes it unclear which frame
# owns the values.
X = data.iloc[:, 1:].copy()

In [6]:
def remove_correlated_features(data):
    """Drop, in place, each column that is highly correlated with an earlier one.

    A column is removed when its correlation with any column positioned
    before it is at least 0.9. The signed correlation is compared, so a strong
    negative correlation does not cause a removal. A column that is itself
    removed still counts as an "earlier column" for the ones after it.

    Args:
        data: DataFrame of features; it is modified in place.

    Returns:
        The Index of column names that were removed.
    """
    threshold = 0.9

    # Boolean matrix: entry (i, j) is True when columns i and j reach the threshold.
    reaches_threshold = data.corr().values >= threshold

    # Keep only pairs with i < j (strict upper triangle), then flag column j
    # when at least one earlier column i reaches the threshold with it.
    redundant = np.triu(reaches_threshold, k=1).any(axis=0)

    columns_to_drop = data.columns[redundant]
    data.drop(columns=columns_to_drop, inplace=True)
    return columns_to_drop

In [7]:
def remove_less_significant_features(X, Y):
    """Backward-eliminate features whose OLS p-value is above 0.05.

    An ordinary least squares model of Y on X is fitted repeatedly. After each
    fit the column with the largest p-value is dropped from X (in place) if
    that p-value exceeds the significance level; otherwise elimination stops.
    At most one fit is performed per column X had on entry.

    The previous version ended with a bare ``ols_regression.summary()`` whose
    result was discarded (nothing was printed) and which raised
    AttributeError when X had no columns; that dead call has been removed.

    Args:
        X: DataFrame of features; it is modified in place.
        Y: Target values passed to ``sm.OLS`` as the dependent variable.

    Returns:
        NumPy array with the names of the dropped columns in removal order;
        an empty array when nothing was removed.
    """
    significance_level = 0.05
    dropped_columns = np.array([])
    for _ in range(len(X.columns)):
        # Refit on the remaining columns and look at the least significant one.
        p_values = sm.OLS(Y, X).fit().pvalues
        max_p_value_column = p_values.idxmax()
        if not p_values.max() > significance_level:
            # Every remaining column is significant: stop eliminating.
            break
        X.drop(max_p_value_column, axis='columns', inplace=True)
        dropped_columns = np.append(dropped_columns, [max_p_value_column])
    return dropped_columns

In [8]:
# Prune the feature matrix in place: first drop columns that are highly
# correlated with an earlier column, then drop columns whose OLS p-value is
# above the significance level. The second call is the last expression of the
# cell, so the array of column names it returns is shown as the cell output.
remove_correlated_features(X)
remove_less_significant_features(X, Y)

array(['smoothness_mean', 'compactness_worst', 'compactness_mean',
       'radius_mean', 'texture_se', 'symmetry_se', 'smoothness_se',
       'concavity_worst'], dtype='<U32')

In [9]:
# Rescale every feature to the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so the test rows influence the scaling; fit on the training rows only if a
# leakage-free evaluation is needed.
X_normalized = MinMaxScaler().fit_transform(X.values)
# Rebuild the frame with the original column names and row index so features
# stay identifiable and X remains aligned with Y.
X = pd.DataFrame(X_normalized, columns=X.columns, index=X.index)

In [10]:
# Append a constant column of ones so the bias is learned as one more weight.
# Plain assignment adds the column at the end on the first run and simply
# overwrites it on a re-run, whereas DataFrame.insert raises when the column
# already exists.
X['intercept'] = 1

In [11]:
# Split features and labels into training and test sets: 20% of the rows are
# held out for testing, and random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = tts(X, Y, test_size=0.2, random_state=42)

In [12]:
def svm(features, labels, cost_threshold=0.01, random_state=None):
    """Train a linear SVM with stochastic gradient descent.

    Weights start at zero. In every epoch the samples are shuffled and the
    weights are updated once per sample using ``calculate_loss_gradient``.
    The cost (``compute_loss`` on the full data) is evaluated at epochs
    1, 2, 4, 8, ... and at the final epoch; training stops early when the
    cost changed by less than ``cost_threshold`` relative to the previous
    evaluation.

    Reads the module-level globals ``epochs`` (the loop runs for
    epoch = 1 .. epochs - 1) and ``learn_rate`` (step size).

    Args:
        features: 2-D array-like of shape (n_samples, n_features).
        labels: 1-D array-like of n_samples target values.
        cost_threshold: Relative cost change below which training stops.
        random_state: Optional seed (or ``np.random.RandomState``) for the
            per-epoch shuffles. ``None`` keeps the previous behaviour of
            using NumPy's global random state.

    Returns:
        1-D NumPy array of learned weights, one per feature column.
    """
    # Work on plain float arrays: samples and targets are then paired purely
    # by position (indexing a pandas Series by integer is label-based and
    # could mis-pair them), and each target is an np.float64 scalar.
    features = np.asarray(features, dtype=np.float64)
    labels = np.asarray(labels, dtype=np.float64)

    # Build one generator up front so successive epochs get different, yet
    # reproducible, permutations; re-seeding per epoch would repeat the order.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Initialize weights to zeros
    weights = np.zeros(features.shape[1])
    pre_cost = float("inf")
    nth = 0
    for epoch in range(1, epochs):
        # Shuffle to prevent repeating update cycles
        shuffled_features, shuffled_labels = shuffle(features, labels, random_state=rng)
        for sample, target in zip(shuffled_features, shuffled_labels):
            # Gradient of the loss for this single sample, then one SGD step
            gradient = calculate_loss_gradient(weights, sample, target)
            weights = weights - (learn_rate * gradient)

        # Check convergence only at power-of-two epochs and at the last epoch
        if epoch == 2 ** nth or epoch == epochs - 1:
            cost = compute_loss(weights, features, labels)
            print("Epoch is: {} and Cost is: {}".format(epoch, cost))
            if abs(pre_cost - cost) < cost_threshold * pre_cost:
                return weights
            pre_cost = cost
            nth += 1
    return weights

In [13]:
def calculate_loss_gradient(weights, features_batch, labels_batch):
    """Average gradient of the regularised hinge loss over a batch.

    For each sample the contribution is ``weights`` when the margin
    constraint is satisfied (1 - y * (x . w) <= 0) and
    ``weights - regularization_param * y * x`` otherwise; the contributions
    are averaged over the batch. ``regularization_param`` is read from the
    module-level globals.

    Args:
        weights: 1-D array of current weights.
        features_batch: 2-D array of samples, or a single 1-D sample.
        labels_batch: 1-D array of targets, or a single scalar target.

    Returns:
        1-D NumPy array with the averaged gradient, same length as weights.
    """
    labels_batch = np.asarray(labels_batch, dtype=np.float64)
    features_batch = np.asarray(features_batch, dtype=np.float64)
    # A single example (stochastic gradient descent) arrives as a scalar label
    # with a 1-D feature vector. Detect it by dimensionality rather than by
    # exact type, so Python floats, ints and any NumPy scalar are all accepted.
    if labels_batch.ndim == 0:
        labels_batch = labels_batch.reshape(1)
        features_batch = features_batch.reshape(1, -1)

    distances = 1 - (labels_batch * np.dot(features_batch, weights))
    gradient = np.zeros(len(weights))

    for index, distance in enumerate(distances):
        if distance > 0:
            # Margin violated: the hinge term contributes to the gradient.
            gradient += weights - (regularization_param * labels_batch[index] * features_batch[index])
        else:
            # Margin satisfied: only the regularisation term remains.
            gradient += weights

    return gradient / len(labels_batch)  # Average

In [14]:
def compute_loss(weights, features, labels):
    """Return the regularised hinge-loss objective for the given weights.

    cost = 1/2 * (w . w) + regularization_param * mean(max(0, 1 - y * (X . w)))

    ``regularization_param`` is read from the module-level globals.

    Args:
        weights: 1-D array of weights.
        features: 2-D array of samples, one row per sample.
        labels: 1-D array of targets, one per sample.

    Returns:
        The scalar cost.
    """
    margins = labels * (np.dot(features, weights))
    # Samples on the correct side of the margin contribute nothing.
    hinge_terms = np.maximum(0, 1 - margins)
    mean_hinge = np.sum(hinge_terms) / features.shape[0]
    return 1 / 2 * np.dot(weights, weights) + regularization_param * mean_hinge

In [15]:
# Hyperparameters, read as module-level globals by svm(),
# calculate_loss_gradient() and compute_loss().

# Upper bound of the epoch loop in svm(), which runs epoch = 1 .. epochs - 1.
epochs = 5000
# Factor applied to the hinge term of the loss and of its gradient.
regularization_param = 10000
# Step size multiplying the gradient in every weight update.
learn_rate = 1e-6

In [16]:
# Sanity check: display the shape of the training labels after conversion to
# a NumPy array (the form in which they are handed to svm()).
y_train.to_numpy().shape

(455,)

In [17]:
    # Train the SVM on the training split (converted to NumPy arrays) and show
    # the learned weight vector; svm() prints the cost at its checkpoints.
    # NOTE(review): every line of this cell carries a uniform leading indent.
    # IPython appears to strip such an indent before running a cell, but the
    # same lines would be an IndentationError in a plain .py export - consider
    # dedenting the cell.
    print("training started...")
    W = svm(X_train.to_numpy(), y_train.to_numpy())
    print("training finished.")
    print("weights are: {}".format(W))

training started...
Epoch is: 1 and Cost is: 7260.28533467402
Epoch is: 2 and Cost is: 6552.066011071646
Epoch is: 4 and Cost is: 5436.905374402511
Epoch is: 8 and Cost is: 3872.7222082210196
Epoch is: 16 and Cost is: 2652.612091643618
Epoch is: 32 and Cost is: 1973.1549749831236
Epoch is: 64 and Cost is: 1600.9729727115853
Epoch is: 128 and Cost is: 1345.9894922340384
Epoch is: 256 and Cost is: 1161.0563891462016
Epoch is: 512 and Cost is: 1074.157750332031
Epoch is: 1024 and Cost is: 1046.8632548356736
Epoch is: 2048 and Cost is: 1040.8533265765
training finished.
weights are: [ 3.53675368 11.02953306 -2.29249249 -7.92214238 10.14832679 -1.29482236
 -6.44501479  2.23911226 -3.88772306  3.2394927   4.96826469  4.81821958
 -4.7239342 ]


In [18]:
print("testing the model...")
# Predict every row at once: the predicted class is the sign of the decision
# value X . W. A single matrix product replaces the per-row loops, which
# converted the whole frame to NumPy and re-allocated the result array on
# every iteration.
# A decision value of exactly 0 yields 0, which is neither of the two labels.
y_train_predicted = np.sign(np.dot(X_train.to_numpy(), W))
y_test_predicted = np.sign(np.dot(X_test.to_numpy(), W))


testing the model...


In [19]:
# Report accuracy, recall and precision of the predictions on the held-out
# test set.
# NOTE(review): recall/precision use scikit-learn's default positive label -
# presumably 1, i.e. the class encoded from 'M'; confirm this is the intent.
print("Accuracy score on the test dataset: {}".format(accuracy_score(y_test, y_test_predicted)))
print("Recall score on the test dataset: {}".format(recall_score(y_test, y_test_predicted)))
print("Precision score on the test dataset: {}".format(precision_score(y_test, y_test_predicted)))


Accuracy score on the test dataset: 0.9912280701754386
Recall score on the test dataset: 0.9767441860465116
Precision score on the test dataset: 1.0
