# SVM

In [1]:
# Importing necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
# Accessing the dataset
url = 'https://raw.githubusercontent.com/alyshapm/coral-reef-bleaching/main/dataset/NOAA_Reef_Check__Bleaching_Data.csv'
df = pd.read_csv(url)
df.head()

Unnamed: 0,Bleaching,Ocean,Year,Depth,Storms,HumanImpact,Siltation,Dynamite,Poison,Sewage,Industrial,Commercial
0,No,Atlantic,2005,4.0,yes,high,often,none,none,high,none,none
1,No,Red Sea,2004,6.0,no,high,occasionally,none,none,low,none,none
2,No,Pacific,1998,3.0,no,low,never,none,none,none,low,none
3,No,Pacific,1998,10.0,no,low,never,none,none,none,low,none
4,No,Atlantic,1997,10.0,no,high,never,none,none,high,moderate,none


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9111 entries, 0 to 9110
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Bleaching    9111 non-null   object 
 1   Ocean        9111 non-null   object 
 2   Year         9111 non-null   int64  
 3   Depth        9111 non-null   float64
 4   Storms       9111 non-null   object 
 5   HumanImpact  9111 non-null   object 
 6   Siltation    9111 non-null   object 
 7   Dynamite     9111 non-null   object 
 8   Poison       9111 non-null   object 
 9   Sewage       9111 non-null   object 
 10  Industrial   9111 non-null   object 
 11  Commercial   9111 non-null   object 
dtypes: float64(1), int64(1), object(10)
memory usage: 854.3+ KB


# Preparing the dataset
The datatypes of the variables are all objects, so it has to be changed to int or float.

This will be done by replacing words with numbers based on their severity.

Example: None = 0, Low = 1, Moderate = 2, High = 3

In [4]:
# Replacing object in Bleaching to int
df['Bleaching'] = df['Bleaching'].str.replace('No','0')
df['Bleaching'] = df['Bleaching'].str.replace('Yes','1')
df['Bleaching'] = df['Bleaching'].astype('int') 

In [5]:
# Replacing object in Storms to int
df['Storms'] = df['Storms'].str.replace('no','0')
df['Storms'] = df['Storms'].str.replace('yes','1')
df['Storms'] = df['Storms'].astype('int')

In [6]:
# Replacing object in HumanImpact to int
df['HumanImpact'] = df['HumanImpact'].str.replace('high','3')
df['HumanImpact'] = df['HumanImpact'].str.replace('moderate','2')
df['HumanImpact'] = df['HumanImpact'].str.replace('low','1')
df['HumanImpact'] = df['HumanImpact'].str.replace('none','0')
df['HumanImpact'] = df['HumanImpact'].astype('int')

In [7]:
# Replacing object in Siltation to int
df['Siltation'] = df['Siltation'].str.replace('always','3')
df['Siltation'] = df['Siltation'].str.replace('often','2')
df['Siltation'] = df['Siltation'].str.replace('occasionally','1')
df['Siltation'] = df['Siltation'].str.replace('never','0')
df['Siltation'] = df['Siltation'].astype('int')

In [8]:
# Replacing object in Dynamite to int
df['Dynamite'] = df['Dynamite'].str.replace('high','3')
df['Dynamite'] = df['Dynamite'].str.replace('moderate','2')
df['Dynamite'] = df['Dynamite'].str.replace('low','1')
df['Dynamite'] = df['Dynamite'].str.replace('none','0')
df['Dynamite'] = df['Dynamite'].astype('int')

In [9]:
# Replacing object in Poison to int
df['Poison'] = df['Poison'].str.replace('high','3')
df['Poison'] = df['Poison'].str.replace('moderate','2')
df['Poison'] = df['Poison'].str.replace('low','1')
df['Poison'] = df['Poison'].str.replace('none','0')
df['Poison'] = df['Poison'].astype('int')

In [10]:
# Replacing object in Sewage to int
df['Sewage'] = df['Sewage'].str.replace('high','3')
df['Sewage'] = df['Sewage'].str.replace('moderate','2')
df['Sewage'] = df['Sewage'].str.replace('low','1')
df['Sewage'] = df['Sewage'].str.replace('none','0')
df['Sewage'] = df['Sewage'].astype('int')

In [11]:
# Replacing object in Industrial to int
df['Industrial'] = df['Industrial'].str.replace('high','3')
df['Industrial'] = df['Industrial'].str.replace('moderate','2')
df['Industrial'] = df['Industrial'].str.replace('low','1')
df['Industrial'] = df['Industrial'].str.replace('none','0')
df['Industrial'] = df['Industrial'].astype('int')

In [12]:
# Replacing object in Commercial to int
df['Commercial'] = df['Commercial'].str.replace('high','3')
df['Commercial'] = df['Commercial'].str.replace('moderate','2')
df['Commercial'] = df['Commercial'].str.replace('low','1')
df['Commercial'] = df['Commercial'].str.replace('none','0')
df['Commercial'] = df['Commercial'].astype('int')

In [20]:
# Checking that the datatypes of the variables have been changed
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9111 entries, 0 to 9110
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Bleaching    9111 non-null   int64  
 1   Ocean        9111 non-null   object 
 2   Year         9111 non-null   int64  
 3   Depth        9111 non-null   float64
 4   Storms       9111 non-null   int64  
 5   HumanImpact  9111 non-null   int64  
 6   Siltation    9111 non-null   int64  
 7   Dynamite     9111 non-null   int64  
 8   Poison       9111 non-null   int64  
 9   Sewage       9111 non-null   int64  
 10  Industrial   9111 non-null   int64  
 11  Commercial   9111 non-null   int64  
dtypes: float64(1), int64(10), object(1)
memory usage: 854.3+ KB


# Testing Linear Kernel



In [29]:
# Setting the target and features for the SVM
target = df["Bleaching"]
features = df.drop(["Bleaching","Ocean","Year", "Depth"], axis=1)

# Splitting the data into test and train sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.2, random_state = 10)

In [36]:
# Building the SVM on the train data
linear_model = SVC(kernel='linear')
linear_model.fit(X_train, y_train)
 
linear_prediction = linear_model.predict(X_test)

# Checking the accuracy for train and test data
print("Train score: ", linear_model.score(X_train, y_train))
print("Test score: ", linear_model.score(X_test, y_test))

# Checking the confusion matrix
print("Confusion Matrix:\n",confusion_matrix(linear_prediction, y_test))

Train score:  0.9710482985729967
Test score:  0.9758639605046626
Confusion Matrix:
 [[1779   44]
 [   0    0]]


# Testing RBF Kernel

In [37]:
rbf_model = SVC(kernel='rbf')
rbf_model.fit(X_train, y_train)

rbf_prediction = rbf_model.predict(X_test)

# Checking the accuracy for train and test data
print("Train score: ", rbf_model.score(X_train, y_train))
print("Test score: ", rbf_model.score(X_test, y_test))

# Checking the confusion matrix
print("Confusion Matrix:\n",confusion_matrix(rbf_prediction, y_test))

Train score:  0.9721459934138309
Test score:  0.9758639605046626
Confusion Matrix:
 [[1779   44]
 [   0    0]]


# Testing Polynomial Kernel

In [38]:
poly_model = SVC(kernel='poly')
poly_model.fit(X_train, y_train)

poly_prediction = poly_model.predict(X_test)

# Checking the accuracy for train and test data
print("Train score: ", poly_model.score(X_train, y_train))
print("Test score: ", poly_model.score(X_test, y_test))

# Checking the confusion matrix
print("Confusion Matrix:\n",confusion_matrix(poly_prediction, y_test))

Train score:  0.9726948408342481
Test score:  0.9758639605046626
Confusion Matrix:
 [[1779   44]
 [   0    0]]


# Testing Sigmoid Kernel

In [39]:
sigmoid_model = SVC(kernel='sigmoid')
sigmoid_model.fit(X_train, y_train)

sigmoid_prediction = sigmoid_model.predict(X_test)

# Checking the accuracy for train and test data
print("Train score: ", sigmoid_model.score(X_train, y_train))
print("Test score: ", sigmoid_model.score(X_test, y_test))

# Checking the confusion matrix
print("Confusion Matrix:\n",confusion_matrix(sigmoid_prediction, y_test))

Train score:  0.9596597145993414
Test score:  0.9626988480526605
Confusion Matrix:
 [[1753   42]
 [  26    2]]


# Conclusion

* Linear Test Score: 0.9758639605046626
* RBF Test Score: 0.9758639605046626
* Polynomial Test Score: 0.9758639605046626
* Sigmoid Test Score: 0.9626988480526605

At default values, linear, RBF, and polynomial had the same test score and confusion matrix. The sigmoid kernel produced different results, but lower ones.
