In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Load the Iris dataset bundled with scikit-learn and pull out the
# feature matrix and the class labels
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Hold out 30% of the samples for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=21,
)

In [2]:
# Standardize the features: the scaler's statistics are learned from the
# training split only, and the same fitted scaler is then applied to both
# splits so no information from the test split is used during fitting
sc = StandardScaler().fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [3]:
# Configure a support vector classifier with a radial basis function (RBF) kernel
# NOTE(review): a later cell runs `from sklearn import svm`, which rebinds the
# name `svm` from this classifier to the scikit-learn module
svm = SVC(
    C=1.0,
    kernel='rbf',
    random_state=42,
)

# Fit the classifier on the standardized training split
svm.fit(X_train_std, y_train)

In [4]:
# Predict a class label for every sample in the standardized test split
y_pred = svm.predict(X_test_std)

# Accuracy: the fraction of test samples whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9777777777777777


# Detecting Outliers using SVM

In [5]:
import pandas as pd
import numpy as np
from sklearn import svm

# Read the Titanic CSV from its remote URL into a DataFrame
# (this cell needs network access)
titanic_csv_url = 'https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv'
df = pd.read_csv(titanic_csv_url)


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [6]:
# Drop the 'Name' column (not used as a feature) and any rows with missing values.
# A new frame is built instead of mutating in place, and errors='ignore' turns the
# drop into a no-op when 'Name' is already gone, so this cell can be re-run safely
# (the in-place version raised KeyError on the second run).
# The explicit copy guarantees the column assignment below writes to an
# independent frame.
df = df.drop(columns=['Name'], errors='ignore').dropna().copy()

# Convert the categorical 'Sex' column to integer codes; pd.factorize numbers the
# categories in order of first appearance, so re-running on the codes is stable
df['Sex'] = pd.factorize(df['Sex'])[0]

# Split into the feature matrix (every column except 'Survived') and the target vector
X = df.drop('Survived', axis=1).values
y = df['Survived'].values

In [7]:
# Fit a one-class SVM with an RBF kernel on the feature matrix; only X is
# passed in, the target y is not used
# NOTE(review): X is used as-is, without the standardization applied to the
# Iris features earlier in this notebook — confirm this is intended
clf = svm.OneClassSVM(kernel="rbf", gamma=0.1, nu=0.1).fit(X)

# Label every sample; per the scikit-learn API, +1 marks an inlier and -1 an outlier
y_pred = clf.predict(X)

# Signed decision score of every sample relative to the learned boundary
# (lower scores are more anomalous)
distances = clf.decision_function(X)


In [9]:
# Sample positions ordered from the highest decision score to the lowest
sorted_idx = np.flip(np.argsort(distances))

# Cut-off score: the 5th percentile of all decision scores
threshold = np.percentile(distances, 5)

# Flag as outliers the rows whose score falls strictly BELOW the cut-off,
# i.e. roughly the lowest-scoring 5% of the data
is_outlier = distances < threshold
outliers = df.iloc[is_outlier]

print("Number of outliers:", len(outliers))
print("Outliers:", outliers)

Number of outliers: 44
Outliers:      Survived  Pclass  Sex    Age  Siblings/Spouses Aboard  \
6           0       1    0  54.00                        0   
77          1       2    0   0.83                        0   
119         0       2    0  21.00                        2   
124         1       3    0  12.00                        1   
163         0       3    0   1.00                        4   
165         1       1    1  48.00                        0   
167         0       1    0  60.00                        0   
179         0       3    1   8.00                        8   
217         1       1    1  32.00                        0   
225         1       2    0  19.00                        0   
235         0       2    0  44.00                        1   
237         0       2    0  19.00                        0   
243         0       3    0  30.00                        0   
269         1       3    0  25.00                        0   
274         0       3    1  45.00    

# Detecting Outliers in the Tips Dataset using SVM

In [10]:
# Read the Tips CSV from the seaborn-data GitHub repository into a DataFrame
# (this cell needs network access)
tips_csv_url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv'
df = pd.read_csv(tips_csv_url)


In [11]:
# Drop the 'day' and 'time' columns, which are not used as features.
# A new frame is built instead of mutating in place, and errors='ignore' turns the
# drop into a no-op when the columns are already gone, so this cell can be re-run
# safely (the in-place version raised KeyError on the second run).
df = df.drop(columns=['day', 'time'], errors='ignore')

# Convert the categorical columns to integer codes; pd.factorize numbers the
# categories in order of first appearance, so re-running on the codes is stable
df['sex'] = pd.factorize(df['sex'])[0]
df['smoker'] = pd.factorize(df['smoker'])[0]

In [12]:
# Feature matrix: every column except 'tip', as a NumPy array
X = df.drop(columns=['tip']).values

# Target vector: the 'tip' column, as a NumPy array
y = df.loc[:, 'tip'].values

In [13]:
# Fit a one-class SVM with an RBF kernel on the feature matrix; only X is
# passed in, the target y is not used
# NOTE(review): X is used as-is, without the standardization applied to the
# Iris features earlier in this notebook — confirm this is intended
clf = svm.OneClassSVM(kernel="rbf", gamma=0.1, nu=0.1).fit(X)

# Label every sample; per the scikit-learn API, +1 marks an inlier and -1 an outlier
y_pred = clf.predict(X)

# Signed decision score of every sample relative to the learned boundary
# (lower scores are more anomalous)
distances = clf.decision_function(X)

# Sample positions ordered from the highest decision score to the lowest
sorted_idx = np.flip(np.argsort(distances))

# Cut-off score: the 5th percentile of all decision scores
threshold = np.percentile(distances, 5)

# Flag as outliers the rows whose score falls strictly BELOW the cut-off,
# i.e. roughly the lowest-scoring 5% of the data
is_outlier = distances < threshold
outliers = df.iloc[is_outlier]

print("Number of outliers:", len(outliers))
print("Outliers:", outliers)

Number of outliers: 13
Outliers:      total_bill    tip  sex  smoker  size
67         3.07   1.00    0       1     1
88        24.71   5.85    1       0     2
93        16.32   4.30    0       1     2
141       34.30   6.70    1       0     6
142       41.19   5.00    1       0     5
143       27.05   5.00    0       0     6
156       48.17   5.00    1       0     6
159       16.49   2.00    1       0     4
170       50.81  10.00    1       1     3
184       40.55   3.00    1       1     2
185       20.69   5.00    1       0     5
203       16.40   2.50    0       1     2
240       27.18   2.00    0       1     2
