# Lulu and Bea's Hw3

In [68]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing as sk_preprocessing
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [69]:
# Load the seeds measurements. The file is whitespace-delimited and is read
# with header=None, so the column names are supplied explicitly here.
# NOTE(review): 'parameters' is presumably meant to be "perimeter" -- confirm
# before renaming, since the name appears in displayed output.
file_path = './seeds_dataset.txt'
column_names = [
    'area',
    'parameters',
    'compactness',
    'length_of_kernel',
    'width_of_kernel',
    'asymmetry_coefficient',
    'length_of_kernel_groove',
    'class',
]
df = pd.read_csv(
    file_path,
    sep=r'\s+',       # runs of spaces/tabs count as one separator
    names=column_names,
    header=None,
)

In [70]:
# Preview the first rows to confirm the file parsed into the expected columns.
df.head()

Unnamed: 0,area,parameters,compactness,length_of_kernel,width_of_kernel,asymmetry_coefficient,length_of_kernel_groove,class
0,15.26,14.84,0.871,5.763,3.312,2.221,5.22,1
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956,1
2,14.29,14.09,0.905,5.291,3.337,2.699,4.825,1
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805,1
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175,1


In [71]:
# Sanity-check dimensions: one row per sample, 7 feature columns plus 'class'.
df.shape

(210, 8)

In [72]:
# Check column dtypes and non-null counts before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   area                     210 non-null    float64
 1   parameters               210 non-null    float64
 2   compactness              210 non-null    float64
 3   length_of_kernel         210 non-null    float64
 4   width_of_kernel          210 non-null    float64
 5   asymmetry_coefficient    210 non-null    float64
 6   length_of_kernel_groove  210 non-null    float64
 7   class                    210 non-null    int64  
dtypes: float64(7), int64(1)
memory usage: 13.3 KB


In [73]:
# Stratified split: a fixed number of training and test samples from each class.
SEED = 42
N_TRAIN_PER_CLASS = 50
N_TEST_PER_CLASS = 20

# A single seeded generator drives every shuffle below, so a fresh
# Restart & Run All reproduces the same split (and the same metrics).
rng = np.random.RandomState(SEED)

classes = df['class'].unique()

train_frames = []
test_frames = []

for cls in classes:
    class_subset = df[df['class'] == cls]

    # Fail loudly instead of silently slicing a short train/test set.
    assert len(class_subset) >= N_TRAIN_PER_CLASS + N_TEST_PER_CLASS, (
        f"class {cls} has only {len(class_subset)} rows"
    )

    # Shuffle within the class so the train/test membership is random.
    class_subset = class_subset.sample(frac=1, random_state=rng).reset_index(drop=True)

    train_frames.append(class_subset.iloc[:N_TRAIN_PER_CLASS])
    test_frames.append(
        class_subset.iloc[N_TRAIN_PER_CLASS:N_TRAIN_PER_CLASS + N_TEST_PER_CLASS]
    )

# Shuffle again after concatenation so the rows are not grouped by class.
train_df = pd.concat(train_frames).sample(frac=1, random_state=rng).reset_index(drop=True)
test_df = pd.concat(test_frames).sample(frac=1, random_state=rng).reset_index(drop=True)

X_train = train_df.drop('class', axis=1)
y_train = train_df['class']
X_test = test_df.drop('class', axis=1)
y_test = test_df['class']

assert len(X_train) == N_TRAIN_PER_CLASS * len(classes)
assert len(X_test) == N_TEST_PER_CLASS * len(classes)

In [74]:
# Training features after the split; the 'class' column has been dropped.
X_train.shape

(150, 7)

In [75]:
# Test features after the split; the 'class' column has been dropped.
X_test.shape

(60, 7)

In [76]:
# Standardize the features. The scaler's statistics are learned from the
# training split only and then reused unchanged on the test split.
scaler = sk_preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [77]:
# Baseline model: SVC with its default settings, fitted on the training split.
svc = SVC()
svc.fit(X_train, y_train.to_numpy().ravel())
y_pred = svc.predict(X_test)

# Show the confusion matrix first, then the per-class precision/recall/F1.
baseline_cm = confusion_matrix(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred)
print(baseline_cm)
print(baseline_report)

[[18  2  0]
 [ 1 19  0]
 [ 0  0 20]]
              precision    recall  f1-score   support

           1       0.95      0.90      0.92        20
           2       0.90      0.95      0.93        20
           3       1.00      1.00      1.00        20

    accuracy                           0.95        60
   macro avg       0.95      0.95      0.95        60
weighted avg       0.95      0.95      0.95        60



In [78]:
# Overall test accuracy of the fitted `svc`, computed from `y_pred`.
# Stored as `test_accuracy` rather than `accuracy` so the value is not
# overwritten by the `accuracy()` helper function defined in the next cell.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

Accuracy: 0.95


In [79]:
def accuracy(confusion_matrix):
    """Return overall accuracy computed from a confusion matrix.

    Accuracy is the sum of the main diagonal (correct predictions) divided
    by the sum of all entries (all predictions).

    Parameters
    ----------
    confusion_matrix : array-like
        2-D table of counts; a NumPy array or nested lists are both accepted.
        Note that inside this function the parameter name shadows any
        module-level ``confusion_matrix`` callable.

    Returns
    -------
    float
        Fraction of samples on the diagonal.

    Raises
    ------
    ValueError
        If the matrix contains no samples (accuracy would be 0/0).
    """
    # Coerce so plain nested lists work as well as ndarrays.
    matrix = np.asarray(confusion_matrix)
    total = matrix.sum()
    if total == 0:
        raise ValueError("confusion matrix contains no samples; accuracy is undefined")
    return matrix.trace() / total

# Testing error = 1 - accuracy, with accuracy derived from the confusion matrix.
# `svc` is already fitted and `y_pred` already holds its test predictions from
# the earlier cell, so they are reused instead of retraining the same model on
# the same data.
cm = confusion_matrix(y_test, y_pred)
accuracy_value = accuracy(cm)
testing_error = 1 - accuracy_value
print("Testing Error is:", testing_error)

Testing Error is: 0.050000000000000044


In [80]:
def fit_and_evaluate_svc(kernel, X_train, y_train, X_test, y_test):
    """Fit an SVC with the given kernel and evaluate it on the test split.

    Parameters
    ----------
    kernel : str
        Kernel name passed straight to ``SVC(kernel=...)``.
    X_train, y_train
        Training features and labels used to fit the model.
    X_test, y_test
        Held-out features and labels used for evaluation.

    Returns
    -------
    tuple
        ``(model, predictions, accuracy, report, cm)`` where ``report`` is the
        text from ``classification_report`` and ``cm`` is the confusion matrix.
    """
    model = SVC(kernel=kernel)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return (
        model,
        predictions,
        accuracy_score(y_test, predictions),
        classification_report(y_test, predictions),
        confusion_matrix(y_test, predictions),
    )


# Train and evaluate SVM with Linear Kernel
(linear_svc, linear_y_pred,
 linear_accuracy, linear_report, linear_cm) = fit_and_evaluate_svc(
    'linear', X_train, y_train, X_test, y_test)

# Train and evaluate SVM with Non-linear Kernel (RBF)
(non_linear_svc, non_linear_y_pred,
 non_linear_accuracy, non_linear_report, non_linear_cm) = fit_and_evaluate_svc(
    'rbf', X_train, y_train, X_test, y_test)

# Print the evaluation results for both kernels
print("Linear SVM Accuracy:", linear_accuracy)
print("Linear SVM Classification Report:\n", linear_report)
print("Linear SVM Confusion Matrix:\n", linear_cm)

print("\nNon-linear SVM Accuracy:", non_linear_accuracy)
print("Non-linear SVM Classification Report:\n", non_linear_report)
print("Non-linear SVM Confusion Matrix:\n", non_linear_cm)


Linear SVM Accuracy: 0.9166666666666666
Linear SVM Classification Report:
               precision    recall  f1-score   support

           1       0.89      0.85      0.87        20
           2       0.90      0.90      0.90        20
           3       0.95      1.00      0.98        20

    accuracy                           0.92        60
   macro avg       0.92      0.92      0.92        60
weighted avg       0.92      0.92      0.92        60

Linear SVM Confusion Matrix:
 [[17  2  1]
 [ 2 18  0]
 [ 0  0 20]]

Non-linear SVM Accuracy: 0.95
Non-linear SVM Classification Report:
               precision    recall  f1-score   support

           1       0.95      0.90      0.92        20
           2       0.90      0.95      0.93        20
           3       1.00      1.00      1.00        20

    accuracy                           0.95        60
   macro avg       0.95      0.95      0.95        60
weighted avg       0.95      0.95      0.95        60

Non-linear SVM Confusion M