# Session 13 - Kernel SVM

## Kernel parameterizations
- **Linear kernel:** $K(x,z)=x^Tz$ with $\phi_{linear}(x)=x$.
- **Quadratic kernel:** $K(x,z)=(x^Tz+1)^2$. For $x=(x_1,x_2)$, one valid mapping is:
  $$\phi_{quad}(x)=[x_1^2,\sqrt{2}x_1x_2,x_2^2,\sqrt{2}x_1,\sqrt{2}x_2,1]$$
  so $\phi_{quad}(x)^T\phi_{quad}(z)=(x^Tz+1)^2$.


In [1]:
data_link = "https://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt"

In [6]:
import re
from io import StringIO
from pathlib import Path
import pandas as pd
import requests

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

COLS = ['variance', 'skewness', 'curtosis', 'entropy', 'class']

def _normalize_columns(df):
    if df.shape[1] == 5:
        if 'class' not in [str(c).lower() for c in df.columns]:
            df.columns = COLS
    return df

def _try_parse_text(text):
    # Try comma-separated first, then whitespace-separated.
    for kwargs in ({}, {'sep': r'\s+', 'engine': 'python'}):
        try:
            df = pd.read_csv(StringIO(text), **kwargs)
            if df.shape[1] == 5:
                return _normalize_columns(df)
        except Exception:
            pass
    return None

def load_banknote_data(source):
    if str(source).startswith(('http://', 'https://')):
        r = requests.get(source, timeout=20)
        r.raise_for_status()
        text = r.text

        # Direct parse as CSV or whitespace text.
        df = _try_parse_text(text)
        if df is not None:
            return df

        # Try HTML table parse.
        try:
            tables = pd.read_html(text)
            for t in tables:
                if t.shape[1] >= 5:
                    t = t.iloc[:, :5].copy()
                    t.columns = COLS
                    return t
        except Exception:
            pass

        # Try extracting linked data files from HTML.
        links = re.findall(r'href=["\']([^"\']+)', text, flags=re.IGNORECASE)
        for link in links:
            if any(ext in link.lower() for ext in ['.csv', '.txt', '.data']):
                candidate = requests.compat.urljoin(source, link)
                rr = requests.get(candidate, timeout=20)
                if rr.ok:
                    df = _try_parse_text(rr.text)
                    if df is not None:
                        return df

    raise ValueError('Could not parse dataset from data_link. Provide a direct file path/URL to tabular data.')

bankdata = load_banknote_data(data_link)
X = bankdata.drop('class', axis=1)
y = bankdata['class']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print('Loaded shape:', bankdata.shape)
display(bankdata.head())


Loaded shape: (1371, 5)


Unnamed: 0,variance,skewness,curtosis,entropy,class
0,4.5459,8.1674,-2.4586,-1.4621,0
1,3.866,-2.6383,1.9242,0.10645,0
2,3.4566,9.5228,-4.0112,-3.5944,0
3,0.32924,-4.4552,4.5718,-0.9888,0
4,4.3684,9.6718,-3.9606,-3.1625,0


In [3]:
# Simple kernel SVM (RBF)
rbf_model = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42))
])

rbf_model.fit(X_train, y_train)
rbf_pred = rbf_model.predict(X_test)
rbf_acc = accuracy_score(y_test, rbf_pred)

print(f'RBF accuracy: {rbf_acc:.4f}')
print('Confusion matrix:\n', confusion_matrix(y_test, rbf_pred))
print('Classification report:\n', classification_report(y_test, rbf_pred))


RBF accuracy: 1.0000
Confusion matrix:
 [[153   0]
 [  0 122]]
Classification report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       153
           1       1.00      1.00      1.00       122

    accuracy                           1.00       275
   macro avg       1.00      1.00      1.00       275
weighted avg       1.00      1.00      1.00       275



In [4]:
# Compare linear, quadratic (poly degree=2), and radial basis kernels
models = {
    'linear': SVC(kernel='linear', C=1.0, random_state=42),
    'quadratic': SVC(kernel='poly', degree=2, coef0=1.0, C=1.0, gamma='scale', random_state=42),
    'rbf': SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42),
}

rows = []
for name, svc in models.items():
    clf = Pipeline([('scaler', StandardScaler()), ('svc', svc)])
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    rows.append((name, accuracy_score(y_test, pred)))

results = pd.DataFrame(rows, columns=['kernel', 'accuracy']).sort_values('accuracy', ascending=False)
display(results)
best = results.iloc[0]
print(f"Largest contribution to accuracy: {best['kernel']} (accuracy={best['accuracy']:.4f})")


Unnamed: 0,kernel,accuracy
1,quadratic,1.0
2,rbf,1.0
0,linear,0.985455


Largest contribution to accuracy: quadratic (accuracy=1.0000)


In [None]:
# YOSEMITE DATA
import glob
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, SVR
from sklearn.metrics import accuracy_score, r2_score
from sklearn.linear_model import LinearRegression

def load_yosemite_dataframe():
    txt_files = sorted(glob.glob('yosemite-temperatures/yosemite_village/*.txt'))

    cols = [
        'WBANNO','UTC_DATE','UTC_TIME','LST_DATE','LST_TIME','CRX_VN','LONGITUDE','LATITUDE',
        'AIR_TEMPERATURE','PRECIPITATION','SOLAR_RADIATION','SR_FLAG','SURFACE_TEMPERATURE','ST_TYPE',
        'ST_FLAG','RELATIVE_HUMIDITY','RH_FLAG','SOIL_MOISTURE_5','SOIL_TEMPERATURE_5','WETNESS',
        'WET_FLAG','WIND_1_5','WIND_FLAG'
    ]

    frames = []
    for f in txt_files:
        dfi = pd.read_csv(f, sep=r'\s+', engine='python', header=None, names=cols)
        frames.append(dfi)

    df = pd.concat(frames, ignore_index=True)
    print(f'Loaded raw Yosemite files: {len(txt_files)} files')
    return df

yosemite = load_yosemite_dataframe()

df = yosemite[['AIR_TEMPERATURE', 'LST_DATE']].copy()
df['AIR_TEMPERATURE'] = pd.to_numeric(df['AIR_TEMPERATURE'], errors='coerce')
df = df[(df['AIR_TEMPERATURE'].notna()) & (df['AIR_TEMPERATURE'] > -900)]

date_str = df['LST_DATE'].astype(str).str.replace('.0', '', regex=False)
parsed = pd.to_datetime(date_str, format='%Y%m%d', errors='coerce')
df = df[parsed.notna()].copy()
df['month'] = parsed[parsed.notna()].dt.month.values

# Downsample for tractable runtime.
MAX_ROWS = 5000
if len(df) > MAX_ROWS:
    df = df.sample(n=MAX_ROWS, random_state=42)

X = df[['AIR_TEMPERATURE']].values
y = df['month'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

models = {
    'linear': SVC(kernel='linear', C=1, random_state=42),
    'quadratic': SVC(kernel='poly', degree=2, coef0=1, C=1, gamma='scale', random_state=42),
    'rbf': SVC(kernel='rbf', C=1, gamma='scale', random_state=42),
}

svm_results = []
best_name, best_acc = None, -1
for name, svc in models.items():
    pipe = Pipeline([('scaler', StandardScaler()), ('svc', svc)])
    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_test)
    acc = accuracy_score(y_test, pred)
    svm_results.append({'kernel': name, 'test_accuracy': acc})
    if acc > best_acc:
        best_name, best_acc = name, acc

svm_df = pd.DataFrame(svm_results).sort_values('test_accuracy', ascending=False)
print('SVM month-classification using raw temperature:')
print(svm_df.to_string(index=False))
print(f'Best SVM accuracy: {best_acc:.4f} ({best_name} kernel)')

svr = Pipeline([('scaler', StandardScaler()), ('svr', SVR(kernel='rbf', C=10, gamma='scale', epsilon=0.1))])
svr.fit(X_train, y_train)
svr_r2 = r2_score(y_test, svr.predict(X_test))

lin = LinearRegression()
lin.fit(X_train, y_train)
lin_r2 = r2_score(y_test, lin.predict(X_test))

print(f'SVR R^2: {svr_r2:.4f}')
print(f'Linear model R^2: {lin_r2:.4f}')
if svr_r2 > lin_r2:
    print('SVR outperforms the linear model on R^2 for this split.')
elif svr_r2 < lin_r2:
    print('Linear model outperforms SVR on R^2 for this split.')
else:
    print('SVR and linear model tie on R^2 for this split.')


Loaded raw Yosemite files: 6 files
SVM month-classification using raw temperature:
   kernel  test_accuracy
      rbf          0.199
quadratic          0.196
   linear          0.186
Best SVM accuracy: 0.1990 (rbf kernel)
SVR R^2: -0.1240
Linear model R^2: 0.0336
Linear model outperforms SVR on R^2 for this split.
