In [8]:
import sys
sys.path.append("../tools/")

import pickle
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import pylab as pl
import pandas as pd
import numpy as np
import warnings
import random
import os
from PIL import Image
from scipy import stats
from sklearn import model_selection as ms
from time import time

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn import datasets
from custom_procs import *
from email_preprocess import preprocess

# Study examples

## Udacity sample

In [None]:
# Build the train/test splits for the terrain example.
# Note the unpacking order here is (features_train, labels_train, features_test, labels_test),
# which differs from the order used with preprocess() later in this notebook.
# NOTE(review): makeTerrainData presumably comes from the `custom_procs` star import - confirm.
features_train, labels_train, features_test, labels_test = makeTerrainData()

In [None]:
# Train an SVM on the terrain data with a large C and gamma=1
# (fit() hands back the estimator itself, so construction and training can be chained).
clf = SVC(C=10000, gamma=1).fit(features_train, labels_train)

# Draw the decision boundary, report accuracy on the held-out split, then show the picture.
# NOTE(review): prettyPicture presumably writes '<img_title>.png', which is the file
# opened on the last line - confirm against custom_procs.
img_name = 'SVM_plot'
test_points = prettyPicture(
    clf,
    features_test,
    labels_test,
    img_title=img_name,
)
print(f'Accuracy score: {clf.score(features_test,labels_test)}')
Image.open(f'{img_name}.png')

## Sklearn sample

In [34]:
# Load the iris dataset bundled with scikit-learn; the cells below index it like a dict.
ds = datasets.load_iris()

In [35]:
# List the fields available on the loaded dataset object.
ds.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])

In [36]:
# Shapes of the feature matrix and of the label vector, in that order.
tuple(ds[field].shape for field in ('data', 'target'))

((150, 4), (150,))

In [37]:
# Map each distinct numeric class label to its species name.
dict(zip(np.unique(ds['target']), ds['target_names']))

{0: 'setosa', 1: 'versicolor', 2: 'virginica'}

In [38]:
# Print the dataset's bundled free-text description (printed rather than displayed
# so that newlines render as line breaks).
print(ds['DESCR'])

Iris Plants Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20  0.76     0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :Date: July, 1988

This is a copy of UCI ML iris d

In [39]:
# Split iris into train/test parts (train_test_split's default proportions are used).
# A fixed random_state makes the shuffle - and therefore the score reported below -
# reproducible across kernel restarts; without it every run yields a different split.
X_train, X_test, y_train, y_test = ms.train_test_split(
    ds['data'],
    list(ds['target']),
    random_state=42,
)

In [42]:
# SVM with all-default hyper-parameters, trained on the iris training split.
# fit() is left as the last expression so the fitted estimator's parameters are displayed.
clf = SVC()
clf.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [44]:
# Score the fitted classifier on the held-out iris split.
clf.score(X_test,y_test)

1.0

# Mini-project

This is the code to accompany the Lesson 2 (SVM) mini-project.

Use an SVM to identify the author of each email from the Enron corpus:
- Sara has label 0
- Chris has label 1

In [34]:
# Load the email features and author labels via email_preprocess.preprocess().
# The return order is (features_train, features_test, labels_train, labels_test);
# the call also prints how many training emails belong to each author.
features_train, features_test, labels_train, labels_test = preprocess()

no. of Chris training emails: 7936
no. of Sara training emails: 7884


In [35]:
# Sanity check: the number of feature rows should equal the number of labels.
features_train.shape, len(labels_train)

((15820, 3785), 15820)

## Linear kernel with raw features

In [5]:
# Baseline: linear-kernel SVM trained on the features exactly as returned by preprocess().
# fit() is left as the last expression so the fitted estimator's parameters are displayed.
clf = SVC(kernel = 'linear')
clf.fit(features_train, labels_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [6]:
# Score the linear-kernel baseline on the held-out emails.
clf.score(features_test, labels_test)

0.9840728100113766

## Linear kernel with standardised and PCA-reduced features

In [9]:
# Learn the scaling statistics from the training split only, then apply the same
# scaling to both splits so the test data never influences the fit.
# NOTE: this overwrites features_train/features_test, so re-running this cell on its
# own re-scales data that has already been scaled.
scaler = StandardScaler().fit(features_train)
features_train, features_test = map(scaler.transform, (features_train, features_test))

In [10]:
# A fractional n_components asks PCA to keep as many components as needed to
# explain that share of the variance (here 95%).
pca = PCA(n_components=0.95)

In [11]:
# Fit the PCA projection on the (standardised) training features only.
pca.fit(features_train)

PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [12]:
# How many components the fitted PCA actually kept.
pca.n_components_

2077

In [13]:
# Project both splits onto the principal components learned from the training data.
# NOTE: overwrites features_train/features_test with their reduced representation.
features_train, features_test = (
    pca.transform(split) for split in (features_train, features_test)
)

In [20]:
# Same linear-kernel SVM as the baseline, now trained on the standardised,
# PCA-projected features produced by the preceding cells.
clf = SVC(kernel = 'linear')
clf.fit(features_train, labels_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [21]:
# Score on the held-out emails, which were put through the same scaler and PCA.
clf.score(features_test, labels_test)

0.9920364050056882

## Linear kernel with 1/100 of the training samples (raw features)

In [28]:
# Reload the untouched splits first: earlier cells overwrite features_train and
# features_test with standardised, PCA-projected arrays, whereas this section is
# meant to work on the raw features. Reloading also makes the cell safe to re-run.
features_train, features_test, labels_train, labels_test = preprocess()

# Keep only the first 1% of the training examples; the test split is left whole.
n_subset = len(features_train) // 100
features_train = features_train[:n_subset]
labels_train = labels_train[:n_subset]

In [30]:
# Linear-kernel SVM trained on the reduced training set, scored on the full test split.
clf = SVC(kernel='linear').fit(features_train, labels_train)
clf.score(features_test, labels_test)

0.8845278725824801

## RBF kernel with 1/100 of the training samples (raw features)

In [31]:
# RBF-kernel SVM with default regularisation, trained on the reduced training set.
clf = SVC(kernel='rbf').fit(
    features_train,
    labels_train,
)
clf.score(features_test, labels_test)

0.6160409556313993

In [33]:
# Same RBF-kernel SVM, but with a much larger C (10000 instead of the default 1.0).
clf = SVC(C=10000, kernel='rbf').fit(features_train, labels_train)
clf.score(features_test, labels_test)

0.8924914675767918

## RBF kernel using PCA retaining 70% of the variance

In [36]:
# Standardise both splits using statistics learned from the training split only.
# NOTE(review): features_train/features_test are whatever the earlier cells left
# behind at this point - confirm that is the intended input for this section.
scaler = StandardScaler()
features_train = scaler.fit_transform(features_train)
features_test = scaler.transform(features_test)

In [37]:
# Keep as many principal components as needed to explain 70% of the variance,
# learned from the training features only.
pca = PCA(n_components=0.7)
pca.fit(features_train)

PCA(copy=True, iterated_power='auto', n_components=0.7, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [38]:
# How many components the fitted PCA actually kept.
pca.n_components_

788

In [39]:
# Replace both splits with their projection onto the retained components.
features_train, features_test = map(pca.transform, (features_train, features_test))

In [None]:
# RBF-kernel SVM trained on the standardised, PCA-projected features.
clf = SVC(kernel = 'rbf')
clf.fit(features_train, labels_train)
# Fixed typo: the original scored against the undefined name `labelst_test`,
# which raises NameError before any score is produced.
clf.score(features_test, labels_test)