# PRÁCTICA 6 - VÍCTOR CHOZA MERINO - ADRIÁN TURIEL CHARRO
## Parte 2 - Support Vector Machines

### Librerías

In [1]:
# Agrupar aqui al final si eso
import scipy.io as sio
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np

import re
import nltk
import nltk.stem.porter

import codecs
import glob

from sklearn.model_selection import train_test_split

### Funciones del profe

In [2]:
# Archivo 1
def getVocabDict(reverse=False):
    """
    Function to read in the supplied vocab list text file into a dictionary.
    Dictionary key is the stemmed word, value is the index in the text file
    If "reverse", the keys and values are switched.
    """
    vocab_dict = {}
    with open("p6/vocab.txt") as f:
        for line in f:
            (val, key) = line.split()
            if not reverse:
                vocab_dict[key] = int(val)
            else:
                vocab_dict[int(val)] = key

    return vocab_dict

In [3]:
# Archivo 2
def preProcess(email):
    
    hdrstart = email.find("\n\n")
    if hdrstart != -1:
        email = email[hdrstart:]

    email = email.lower()
    # Strip html tags. replace with a space
    email = re.sub('<[^<>]+>', ' ', email)
    # Any numbers get replaced with the string 'number'
    email = re.sub('[0-9]+', 'number', email)
    # Anything starting with http or https:// replaced with 'httpaddr'
    email = re.sub('(http|https)://[^\s]*', 'httpaddr', email)
    # Strings with "@" in the middle are considered emails --> 'emailaddr'
    email = re.sub('[^\s]+@[^\s]+', 'emailaddr', email)
    # The '$' sign gets replaced with 'dollar'
    email = re.sub('[$]+', 'dollar', email)
    return email


def email2TokenList(raw_email):
    """
    Function that takes in a raw email, preprocesses it, tokenizes it,
    stems each word, and returns a list of tokens in the e-mail
    """

    stemmer = nltk.stem.porter.PorterStemmer()
    email = preProcess(raw_email)

    # Split the e-mail into individual words (tokens) 
    tokens = re.split('[ \@\$\/\#\.\-\:\&\*\+\=\[\]\?\!\(\)\{\}\,\'\"\>\_\<\;\%]',
                      email)

    # Loop over each token and use a stemmer to shorten it
    tokenlist = []
    for token in tokens:

        token = re.sub('[^a-zA-Z0-9]', '', token)
        stemmed = stemmer.stem(token)
        #Throw out empty tokens
        if not len(token):
            continue
        # Store a list of all unique stemmed words
        tokenlist.append(stemmed)

    return tokenlist

### Cargando datos

In [4]:
vocabulario = getVocabDict()
#vocabulario

In [5]:
def marcandoPalabras(filename, valor):
    
    X = np.zeros([len(filename),len(vocabulario)])
    y = np.full(len(filename), valor)
    
    # Creamos una matriz donde cada fila equivale a un correo
    for j,file in enumerate(filename):
        # Leemos el correo
        email_contents = codecs.open(file,'r', encoding='utf8', errors='ignore').read()
        # Creamos una lista de palabras
        email_contents = email2TokenList(email_contents)
                
        # Marco con 1 si la palabra aparece en el correo
        for i,voc in enumerate(vocabulario.keys()):
            if voc in email_contents:
                X[j,i]=1       
    
    return X,y

In [6]:
#Crea una lista con todos los nombres de los ficheros dentro de una ruta
filesList = np.array(glob.glob('p6/spam/*.txt'))
X1, y1 = marcandoPalabras(filesList, 1) # SPAM
X1.shape

(500, 1899)

In [7]:
filesList = np.array(glob.glob('p6/hard_ham/*.txt'))
X2, y2 = marcandoPalabras(filesList, 0) # NO SPAM
X2.shape

(250, 1899)

In [8]:
filesList = np.array(glob.glob('p6/easy_ham/*.txt'))
X3, y3 = marcandoPalabras(filesList, 0) # NO SPAM
X3.shape

(2551, 1899)

### Cargando datos

In [9]:
SuperX = np.concatenate([X1,X2,X3])
SuperX.shape

(3301, 1899)

In [10]:
SuperY = np.concatenate([y1,y2,y3])
SuperY.shape

(3301,)

In [90]:
# Partimos el conjunto de datos en entrenamiento (70%) y test (30%)
X_train, X_test, y_train, y_test = train_test_split(
    SuperX, SuperY, test_size=0.3, random_state=42)

In [91]:
print ('Training Linear SVM (Spam Classification)...')
C_vec = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30]

for i, C_i in enumerate(C_vec):
    svm1 = SVC(kernel='linear', C=C_i)
    svm1.fit(X_train, y_train.ravel())
    p = accuracy_score(y_test, svm1.predict(X_test))
    print('C=',C_i)
    print ('Precisión de entrenamiento:', p)


Training Linear SVM (Spam Classification)...
C= 0.01
Precisión de entrenamiento: 0.9798183652875883
C= 0.03
Precisión de entrenamiento: 0.9798183652875883
C= 0.1
Precisión de entrenamiento: 0.9798183652875883
C= 0.3
Precisión de entrenamiento: 0.9757820383451059
C= 1
Precisión de entrenamiento: 0.9788092835519677
C= 3
Precisión de entrenamiento: 0.9717457114026236
C= 10
Precisión de entrenamiento: 0.9677093844601413
C= 30
Precisión de entrenamiento: 0.9677093844601413


In [92]:
C_vec = [0.3, 1, 3, 10, 30]
sigma_vec = [3, 10, 30]

for i, C_i in enumerate(C_vec):
    for j, sigma_j in enumerate (sigma_vec):
        svm3 =  SVC(kernel='rbf', C=C_i, gamma=1/(2*sigma_j**2))
        svm3.fit(X_train, y_train.ravel())
        print('C=',C_i,' sigma=',sigma_j)
        print(accuracy_score(y_test, svm3.predict(X_test)))


C= 0.3  sigma= 3
0.8506559031281534
C= 0.3  sigma= 10
0.9182643794147326
C= 0.3  sigma= 30
0.8244197780020182
C= 1  sigma= 3
0.9112008072653885
C= 1  sigma= 10
0.9757820383451059
C= 1  sigma= 30
0.8859737638748738
C= 3  sigma= 3
0.9212916246215943
C= 3  sigma= 10
0.9798183652875883
C= 3  sigma= 30
0.9475277497477296
C= 10  sigma= 3
0.9202825428859738
C= 10  sigma= 10
0.9798183652875883
C= 10  sigma= 30
0.9808274470232089
C= 30  sigma= 3
0.9192734611503531
C= 30  sigma= 10
0.9778002018163471
C= 30  sigma= 30
0.9798183652875883
