In [1]:
import numpy as np
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score

In [2]:
def _scale_rows_by_norm(data, order):
    """Divide every row of `data` by its vector norm of the given order."""
    # keepdims=True keeps the norms as a column of shape (n_samples, 1) so the
    # division broadcasts across the features of each row.
    row_norms = np.linalg.norm(data, ord=order, axis=1, keepdims=True)
    # The small epsilon avoids a division by zero for all-zero rows.
    return data / (row_norms + 1e-6)


def normalize_data(train_data, test_data, type=None):
    """Normalize the train and test feature matrices with the chosen scheme.

    The inputs are never modified; new arrays are returned whenever a
    normalization is applied.

    Parameters
    ----------
    train_data, test_data : 2-D numeric arrays, one row per example.
    type : {'standard', 'l1', 'l2'} or None
        'standard' subtracts the per-feature mean and divides by the
        per-feature standard deviation, both computed on `train_data` only.
        'l1' / 'l2' divide each row by its own L1 / L2 norm.
        Any other value (including None) returns the inputs unchanged.

    Returns
    -------
    (train_data, test_data) : the normalized matrices.
    """
    if type == 'standard':
        mean = np.mean(train_data, axis=0)  # per-feature mean of the training data
        std = np.std(train_data, axis=0)  # per-feature std of the training data
        # A constant feature has std == 0; dividing by 1 instead leaves it at
        # zero after centering rather than producing NaN/inf.
        std = np.where(std == 0, 1.0, std)

        # Build new arrays instead of using -= and /=, so the caller's data is
        # left untouched and integer inputs are accepted.
        train_data = (train_data - mean) / std
        test_data = (test_data - mean) / std
    elif type == 'l1':
        train_data = _scale_rows_by_norm(train_data, 1)
        test_data = _scale_rows_by_norm(test_data, 1)
    elif type == 'l2':
        train_data = _scale_rows_by_norm(train_data, 2)
        test_data = _scale_rows_by_norm(test_data, 2)

    return train_data, test_data

In [3]:
def normalize_data_v2(train_data, test_data, type=None):
    """Normalize train/test features using scikit-learn's preprocessing classes.

    Parameters
    ----------
    train_data, test_data : feature matrices accepted by scikit-learn.
    type : {'standard', 'l1', 'l2'} or None
        'standard' fits a StandardScaler on `train_data` and applies it to
        both matrices; 'l1' / 'l2' apply a Normalizer with that norm to both.
        Any other value (including None) returns the inputs unchanged.

    Returns
    -------
    (train_data, test_data) : the transformed matrices.
    """
    if type == 'standard':
        # Fit on the training data only, then reuse the fitted scaler for both.
        fitted_scaler = StandardScaler().fit(train_data)
        return fitted_scaler.transform(train_data), fitted_scaler.transform(test_data)

    if type in ('l2', 'l1'):
        # The accepted `type` values are exactly the Normalizer norm names.
        row_normalizer = Normalizer(norm=type)
        return row_normalizer.transform(train_data), row_normalizer.transform(test_data)

    return train_data, test_data

In [4]:
class BagOfWords:
    """Bag-of-words encoder: maps tokenized essays to word-count vectors."""

    def __init__(self):
        """Start with an empty vocabulary."""
        self.vocabulary = {}  # word -> column index
        self.vocabulary_length = 0  # number of distinct words seen so far
        self.words = []  # words in insertion order (position == column index)

    def build_vocabulary(self, data):
        """Add every unseen word from `data` to the vocabulary.

        `data` is an iterable of essays, each an iterable of words. Each new
        word gets the next free column index. The resulting vocabulary size is
        printed.
        """
        for essay in data:
            for word in essay:
                if word in self.vocabulary:
                    continue
                self.vocabulary[word] = len(self.vocabulary)
                self.words.append(word)

        self.vocabulary_length = len(self.vocabulary)
        print(self.vocabulary_length)

    def get_features(self, data):
        """Return the N x D count matrix for `data`.

        N is the number of essays and D the vocabulary length. Words that are
        not in the vocabulary are ignored.
        """
        features = np.zeros((len(data), self.vocabulary_length))

        for row, essay in enumerate(data):
            for word in essay:
                column = self.vocabulary.get(word)
                if column is not None:
                    features[row, column] += 1
        return features

In [5]:
# Read data
# Directory holding the .npy files; change this one constant to relocate the dataset.
DATA_DIR = 'data'

# NOTE(security): allow_pickle=True makes np.load unpickle Python objects, which
# can execute arbitrary code. Only load these files from a trusted source.
# Presumably the sentence files are object arrays of word lists, which is why
# pickling is needed - TODO confirm.
train_data = np.load(f'{DATA_DIR}/training_sentences.npy', allow_pickle=True)
train_labels = np.load(f'{DATA_DIR}/training_labels.npy')
test_data = np.load(f'{DATA_DIR}/test_sentences.npy', allow_pickle=True)
test_labels = np.load(f'{DATA_DIR}/test_labels.npy')

# Fail early if samples and labels are misaligned.
assert len(train_data) == len(train_labels), 'training sentences/labels length mismatch'
assert len(test_data) == len(test_labels), 'test sentences/labels length mismatch'

In [6]:
# Build the vocabulary from the training essays only.
bow = BagOfWords()
bow.build_vocabulary(train_data)

# Encode both splits with that same vocabulary (training split first).
bow_training, bow_testing = (
    bow.get_features(split) for split in (train_data, test_data)
)

9522


In [7]:
# L2-normalize every bag-of-words row before training the classifier.
normalized_bow_train, normalized_bow_test = normalize_data(
    train_data=bow_training,
    test_data=bow_testing,
    type='l2',
)

In [8]:
# Linear kernel: the learned coefficients are read per word further below.
svc = SVC(kernel='linear', C=1.0)

svc.fit(X=normalized_bow_train, y=train_labels)

In [9]:
# Predict on the held-out test features and report both metrics.
y_pred = svc.predict(X=normalized_bow_test)

print(f'Accuracy: {accuracy_score(test_labels, y_pred)}')
print(f'F1-score: {f1_score(test_labels, y_pred)}')

Accuracy: 0.9842391304347826
F1-score: 0.9409368635437881


Mesajele spam au label-ul 1, iar cele non-spam au label-ul 0 (pentru noi 0 inseamna de fapt -1, asa categorisim exemplele negative).

Functia de decizie a SVM-ului are forma $w \cdot x + b = w_1 x_1 + w_2 x_2 + \dots + w_n x_n + b$. Predictia este 1 (spam) daca $w \cdot x + b$ este mai mare decat 0 si -1 (non-spam) daca valoarea functiei este mai mica decat 0. Asta inseamna ca w-urile cu valori ridicate corespund cuvintelor importante pentru predictia 1 (spam), iar w-urile cu valori mici (chiar negative) corespund cuvintelor care favorizeaza predictia -1 (non-spam). Deci, pentru a gasi cuvintele cele mai negative, ne uitam la coeficientii w si luam cuvintele care corespund unor w-uri mari (adica favorizeaza spam). Invers, pentru cuvintele cele mai pozitive, ne uitam la coeficientii w si luam cuvintele care corespund unor w-uri mici (adica favorizeaza non-spam).

Coeficientii (w) ii accesam cu atributul `coef_` (primul rand, `coef_[0]`) si putem folosi functia argsort din numpy pentru a obtine indecsii in ordinea crescatoare a coeficientilor. Fiecare indice corespunde pozitiei unui cuvant in vocabular (in ordinea inserarii cuvintelor). Apoi cuvintele de la primii 10 si de la ultimii 10 indecsi din aceasta ordonare sunt cuvintele cerute de exercitiu.



#### Observatie:
Acest rationament este valabil pentru ca folosim kernel-ul 'linear' pentru SVM; daca am fi folosit 'rbf' sau alt kernel, rationamentul nu ar mai fi fost valid, deoarece functia de decizie nu ar mai fi avut aceeasi interpretabilitate.

In [10]:
# Vocabulary indices ordered by ascending linear-SVM coefficient.
index_sort = np.argsort(svc.coef_[0])
# Index a local array instead of overwriting bow.words: the attribute stays a
# list, so BagOfWords.build_vocabulary can still append to it afterwards.
vocabulary_words = np.array(bow.words)
# Lowest coefficients favour label 0 (non-spam), highest favour label 1 (spam).
print("Most positive words:", vocabulary_words[index_sort[:10]])
print("Most negative words:", vocabulary_words[index_sort[-10:]])

Most positive words: ['&lt#&gt' 'me' 'i' 'Going' 'him' 'Ok' 'I' 'Ill' 'my' 'Im']
Most negative words: ['Text' 'To' 'mobile' 'CALL' 'FREE' 'txt' '&' 'Call' 'Txt' 'STOP']
