In [37]:
import numpy as np
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
train_data = np.load('data_lab5/data/training_sentences.npy', allow_pickle=True)
train_labels = np.load('data_lab5/data/training_labels.npy', allow_pickle=True)
test_data = np.load('data_lab5/data/test_sentences.npy', allow_pickle=True)
test_labels = np.load('data_lab5/data/test_labels.npy', allow_pickle=True)

In [31]:
# 1
def normalize_data(train_data, test_data, type=None):
    if type is None:
        return train_data, test_data
    
    elif type == 'standard':
        mean_train_data = np.mean(train_data, axis=0)
        std_train_data = np.std(train_data, axis=0)
        normalized_train_data = np.divide(np.subtract(train_data, mean_train_data), std_train_data)
        normalized_test_data = np.divide(np.subtract(test_data, mean_train_data), std_train_data)
        return normalized_train_data, normalized_test_data
    
    elif type == 'l1':
        train_norm = np.sum(np.abs(train_data), axis=0)
        normalized_train_data = np.divide(train_data, train_norm)
        test_norm = np.sum(np.abs(test_data), axis=0)
        normalized_test_data = np.divide(test_data, test_norm)
        return normalized_train_data, normalized_test_data
    
    elif type == 'l2':
        train_norm = np.sqrt(np.sum(np.square(train_data), axis=0))
        train_norm = np.add(train_norm, 1e-10)
        normalized_train_data = np.divide(train_data, train_norm)
        test_norm = np.sqrt(np.sum(np.square(test_data), axis=0))
        test_norm = np.add(test_norm, 1e-10)
        normalized_test_data = np.divide(test_data, test_norm)
        return normalized_train_data, normalized_test_data

In [32]:
# 2
class BagOfWords:
    def __init__(self):
        self.vocab = {}
    
    def build_vocabulary(self, data):
        idx = 0
        words = []
        for message in data:
            for word in message:
                if word not in self.vocab:
                    self.vocab[word] = idx
                    words.append(word)
                    idx += 1
                    
    def get_features(self, data):
        features = np.zeros((len(data), len(self.vocab)))
        for i, message in enumerate(data):
            for word in message:
                if word in self.vocab:
                    features[i, self.vocab[word]] += 1
        return features

In [33]:
bag = BagOfWords()
bag.build_vocabulary(train_data)
train_features = bag.get_features(train_data)
test_features = bag.get_features(test_data)
normalized_train_features, normalized_test_features = normalize_data(train_features, test_features, 'l2')
print(len(bag.vocab))

9522


In [34]:
# 6
svm_model = svm.SVC(kernel='linear', C=1)
svm_model.fit(normalized_train_features, train_labels)
test_predictions = svm_model.predict(normalized_test_features)
accuracy = accuracy_score(test_labels, test_predictions)
f1 = f1_score(test_labels, test_predictions)
print('Accuracy:', accuracy)
print('F1:', f1)

Accuracy: 0.9815217391304348
F1: 0.9285714285714286


In [35]:
# 7
coefficients = svm_model.coef_
sorted_indices = np.argsort(coefficients)
print(sorted_indices)
    
print('Most negative words:')
for i in range(10):
    index = sorted_indices[0, -i-1]
    print(list(bag.vocab.keys())[index])
    
print('Most positive words:')
for i in range(10):
    index = sorted_indices[0, i]
    print(list(bag.vocab.keys())[index])

[[  90  334  420 ... 8595  114 1355]]
Most negative words:
voicemail
mobile
08719181513
urgent
won
Call
08714712388
84484
ringtoneking
08718738034
Most positive words:
me
Im
work
class
sir
taken
I
him
appreciate
pick


In [41]:
# 8
tfidf_vectorizer = TfidfVectorizer()
train_data = [' '.join(message) for message in train_data]
tfidf_train_features = tfidf_vectorizer.fit_transform(train_data)

test_data = [' '.join(message) for message in test_data]
tfidf_test_features = tfidf_vectorizer.transform(test_data)
normalized_tfidf_train_features, normalized_tfidf_test_features = normalize_data(tfidf_train_features, tfidf_test_features, 'l2')

svm_model.fit(normalized_tfidf_train_features, train_labels)
test_predictions = svm_model.predict(normalized_tfidf_test_features)
accuracy = accuracy_score(test_labels, test_predictions)
f1 = f1_score(test_labels, test_predictions)

print('Accuracy:', accuracy)
print('F1:', f1)

ValueError: empty vocabulary; perhaps the documents only contain stop words