# Laboratorul 4 

## Modelul bag-of-words

## Normalizarea datelor

In [1]:
from sklearn import preprocessing
import numpy as np
from sklearn import svm
from sklearn.metrics import accuracy_score, f1_score

In [2]:
# Toy feature matrices: one sample per row, one feature per column.
x_train = np.array(
    [[1, -1, 2],
     [2, 0, 0],
     [0, 1, -1]],
    dtype=np.float64,
)
x_test = np.array([[-1, 1, 0]], dtype=np.float64)

# Compute the statistics on the training data only (fit returns the scaler).
scaler = preprocessing.StandardScaler().fit(x_train)

# Show the per-feature mean ...
print(scaler.mean_)  # => [1. 0. 0.33333333]
# ... and the per-feature standard deviation.
print(scaler.scale_)

# Scale the training data with the fitted statistics.
scaled_x_train = scaler.transform(x_train)
print(scaled_x_train)

# Scale the test data with the same (training) statistics.
scaled_x_test = scaler.transform(x_test)
print(scaled_x_test)

[1.         0.         0.33333333]
[0.81649658 0.81649658 1.24721913]
[[ 0.         -1.22474487  1.33630621]
 [ 1.22474487  0.         -0.26726124]
 [-1.22474487  1.22474487 -1.06904497]]
[[-2.44948974  1.22474487 -0.26726124]]


In [3]:
# Load the training split (sentences and their labels).
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only use it on
# trusted files.
train_data, train_labels = (
    np.load("data/training_sentences.npy", allow_pickle=True),
    np.load("data/training_labels.npy", allow_pickle=True),
)

# Load the test split the same way.
test_data, test_labels = (
    np.load("data/test_sentences.npy", allow_pickle=True),
    np.load("data/test_labels.npy", allow_pickle=True),
)

## Ex2 

In [39]:
def normalize_data(train_data, test_data, type=None):
    """Normalize the train and test feature matrices with the chosen scheme.

    Args:
        train_data: 2-D array of training features (one sample per row).
        test_data: 2-D array of test features with the same number of columns.
        type: One of ``None``, ``"standard"``, ``"l1"`` or ``"l2"``.
            ``None`` returns both inputs untouched.
            ``"standard"`` subtracts the per-column mean and divides by the
            per-column standard deviation, both computed on ``train_data``.
            ``"l1"`` / ``"l2"`` divide every row by its own L1 / L2 norm.

    Returns:
        A ``(scaled_train_data, scaled_test_data)`` tuple.

    Raises:
        ValueError: If ``type`` is not one of the supported values.
    """
    # Small constant added to every denominator to avoid division by zero
    # (constant columns, all-zero rows).
    eps = 10 ** -8

    # None means "no normalization"; previously this fell through to the
    # error branch even though it is the documented default.
    if type is None:
        return train_data, test_data

    if type == "standard":
        # Statistics come from the training set only and are reused for test.
        mean = np.mean(train_data, axis=0)
        std = np.std(train_data, axis=0) + eps
        return (train_data - mean) / std, (test_data - mean) / std

    if type == "l1":
        train_norm = np.sum(np.abs(train_data), axis=1, keepdims=True) + eps
        test_norm = np.sum(np.abs(test_data), axis=1, keepdims=True) + eps
        return train_data / train_norm, test_data / test_norm

    if type == "l2":
        train_norm = np.sqrt(np.sum(train_data ** 2, axis=1, keepdims=True)) + eps
        test_norm = np.sqrt(np.sum(test_data ** 2, axis=1, keepdims=True)) + eps
        return train_data / train_norm, test_data / test_norm

    raise ValueError("Invalid type. Must be None, standard, l1 or l2.")

## Ex3

In [17]:
class BagOfWords:
    """Vocabulary for a bag-of-words model.

    Attributes are plain containers kept in sync with each other:
    ``vocabulary`` maps each word to an integer id and ``word_list`` holds
    the words so that ``word_list[id]`` is the word with that id.
    """

    def __init__(self):
        # word -> id, assigned in order of first appearance
        self.vocabulary = {}
        # id -> word (inverse of ``vocabulary``)
        self.word_list = []

    def build_vocabulary(self, data):
        """Add every unseen word from ``data`` to the vocabulary.

        Args:
            data: Iterable of messages, each message an iterable of words.

        Calling this more than once extends the existing vocabulary; ids
        already assigned are never reused or changed.
        """
        for message in data:
            for word in message:
                if word not in self.vocabulary:
                    # The next free id is the current vocabulary size. A
                    # counter restarted at 0 on each call would hand out
                    # duplicate ids on a second call.
                    self.vocabulary[word] = len(self.word_list)
                    self.word_list.append(word)
                    

# Build the vocabulary from the training sentences and report its size.
bag_of_words = BagOfWords()
bag_of_words.build_vocabulary(train_data)
vocabulary_size = len(bag_of_words.vocabulary)
print(vocabulary_size)

9522


## Ex4 

In [18]:
class BagOfWords:
    """Bag-of-words model: builds a vocabulary and counts word occurrences.

    ``vocabulary`` maps each word to an integer id and ``word_list`` holds
    the words so that ``word_list[id]`` is the word with that id.
    """

    def __init__(self):
        # word -> id, assigned in order of first appearance
        self.vocabulary = {}
        # id -> word (inverse of ``vocabulary``)
        self.word_list = []

    def build_vocabulary(self, data):
        """Add every unseen word from ``data`` to the vocabulary.

        Args:
            data: Iterable of messages, each message an iterable of words.

        Calling this more than once extends the existing vocabulary; ids
        already assigned are never reused or changed.
        """
        for message in data:
            for word in message:
                if word not in self.vocabulary:
                    # The next free id is the current vocabulary size. A
                    # counter restarted at 0 on each call would hand out
                    # duplicate ids on a second call.
                    self.vocabulary[word] = len(self.word_list)
                    self.word_list.append(word)

    def get_features(self, data):
        """Return the word-count matrix for ``data``.

        Args:
            data: Sized iterable of messages, each an iterable of words.

        Returns:
            A float array of shape ``(len(data), len(self.vocabulary))``
            where entry ``[i, j]`` is how many times the word with id ``j``
            occurs in message ``i``. Words missing from the vocabulary are
            ignored.
        """
        features = np.zeros((len(data), len(self.vocabulary)))

        for row, message in enumerate(data):
            for word in message:
                word_id = self.vocabulary.get(word)
                # Skip words that were not seen when building the vocabulary.
                if word_id is not None:
                    features[row, word_id] += 1

        return features
                                       

## Ex5 

In [40]:
class BagOfWords:
    """Bag-of-words model: builds a vocabulary and counts word occurrences.

    ``vocabulary`` maps each word to an integer id and ``word_list`` holds
    the words so that ``word_list[id]`` is the word with that id.
    """

    def __init__(self):
        # word -> id, assigned in order of first appearance
        self.vocabulary = {}
        # id -> word (inverse of ``vocabulary``)
        self.word_list = []

    def build_vocabulary(self, data):
        """Add every unseen word from ``data`` to the vocabulary.

        Args:
            data: Iterable of messages, each message an iterable of words.

        Calling this more than once extends the existing vocabulary; ids
        already assigned are never reused or changed.
        """
        for message in data:
            for word in message:
                if word not in self.vocabulary:
                    # The next free id is the current vocabulary size. A
                    # counter restarted at 0 on each call would hand out
                    # duplicate ids on a second call.
                    self.vocabulary[word] = len(self.word_list)
                    self.word_list.append(word)

    def get_features(self, data):
        """Return the word-count matrix for ``data``.

        Args:
            data: Sized iterable of messages, each an iterable of words.

        Returns:
            A float array of shape ``(len(data), len(self.vocabulary))``
            where entry ``[i, j]`` is how many times the word with id ``j``
            occurs in message ``i``. Words missing from the vocabulary are
            ignored.
        """
        features = np.zeros((len(data), len(self.vocabulary)))

        for row, message in enumerate(data):
            for word in message:
                word_id = self.vocabulary.get(word)
                # Skip words that were not seen when building the vocabulary.
                if word_id is not None:
                    features[row, word_id] += 1

        return features
        
                                       
# Build the vocabulary from the training sentences only.
bag_of_words = BagOfWords()
bag_of_words.build_vocabulary(train_data)

# Turn both splits into word-count matrices over that vocabulary.
train_features, test_features = (
    bag_of_words.get_features(train_data),
    bag_of_words.get_features(test_data),
)

# L2-normalize each row and show the normalized training features.
train_features_norm, test_features_norm = normalize_data(
    train_features, test_features, type='l2'
)
print(train_features_norm)

[[0.35355339 0.35355339 0.35355339 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.19611613 0.19611613 0.        ]
 [0.         0.         0.         ... 0.         0.         0.33333333]
 [0.         0.         0.         ... 0.         0.         0.        ]]


## Ex6

In [20]:
# Train a linear-kernel SVM on the normalized features and predict the test set.
svm_model = svm.SVC(C=1, kernel='linear')
svm_model.fit(train_features_norm, train_labels)
test_predictions = svm_model.predict(test_features_norm)

# Report accuracy, then F1, on the test predictions.
accuracy = accuracy_score(test_labels, test_predictions)
print(accuracy)
f1 = f1_score(test_labels, test_predictions)
print(f1)

# Pair every vocabulary word with its coefficient and order the pairs
# ascending (by coefficient first, then by word on ties).
coefficients = svm_model.coef_[0]
weighted_words = sorted(zip(coefficients, bag_of_words.word_list))
sorted_words = [pair[1] for pair in weighted_words]

# Ten words with the lowest coefficients, then ten with the highest.
negative_words, positive_words = sorted_words[:10], sorted_words[-10:]
print(negative_words)
print(positive_words)

0.9842391304347826
0.9409368635437881
['&lt#&gt', 'me', 'i', 'Going', 'him', 'Ok', 'I', 'Ill', 'my', 'Im']
['Text', 'To', 'mobile', 'CALL', 'FREE', 'txt', '&', 'Call', 'Txt', 'STOP']
