# Importuri necesare

In [1]:
import re
import os
import numpy as np
from sklearn import svm

# Descărcarea datelor

In [2]:
! rm -rf spam*
! wget https://github.com/artificial-intelligence-ml-cti/ml_cti/raw/main/laborator5/spam_dataset.zip
! unzip "spam_dataset.zip"

! echo "***\n Fisierele sunt: "
! ls spam_dataset/
! echo "****\n Calea catre directorul cu date este: "
! readlink -f spam_dataset/

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: spam_dataset/data/0178.txt  
  inflating: spam_dataset/data/0179.txt  
  inflating: spam_dataset/data/0180.txt  
  inflating: spam_dataset/data/0181.txt  
  inflating: spam_dataset/data/0182.txt  
  inflating: spam_dataset/data/0183.txt  
  inflating: spam_dataset/data/0184.txt  
  inflating: spam_dataset/data/0185.txt  
  inflating: spam_dataset/data/0186.txt  
  inflating: spam_dataset/data/0187.txt  
  inflating: spam_dataset/data/0188.txt  
  inflating: spam_dataset/data/0189.txt  
  inflating: spam_dataset/data/0190.txt  
  inflating: spam_dataset/data/0191.txt  
  inflating: spam_dataset/data/0192.txt  
  inflating: spam_dataset/data/0193.txt  
  inflating: spam_dataset/data/0194.txt  
  inflating: spam_dataset/data/0195.txt  
  inflating: spam_dataset/data/0196.txt  
  inflating: spam_dataset/data/0197.txt  
  inflating: spam_dataset/data/0198.txt  
  inflating: spam_dataset/data/0199.txt  
  inflating

In [3]:
data_path = "spam_dataset"

# Citirea datelor

In [4]:
def files_in_folder(path):
    """Functie care returneaza toate 
    fisierele dintr-un director.
    """
    lista_fisiere = []
    for fisier in os.listdir(path):
        cale_absoluta = os.path.join(path, fisier)
        if os.path.isfile(cale_absoluta):
            lista_fisiere.append(cale_absoluta)
    # sortam lista ca sa ne asiguram ca ordinea 
    # in care apar nu este una arbitrara
    lista_fisiere.sort()
    return lista_fisiere


In [5]:
# citim etichetele
labels = np.loadtxt(os.path.join(data_path, 'labels.txt'), 'int8')
print(labels)

[0 0 0 ... 1 1 0]


In [6]:
# citim textele
texte = []
folder_path = os.path.join(data_path, 'data')
for fisier in files_in_folder(folder_path):
    with open(fisier, 'r', encoding='utf-8') as fin:
        text = fin.read()
    texte.append(text)

print(texte[1])

Subject: vastar resources , inc .
gary , production from the high island larger block a - 1 # 2 commenced on
saturday at 2 : 00 p . m . at about 6 , 500 gross . carlos expects between 9 , 500 and
10 , 000 gross for tomorrow . vastar owns 68 % of the gross production .
george x 3 - 6992
- - - - - - - - - - - - - - - - - - - - - - forwarded by george weissman / hou / ect on 12 / 13 / 99 10 : 16
am - - - - - - - - - - - - - - - - - - - - - - - - - - -
daren j farmer
12 / 10 / 99 10 : 38 am
to : carlos j rodriguez / hou / ect @ ect
cc : george weissman / hou / ect @ ect , melissa graves / hou / ect @ ect
subject : vastar resources , inc .
carlos ,
please call linda and get everything set up .
i ' m going to estimate 4 , 500 coming up tomorrow , with a 2 , 000 increase each
following day based on my conversations with bill fischer at bmar .
d .
- - - - - - - - - - - - - - - - - - - - - - forwarded by daren j farmer / hou / ect on 12 / 10 / 99 10 : 34
am - - - - - - - - - - - - - - - - - - -

# Pre-procesarea datelor

- extragem informațiile necesare din text
- eliminăm semnele de punctuație
- facem tokenizare (împărțire în cuvinte)


In [7]:
def proceseaza(text):
    """Functie simpla de procesare a textului.
    Sugestii:
    - prima linie dintr-un text contine "Subject:"
    - cum puteti folosi subject?
    - cum procesati \n new lines? (vezi functia strip())
    - cum procesati empty token ''
    - puteti introduce un tokenizator din nltk
    - puteti elimina stop-words
    """
    text = re.sub("[-.,;:!?\"\'\/()_*=`]", "", text)
    text_in_cuvinte = text.split(' ')
    return text_in_cuvinte

# cuvintele rezultate din functia de preprocesare:
proceseaza(texte[4])

['Subject',
 'meter',
 '7268',
 'nov',
 'allocation\nfyi',
 '\n',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'forwarded',
 'by',
 'lauri',
 'a',
 'allen',
 '',
 'hou',
 '',
 'ect',
 'on',
 '12',
 '',
 '14',
 '',
 '99',
 '12',
 '',
 '17\npm',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '\nkimberly',
 'vaughn\n12',
 '',
 '10',
 '',
 '99',
 '02',
 '',
 '54',
 'pm\nto',
 '',
 'lauri',
 'a',
 'allen',
 '',
 'hou',
 '',
 'ect',
 '@',
 'ect\ncc',
 '',
 'mary',
 'm',
 'smith',
 '',
 'hou',
 '',
 'ect',
 '@',
 'ect\nsubject',
 '',
 'meter',
 '7268',
 'nov',
 'allocation\nlauri',
 '',
 '',
 'i',
 'have',
 'put',
 'this',
 'on',
 'strangas',
 'gas',
 'until',
 'i',
 'can',
 'get',
 'a',
 'contract',
 'from\ndaren',
 '\n',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'forwarded',
 'by',
 'k

### Aplicam functia de preprocesarea intregului set de date

In [8]:
data = []
for text in texte:
    data.append(proceseaza(text))

print(data[0])

['Subject', 'christmas', 'tree', 'farm', 'pictures\n']


# Împărțirea datelor în train, validare și test

In [9]:
print(len(data))

# putem imparti datele de antrenare astfel:
# 20% date de test din total 
# 15% date de validare din ce ramane dupa ce scoatem datele de test 

nr_test = int(20/100 * len(data))
print("Nr de date de test: ", nr_test)

nr_ramase = len(data) - nr_test
nr_valid = int(15/100 * nr_ramase)
print("Nr de date de validare: ", nr_valid)

nr_train = nr_ramase - nr_valid
print("Nr de date de antrenare: ", nr_train)



5172
Nr de date de test:  1034
Nr de date de validare:  620
Nr de date de antrenare:  3518


In [10]:
# facem impartirea in ordinea in care apar datele

train_data = data[:nr_train]
train_labels = labels[:nr_train]

valid_data = data[nr_train : nr_train + nr_valid]
valid_labels = labels[nr_train : nr_train + nr_valid]

test_data = data[nr_train + nr_valid: ]
test_labels = labels[nr_train + nr_valid:]


print(len(train_labels))
print(len(valid_labels))
print(len(test_labels))

3518
620
1034


# Bag of Words

În cadrul acestei secțiuni vom face numărarea aparițiilor tuturor cuvintelor din datele noastre. Pentru o evaluare justă nu ar fi indicat să includem si cuvintele din datele de test.

In [11]:
from collections import Counter

# vom folosi o structura de date nativa din python
# care functioneaza ca un dictionar care numara elementele hashable
# dintr-o colectie
ctr = Counter(['eggs', 'ham', 'eggs', 'egg'])
print(ctr['bacon'])
print(ctr['eggs'])

ctr = Counter([(0,1), (0,0), (0,1)])
print(ctr[(0,0)])
print(ctr[(2,2)])

# intrebare: lista este hashable?

0
2
1
0


In [12]:
ctr = Counter(data[5])
print(ctr.keys())
print(ctr.values())
# care este cel mai frecvent cuvant in data[5]?

dict_keys(['Subject', 'dobmeos', 'with', 'hgh', 'my', 'energy', 'level', 'has', 'gone', 'up', '', 'stukm\nintroducing\ndoctor', 'formulated\nhgh\nhuman', 'growth', 'hormone', 'also', 'called', 'hgh\nis', 'referred', 'to', 'in', 'medical', 'science', 'as', 'the', 'master', 'it', 'is', 'very', 'plentiful\nwhen', 'we', 'are', 'young', 'but', 'near', 'age', 'of', 'twenty', 'one', 'our', 'bodies', 'begin', 'produce\nless', 'by', 'time', 'forty', 'nearly', 'everyone', 'deficient', '\nand', 'at', 'eighty', 'production', 'normally', 'diminished', 'least', '90', '95', '%', '\nadvantages', '\n', 'increased', 'muscle', 'strength\n', 'loss', 'body', 'fat\n', 'bone', 'density\n', 'lower', 'blood', 'pressure\n', 'quickens', 'wound', 'healing\n', 'reduces', 'cellulite\n', 'improved', 'vision\n', 'wrinkle', 'disappearance\n', 'skin', 'thickness', 'texture\n', 'levels\n', 'sleep', 'and', 'emotional', 'stability\n', 'memory', 'mental', 'alertness\n', 'sexual', 'potency\n', 'resistance', 'common', 'illne

### Frecventa cuvintelor din setul de antrenare

In [13]:
counter = Counter()
for text_preprocesat in train_data:
    counter.update(text_preprocesat)

print(counter.most_common(10))


[('', 174228), ('the', 13644), ('ect', 10192), ('to', 10077), ('@', 9047), ('and', 6769), ('hou', 6277), ('for', 6240), ('of', 5558), ('a', 5530)]


### Reprezentarea datelor sub forma vectoriala

- sa presupunem ca folosim primele N cuvinte non-nule ca caracteristici pentru fiecare text


In [14]:
N = 10
cuvinte_caracteristice = []
for cuvant, frecventa in counter.most_common(N):
    if cuvant.strip():
        cuvinte_caracteristice.append(cuvant)
print(cuvinte_caracteristice)

['the', 'ect', 'to', '@', 'and', 'hou', 'for', 'of', 'a']


- fiecarui cuvant îi atribuim un id în funcție de poziția pe care se află
- ordinea in care sunt stocate cheile intr-un dictionar este arbitrara 
- iar o lista este un obiect mutabil in care ordinea elementelor se poate schimba oricand
- cel mai sigur este sa construim o mapare intre cuvinte si un id care sa reprezinte pozitia in vectorul de caracteristici


In [15]:
word2id = {}
id2word = {}
for idx, cuv in enumerate(cuvinte_caracteristice):
    word2id[cuv] = idx
    id2word[idx] = cuv

print(word2id)
print(id2word)

{'the': 0, 'ect': 1, 'to': 2, '@': 3, 'and': 4, 'hou': 5, 'for': 6, 'of': 7, 'a': 8}
{0: 'the', 1: 'ect', 2: 'to', 3: '@', 4: 'and', 5: 'hou', 6: 'for', 7: 'of', 8: 'a'}


- cand trebuie sa reprezentam un text sub forma vectoriala, ne raportam doar la cuvintele cheie pe care le folosim ca caracteristici
- id-ul reprezinta pozitia in vector unde vom stoca aparitiile fiecarui cuvant

In [16]:
# 1. numaram toate cuvintele din text
ctr = Counter(train_data[1])

# 2. prealocam un array care va reprezenta caracteristicel noastre
features = np.zeros(len(cuvinte_caracteristice))

# 3. umplem array-ul cu valorile obtinute din counter
# fiecare pozitie din array trebuie sa reprezinte frecventa
# aceluiasi cuvant in toate textele
for idx in range(0, len(features)):
    # obtinem cuvantul pentru pozitia idx
    cuvant = id2word[idx]
    # asignam valoarea corespunzatoare frecventei cuvantului
    features[idx] = ctr[cuvant]

print(features)
print([id2word[idx] for idx in range(0, len(features))])

[ 5. 12.  4.  7.  3. 10.  3.  2.  6.]
['the', 'ect', 'to', '@', 'and', 'hou', 'for', 'of', 'a']


### Punem totul cap la cap sub forma de functii

In [17]:
def count_most_common(how_many, texte_preprocesate):
    """Functie care returneaza cele mai frecvente cuvinte.
    """
    counter = Counter()
    for text_preprocesat in texte_preprocesate:
        counter.update(text_preprocesat)
    cuvinte_caracteristice = []
    for cuv, _ in counter.most_common(how_many):
        if cuv.strip():
            cuvinte_caracteristice.append(cuv)
    return cuvinte_caracteristice

In [18]:
def build_id_word_dicts(cuvinte_caracteristice):
    '''Dictionarele word2id si id2word garanteaza o ordine
    pentru cuvintele caracteristice.
    '''
    word2id = {}
    id2word = {}
    for idx, cuv in enumerate(cuvinte_caracteristice):
        word2id[cuv] = idx
        id2word[idx] = cuv
    return word2id, id2word

In [19]:
def featurize(text_preprocesat, id2word):
    """Pentru un text preprocesat dat si un dictionar
    care mapeaza pentru fiecare pozitie ce cuvant corespunde,
    returneaza un vector care reprezinta
    frecventele fiecarui cuvant.
    """
    ctr = Counter(text_preprocesat)
    features = np.zeros(len(id2word))
    for idx in range(0, len(features)):
        cuvant = id2word[idx]
        features[idx] = ctr[cuvant]
    return features

In [20]:
def featurize_multi(texte, id2word):
    '''Pentru un set de texte preprocesate si un dictionar
    care mapeaza pentru fiecare pozitie ce cuvant corespunde,
    returneaza matricea trasaturilor tuturor textelor.
    '''
    all_features = []
    for text in texte:
        all_features.append(featurize(text, id2word))
    return np.array(all_features)

In [21]:
cuvinte_caracteristice = count_most_common(1000, train_data)
print(len(cuvinte_caracteristice))
word2id, id2word = build_id_word_dicts(cuvinte_caracteristice)

X_train = featurize_multi(train_data, id2word)
X_valid = featurize_multi(valid_data, id2word)
X_test = featurize_multi(test_data, id2word)

print(X_train.shape)
print(X_valid.shape)
print(X_test.shape)


998
(3518, 998)
(620, 998)
(1034, 998)


In [22]:
from sklearn.metrics import accuracy_score

model = svm.LinearSVC(C=2)
# observati convergence warning
model.fit(X_train, train_labels)
vpreds = model.predict(X_valid)
tpreds = model.predict(X_test)

print(accuracy_score(valid_labels, vpreds))
print(accuracy_score(test_labels, tpreds))

0.9403225806451613
0.9158607350096711




In [23]:
# cu normalizare creste si accuratetea
# nu mai avem convergence warning
X_train_nrm = X_train / np.sqrt(np.sum(X_train**2, axis=1)).reshape(-1, 1)
X_valid_nrm = X_valid / np.sqrt(np.sum(X_valid**2, axis=1)).reshape(-1, 1)
X_test_nrm = X_test / np.sqrt(np.sum(X_test**2, axis=1)).reshape(-1, 1)

model.fit(X_train_nrm, train_labels)
vpreds = model.predict(X_valid_nrm)
tpreds = model.predict(X_test_nrm)

print(accuracy_score(valid_labels, vpreds))
print(accuracy_score(test_labels, tpreds))

0.9596774193548387
0.9342359767891683


In [24]:
import nltk
nltk.download("stopwords")

from nltk.corpus import stopwords
print(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'bo