In [45]:
# import math
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.utils import shuffle
import time
import re
import random
import matplotlib.pyplot as plt
pd.set_option('max_colwidth', 300)
%run ssk.ipynb

In [46]:
# Load the SMS spam table; the file is decoded as latin-1.
spam = pd.read_csv('spam.csv', encoding = 'latin-1')

# Build the working frame with exactly two columns, in this order:
#   is_legit - integer class code: 'ham' -> 1, 'spam' -> -1
#   text     - columns 1-4 of the raw table joined with spaces (missing cells
#              skipped), lower-cased, with every run of characters outside
#              a-z collapsed to a single space
# An explicit mapping is used instead of chained replace() calls so the label
# column gets a real integer dtype rather than relying on implicit downcasting
# of an object column.
spam_data = pd.DataFrame({
    'is_legit': spam['v1'].map({'ham': 1, 'spam': -1}),
    'text': spam.iloc[:, [1,2,3,4]]
                .apply(lambda row: ' '.join(row.dropna().astype(str)), axis=1)
                .map(lambda msg: re.sub(r'[^a-z]+', ' ', msg.lower()).strip()),
})

# Fail fast on any label other than 'ham'/'spam' instead of silently carrying
# an unmapped value into training.
if spam_data['is_legit'].isna().any():
    unexpected = sorted(spam.loc[spam_data['is_legit'].isna(), 'v1'].astype(str).unique())
    raise ValueError('Unexpected class labels in column v1: {}'.format(unexpected))
spam_data['is_legit'] = spam_data['is_legit'].astype('int64')

spam_data.head()

Unnamed: 0,is_legit,text
0,1,go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat
1,1,ok lar joking wif u oni
2,-1,free entry in a wkly comp to win fa cup final tkts st may text fa to to receive entry question std txt rate t c s apply over s
3,1,u dun say so early hor u c already then say
4,1,nah i don t think he goes to usf he lives around here though


In [48]:
# Seeds for the repeated train/test splits. They are pinned (five values in
# the range 1..2000) because drawing them with an unseeded random.randint
# produced different splits, and therefore different scores, on every kernel
# restart.
rand_list = [1384, 60, 1913, 797, 802]

# Fixed 200-message subsample of the shuffled data; column 1 is the cleaned
# text and column 0 the class code.
sd = shuffle(spam_data, random_state = 1384)[:200]
sms = np.array(sd.iloc[:,1])
label = np.array(sd.iloc[:,0])

In [49]:
def draw_aligment(n, decay, l, r, step, texts=None):
    """Plot how closely the approximated Gram matrix matches the exact one.

    For every count ``i`` in ``np.arange(l, r, step)`` the first ``i`` entries
    returned by ``top_nsub`` are passed to ``Gram_aprox`` and the result is
    compared with the ``GramMatrix`` output through ``aligment``. The scores
    are plotted against ``i`` and the elapsed computation time is printed.

    Parameters
    ----------
    n : int
        Subsequence length forwarded to ``top_nsub``, ``GramMatrix`` and
        ``Gram_aprox``.
    decay : float
        Decay value forwarded to ``GramMatrix`` and ``Gram_aprox``.
    l, r, step : int
        Start, exclusive stop and stride of the subsequence counts to evaluate.
    texts : sequence of str, optional
        Documents to build the kernels from. Defaults to the notebook-level
        ``sms`` so existing calls keep working; pass it explicitly to avoid
        depending on whichever cell last rebound ``sms``.
    """
    if texts is None:
        texts = sms

    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # wall-clock adjustments.
    started = time.perf_counter()
    top = top_nsub(texts, n)
    num = np.arange(l, r, step)
    ker = GramMatrix(n, decay, texts)
    alig = [aligment(ker, Gram_aprox(n, decay, top[:i], texts)) for i in num]
    elapsed = time.perf_counter() - started
    print('Time:', elapsed/60, 'minutes')

    plt.figure()
    plt.plot(num, alig)
    plt.xlabel('Liczba najczęściej występujących podciągów długości {}'.format(n))
    plt.ylabel('Podobieństwo między jądrami')
    plt.title('Podobieństwo jądra do jego aproksymacji w zależności od liczby podciągów')
    plt.show()

In [None]:
# One alignment plot per subsequence length; each tuple is
# (n, decay, l, r, step) in the order draw_aligment expects.
for run_args in [
    (2, 0.5, 10, 420, 20),
    (3, 0.5, 100, 500, 20),
    (4, 0.5, 100, 550, 30),
    (5, 0.5, 300, 600, 50),
]:
    draw_aligment(*run_args)

In [55]:
# From here on the experiments use the whole data set: column 1 is the cleaned
# text, column 0 the class code.
sms = np.array(spam_data.iloc[:,1])
label = np.array(spam_data.iloc[:,0])

# Effect of the subsequence length n (decay fixed at 0.5, 500 top subsequences).
for n in [2,3,4,5]:
    acc = []
    rec = []
    pre = []
    f1 = []
    for rs in rand_list:
        print('Random state:', rs)
        X_train, X_test, y_train, y_test = train_test_split(sms, label, test_size = 0.1, random_state = rs, stratify = label)
        # The top subsequences are taken from the training split only.
        top = top_nsub(X_train, n)[:500]
        sms_aprox_gram = Gram_aprox(n, 0.5, top, X_train)
        # gamma is not passed: it only parameterises the rbf/poly/sigmoid
        # kernels and has no effect on a precomputed Gram matrix.
        svc = SVC(kernel = 'precomputed', C = 10)
        svc.fit(sms_aprox_gram, y_train)
        print('Training is done!')
        sms_test_gram = Gram_aprox(n, 0.5, top, X_test, X_train)
        y_pred = svc.predict(sms_test_gram)
        acc.append(accuracy_score(y_test, y_pred))
        rec.append(recall_score(y_test, y_pred))
        pre.append(precision_score(y_test, y_pred))
        f1.append(f1_score(y_test, y_pred))

    # Summarise inside the loop so every n gets its own report; the first line
    # is the mean over the seeds, the second the standard deviation.
    print('n =', n)
    print('Accuracy score for test set: ', np.mean(acc), 'F1 score: ', np.mean(f1), 'precision:', np.mean(pre),
          'recall: ', np.mean(rec))
    print('Accuracy score for test set: ', np.std(acc), 'F1 score: ', np.std(f1), 'precision:', np.std(pre),
          'recall: ', np.std(rec))

In [53]:
# TF-IDF baseline: the same stratified 90/10 splits, with an SVC trained on
# TF-IDF features instead of a precomputed string kernel.

acc, f1, pre, rec = [], [], [], []

for rs in rand_list:
    X_train, X_test, y_train, y_test = train_test_split(sms, label, test_size=0.1, random_state=rs, stratify = label)

    # The vectorizer is fitted on the training messages only and then reused
    # to transform the held-out messages.
    tf = TfidfVectorizer()
    X_train = tf.fit_transform(X_train)
    X_test = tf.transform(X_test)

    svc = SVC(C=10, gamma=0.01).fit(X_train, y_train)
    y_pred = svc.predict(X_test)

    for scores, metric in ((acc, accuracy_score), (f1, f1_score), (pre, precision_score), (rec, recall_score)):
        scores.append(metric(y_test, y_pred))

# First line: mean over the seeds; second line: standard deviation.
for stat in (np.mean, np.std):
    print('Accuracy score for test set: ', stat(acc), 'F1 score: ', stat(f1), 'precision:', stat(pre),
          'recall: ', stat(rec))

Accuracy score for test set:  0.9756272401433691 F1 score:  0.9861076424750281 precision: 0.9741936547203425 recall:  0.998343685300207
Accuracy score for test set:  0.007041893442433164 F1 score:  0.003960888460207722 precision: 0.0072739406195639535 recall:  0.0015493405328256436


In [56]:
# Effect of the decay value (lambda), with the subsequence length fixed at 2
# and 500 top subsequences.
for decay in [0.01, 0.03, 0.05, 0.07, 0.1, 0.3, 0.5, 0.7, 0.9]:
    acc = []
    rec = []
    pre = []
    f1 = []
    for rs in rand_list:
        print('Random state:', rs)
        X_train, X_test, y_train, y_test = train_test_split(sms, label, test_size = 0.1, random_state = rs, stratify = label)
        # The top subsequences are taken from the training split only.
        top = top_nsub(X_train, 2)[:500]
        sms_aprox_gram = Gram_aprox(2, decay, top, X_train)
        # gamma is not passed: it only parameterises the rbf/poly/sigmoid
        # kernels and has no effect on a precomputed Gram matrix.
        svc = SVC(kernel = 'precomputed', C = 10)
        svc.fit(sms_aprox_gram, y_train)
        print('Training is done!')
        sms_test_gram = Gram_aprox(2, decay, top, X_test, X_train)
        y_pred = svc.predict(sms_test_gram)
        acc.append(accuracy_score(y_test, y_pred))
        rec.append(recall_score(y_test, y_pred))
        pre.append(precision_score(y_test, y_pred))
        f1.append(f1_score(y_test, y_pred))

    # Summarise inside the loop so every decay value gets its own report; the
    # first line is the mean over the seeds, the second the standard deviation.
    print('decay =', decay)
    print('Accuracy score for test set: ', np.mean(acc), 'F1 score: ', np.mean(f1), 'precision:', np.mean(pre),
          'recall: ', np.mean(rec))
    print('Accuracy score for test set: ', np.std(acc), 'F1 score: ', np.std(f1), 'precision:', np.std(pre),
          'recall: ', np.std(rec))