In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_lg')

In [3]:
doc = nlp('dog cat banana jkhaskqwhddh')

In [7]:
for token in doc:
    print(token.text,'has vector: ',token.has_vector,' is out of voc: ',token.is_oov)

dog has vector:  True  is out of voc:  False
cat has vector:  True  is out of voc:  False
banana has vector:  True  is out of voc:  False
jkhaskqwhddh has vector:  False  is out of voc:  True


In [8]:
doc[0].vector

array([ 1.2330e+00,  4.2963e+00, -7.9738e+00, -1.0121e+01,  1.8207e+00,
        1.4098e+00, -4.5180e+00, -5.2261e+00, -2.9157e-01,  9.5234e-01,
        6.9880e+00,  5.0637e+00, -5.5726e-03,  3.3395e+00,  6.4596e+00,
       -6.3742e+00,  3.9045e-02, -3.9855e+00,  1.2085e+00, -1.3186e+00,
       -4.8886e+00,  3.7066e+00, -2.8281e+00, -3.5447e+00,  7.6888e-01,
        1.5016e+00, -4.3632e+00,  8.6480e+00, -5.9286e+00, -1.3055e+00,
        8.3870e-01,  9.0137e-01, -1.7843e+00, -1.0148e+00,  2.7300e+00,
       -6.9039e+00,  8.0413e-01,  7.4880e+00,  6.1078e+00, -4.2130e+00,
       -1.5384e-01, -5.4995e+00,  1.0896e+01,  3.9278e+00, -1.3601e-01,
        7.7732e-02,  3.2218e+00, -5.8777e+00,  6.1359e-01, -2.4287e+00,
        6.2820e+00,  1.3461e+01,  4.3236e+00,  2.4266e+00, -2.6512e+00,
        1.1577e+00,  5.0848e+00, -1.7058e+00,  3.3824e+00,  3.2850e+00,
        1.0969e+00, -8.3711e+00, -1.5554e+00,  2.0296e+00, -2.6796e+00,
       -6.9195e+00, -2.3386e+00, -1.9916e+00, -3.0450e+00,  2.48

In [9]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x24df070ccb0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x24df01336b0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x24df17865e0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x24df1a961d0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x24df1a81e90>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x24df1786420>)]

In [15]:
base_token = nlp('iphone')
base_token.vector.shape

(300,)

In [16]:
doc = nlp("apple samsung iphone dog kitten")

for token in doc:
    print(f"{token.text} <-> {base_token.text}: {base_token.similarity(token)}")

apple <-> iphone: 0.4387907401919904
samsung <-> iphone: 0.670859081425417
iphone <-> iphone: 1.0
dog <-> iphone: 0.08211864228011527
kitten <-> iphone: 0.10222317834969896


In [17]:
king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
women = nlp.vocab['women'].vector
queen = nlp.vocab['queen'].vector

In [18]:
result = king - man + women

In [27]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity([result],[queen])

array([[0.36506233]], dtype=float32)

In [29]:
import pandas as pd

In [30]:
df = pd.read_csv('spam.csv')

In [33]:
df.head()

Unnamed: 0,title,text,type
0,?? the secrets to SUCCESS,"Hi James,\n\nHave you claim your complimentary...",spam
1,?? You Earned 500 GCLoot Points,"\nalt_text\nCongratulations, you just earned\n...",not spam
2,?? Your GitHub launch code,"Here's your GitHub launch code, @Mortyj420!\n ...",not spam
3,[The Virtual Reward Center] Re: ** Clarifications,"Hello,\n \nThank you for contacting the Virtua...",not spam
4,"10-1 MLB Expert Inside, Plus Everything You Ne...","Hey Prachanda Rawal,\n\nToday's newsletter is ...",spam


In [34]:
df['label'] = df['type'].apply(lambda x:1 if x=='spam' else 0)

In [35]:
df.head()

Unnamed: 0,title,text,type,label
0,?? the secrets to SUCCESS,"Hi James,\n\nHave you claim your complimentary...",spam,1
1,?? You Earned 500 GCLoot Points,"\nalt_text\nCongratulations, you just earned\n...",not spam,0
2,?? Your GitHub launch code,"Here's your GitHub launch code, @Mortyj420!\n ...",not spam,0
3,[The Virtual Reward Center] Re: ** Clarifications,"Hello,\n \nThank you for contacting the Virtua...",not spam,0
4,"10-1 MLB Expert Inside, Plus Everything You Ne...","Hey Prachanda Rawal,\n\nToday's newsletter is ...",spam,1


In [36]:
def preprocess(text):
    doc = nlp(text)
    return doc.vector

In [37]:
df['X'] = df['title'].apply(lambda x:preprocess(x))

In [38]:
df.head()

Unnamed: 0,title,text,type,label,X
0,?? the secrets to SUCCESS,"Hi James,\n\nHave you claim your complimentary...",spam,1,"[-1.6986866, 5.0900383, -4.3180346, -1.4204868..."
1,?? You Earned 500 GCLoot Points,"\nalt_text\nCongratulations, you just earned\n...",not spam,0,"[-2.5418572, 1.6458598, -3.8146198, 0.7142529,..."
2,?? Your GitHub launch code,"Here's your GitHub launch code, @Mortyj420!\n ...",not spam,0,"[0.56561995, 3.1089783, -1.3254766, -0.6998732..."
3,[The Virtual Reward Center] Re: ** Clarifications,"Hello,\n \nThank you for contacting the Virtua...",not spam,0,"[2.4618053, -2.2533705, 6.042959, 0.30375445, ..."
4,"10-1 MLB Expert Inside, Plus Everything You Ne...","Hey Prachanda Rawal,\n\nToday's newsletter is ...",spam,1,"[-0.13889572, -0.357203, -1.3842368, 0.945941,..."


In [39]:
from sklearn.model_selection import train_test_split

In [42]:
xtrain,xtest,ytrain,ytest = train_test_split(
    df.X.values,
    df.label,test_size=0.2
)

In [44]:
import numpy as np

In [49]:
X_train_2d = np.stack(xtrain)
X_test_2d = np.stack(xtest)

In [51]:
X_train_2d,X_test_2d

(array([[-4.9303498e+00,  3.7732503e-01, -1.6070350e+00, ...,
         -1.0944250e+00, -4.0731249e+00,  8.6752498e-01],
        [ 6.8793637e-01, -4.1756999e-01,  1.0442246e+00, ...,
         -3.0830362e+00, -1.9543037e+00, -1.1001934e-04],
        [-2.3282397e-01,  1.0483342e+00,  4.1366506e-01, ...,
         -1.3027675e+00, -2.4075096e+00,  1.0250223e+00],
        ...,
        [ 2.9677749e-01, -6.1612499e-01,  3.1289277e+00, ...,
         -1.7276651e+00, -1.0776651e+00,  6.4414245e-01],
        [-2.4506450e-04,  5.3079373e-01, -7.6218748e-01, ...,
          4.6056294e-01, -1.0525501e+00,  9.6603978e-01],
        [-4.9740000e+00, -3.6452000e+00,  5.3850999e+00, ...,
         -3.3347001e+00, -8.9314997e-01,  4.0409002e+00]], dtype=float32),
 array([[ 2.4618053 , -2.2533705 ,  6.042959  , ..., -2.9115152 ,
          0.69406277, -0.17820267],
        [-0.33311853, -1.2371774 , -2.2353375 , ..., -1.7281927 ,
         -3.6296    ,  0.10960624],
        [-1.4208274 , -0.20165005, -3.76466   

In [52]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

In [54]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_2d)
X_test = scaler.fit_transform(X_test_2d)

In [55]:
clf = MultinomialNB()
clf.fit(X_train,ytrain)

In [57]:
y_pred = clf.predict(X_test)

In [58]:
from sklearn.metrics import classification_report

In [60]:
print(classification_report(ytest,y_pred))

              precision    recall  f1-score   support

           0       0.67      0.73      0.70        11
           1       0.40      0.33      0.36         6

    accuracy                           0.59        17
   macro avg       0.53      0.53      0.53        17
weighted avg       0.57      0.59      0.58        17



In [61]:
from sklearn.neighbors import KNeighborsClassifier

In [72]:
clf = KNeighborsClassifier(n_neighbors=3,metric='euclidean')

In [73]:
clf.fit(X_train,ytrain)

In [74]:
y_pred = clf.predict(X_test)

In [75]:
print(classification_report(ytest,y_pred))

              precision    recall  f1-score   support

           0       0.65      1.00      0.79        11
           1       0.00      0.00      0.00         6

    accuracy                           0.65        17
   macro avg       0.32      0.50      0.39        17
weighted avg       0.42      0.65      0.51        17



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
