### Read Astrailian News Titles:

In [34]:
import numpy as np
import spacy
import os.path
import os
path = os.path.join(os.getcwd(), "/Users/Ben/Downloads/abcnews-date-text.csv")

#Extract titles
data = open(path,"r").read()
lines = data.split("\n")
titles = []
record = 0
for i,line in enumerate(lines):
    if(i > 0 and i < 10000):
        title = line.split(",")[1]
        titles.append(title)

# 1) Vectorize:

In [35]:
nlp = spacy.load('en_core_web_lg')

def vectorize(weighted=False):
    #add multiprocessing?
    freq_dict = {}

    
    docs = nlp.pipe(titles)

    vectors = []
    doc_tokens = []
    num_words = 0
    for doc in docs:
        
        #vectorization
        if(weighted):
            doc_tokens.append([token for token in doc]) #split into tokens for later weighted avging
            
            #making freq_dict
            for token in doc:
                if token.text in freq_dict:
                    freq_dict[token.text] += 1
                else:
                    freq_dict[token.text] = 1
                num_words += 1
        else:
            vectors.append(doc.vector)
    
    
    #calc freqs for freq_dict
    if weighted:
        for line_of_tokens in doc_tokens:
            line_of_token_vectors = np.asarray([token.vector for token in line_of_tokens])
            weights = [num_words/freq_dict[token.text] for token in line_of_tokens]
            vector = np.average(line_of_token_vectors,weights=weights,axis=0)
            vectors.append(vector)
        
    return vectors

In [36]:
from scipy.spatial import KDTree

vectors = vectorize(False) #True for weighted!
len(vectors)

9999

### Nearest Neighbors:

In [37]:
tree = KDTree(vectors)

def findKNN(text,k=10):
    dists, idxs = tree.query(nlp(text).vector,k=k)
    for i in range(k):
        print(titles[idxs[i]], ": ", dists[i])

In [38]:
findKNN("monster surf sweeps woman to death")

monster surf sweeps woman to death :  0.0
woman in hospital after surf ordeal :  2.2981802417131862
ten man river crash to emelec in trouble hit match :  2.3359068801557115
man to face court over fatal hit and run :  2.3728053342866002
savage gets death threats after dublin head butt :  2.3834656867599664
boy almost drowns after being swept down storm :  2.4047534797247496
gold coast boy continue to fight for life after :  2.412719550544082
man bleeds to death after shop burglary :  2.4320065142644673
serena overcomes slow start to swamp birthday girl :  2.461841138813617
virus death toll rises as world waits on news from :  2.466015810628854


# 2) Find Clusters using Unsupervised Learning:

In [39]:
from sklearn.cluster import KMeans, AgglomerativeClustering
classifier = AgglomerativeClustering(n_clusters=5).fit(vectors) #KMeans().fit(vectors) #which clustering_model

# 3) Save Model for TSNE in Embedding Projector:

In [40]:
def save_vectors(vectors):
    with open(os.path.join(os.getcwd(), "/Users/Ben/Desktop/Vital Strategies/nlpVectors/nlp_tensors_unweighted.tsv"), "w") as f:
        for vector in vectors:
            for feature in vector:
                f.write(str(feature))
                f.write("\t")
            f.write("\n")
            
    f.close()
def save_metadata(titles):
    with open(os.path.join(os.getcwd(), "/Users/Ben/Desktop/Vital Strategies/nlpVectors/nlp_metadata_unweighted.tsv"), "w") as f:
        f.write("title\tcategory\n")
        for i, title in enumerate(titles):
            f.write(title)
            f.write("\t")
            f.write(str(classifier.labels_[i]))#.predict([vectors[i]])[0]))
            f.write("\n")
    f.close()

In [41]:
save_vectors(vectors)
save_metadata(titles)

#https://projector.tensorflow.org/

# 4) Train a Classifier for Clusters:

In [42]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import train

In [43]:
model = tf.keras.Sequential([
    layers.Dense(500, activation='relu', input_shape=(300,)),
    layers.Dense(50, activation='relu'),
    layers.Dense(5, activation='softmax')])

model.compile(optimizer=train.AdamOptimizer(.001), loss='categorical_crossentropy', metrics=['accuracy'])

#get onehot encoding:
from keras.utils import to_categorical
npvectors = np.asarray(vectors)
onehot = to_categorical(classifier.labels_)

model.fit(npvectors[:-2000],onehot[:-2000],epochs=8,batch_size=30) #leave 200 for testing

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<tensorflow.python.keras.callbacks.History at 0x6a0e83630>

### Evaluate Classifier:

In [44]:
model.evaluate(npvectors[-200:],onehot[-200:],batch_size=30)



[0.5938290446996689, 0.845]

# 5) Prediction:

In [45]:
#from scipy.spatial.distance import cosine

def predict_title(title):
    title_vector = np.asarray([nlp(title).vector]) 
    temp = model.predict(title_vector)[0]
    
    for i, item in enumerate(temp):
        print(i, ": ", item)
    highest_idx = np.argmax(temp)
    
    if(temp[highest_idx] <.5):
        print("\nUNCERTAINTY FLAG: model is < 50% confident!")

In [46]:
predict_title("First pig to land on moon") #"Romeo hacker eats"

0 :  0.06803368
1 :  0.0003404365
2 :  7.228554e-07
3 :  0.9316244
4 :  7.908757e-07
