In [None]:
# jupyter-lab --NotebookApp.iopub_data_rate_limit=1e10
# --NotebookApp.iopub_data_rate_limit = 1e10

## **Overview**
---
>**A. Download GoogleNews-vectors-negative300.bin.gz pre-trained model**
>
>**B. Write a Python program, which:**
>>
>>**1.** Loads the downloaded pre-trained Google Word2Vec model
>>
>>**2.** Loads your previously obtained dataset of Webhose news articles
>>
>>**3.** For a selected article title from the dataset:
>>>  * Finds 100 most similar titles based on Word2Vec similarity, and
>>>  * Prints those titles in a descending order of similarity scores
>
>**C. Write a Pyspark program, which:**
>>
>>**1.** Loads your previously obtained dataset of Webhose news articles into a Spark dataframe
>>
>>**2.** Cleans + tokenizes article bodies using RegexTokenizer + Stopword remover functions
>>
>>**3.** Trains a Word2Vec model based on the output column produced in prior step
>>
>>**4.** Implements any sample search query and produces matching article titles

## **B. Write a Python program, which:**
---

### *1. Loads the downloaded pre-trained Google Word2Vec model*

---

In [None]:
import json
import spacy

import numpy as np
import pandas as pd
import en_core_web_sm

In [None]:
import gensim, operator
from gensim.models import KeyedVectors, Word2Vec

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the model and dataset paths used
# further down all live under /content/drive/MyDrive/.
drive.mount('/content/drive')

In [None]:
# Directory on the mounted Drive that holds the pre-trained model files
model_path = '/content/drive/MyDrive/'

def load_model(model_name, file_name, flagBin):
    """Load word vectors stored in word2vec format from `model_path`.

    Parameters
    ----------
    model_name : str
        Label used only in the progress messages.
    file_name : str
        File name appended to the module-level `model_path`.
    flagBin : bool
        Forwarded as `binary` to `KeyedVectors.load_word2vec_format`.

    Returns
    -------
    The object returned by `KeyedVectors.load_word2vec_format`.
    """
    print('Loading ' + model_name + ' model...')
    vectors = KeyedVectors.load_word2vec_format(
        model_path + file_name,
        binary=flagBin,
    )
    print('Finished loading ' + model_name + ' model...')

    return vectors

# Google News Word2Vec vectors (gzipped binary word2vec file)
model_w2v = load_model('Word2Vec', 'GoogleNews-vectors-negative300.bin.gz', True)

### **2. Loads your previously obtained dataset of Webhose news articles**
---

In [None]:
# webhose_json = pd.read_json('/content/drive/MyDrive/webhose_apple.json', lines = True)
# webhose_csv = webhose_json.to_csv('/content/webhose_apple.csv')

#newsfeeds = pd.read_csv('webhose_apple.csv')

In [None]:
apple_data = []

# Every line of the file is parsed as one JSON document.
with open('/content/drive/MyDrive/webhose_apple.json', 'r') as source:
    apple_data.extend(json.loads(record) for record in source)

# Article titles, in file order
apple_titles = [article['title'] for article in apple_data]

In [None]:
# subsetting feed titles
# NOTE: `feeds` is a one-element list whose single item is the list of the
# first 2132 titles (i.e. a nested list, not a flat list of titles).
feeds = [apple_titles[:2132]]

### **3. For a selected article title from the dataset:**
---

#### *Define Vector Model Functions*
---

In [None]:
# check if input words in model vocabulary
def check_vocab(model, words):
    """Return the entries of `words` that the model knows, in input order.

    Membership is tested with `word in model` rather than `model.vocab`:
    gensim 4 removed the `vocab` attribute from `KeyedVectors`, while the
    `in` operator is supported by both gensim 3.x and 4.x (and by plain
    dicts/sets, which is convenient for testing).

    Parameters
    ----------
    model : container supporting `in` (e.g. gensim `KeyedVectors`)
    words : iterable of str

    Returns
    -------
    list of str
        The in-vocabulary words, each passed through `str.strip()`.
    """
    return [word.strip() for word in words if word in model]

In [None]:
# calculate string similarity with model
def calc_sim(s1, s2, model):
    """Word2Vec similarity between two whitespace-separated strings.

    Each string is split on whitespace, reduced to its distinct in-vocabulary
    words (via `check_vocab`), and the two word lists are compared with
    `model.n_similarity`, which compares two *sets of words*.
    (`model.similarity` only accepts two single words.)

    Parameters
    ----------
    s1, s2 : str
        Texts to compare.
    model : gensim `KeyedVectors`
        Provides `n_similarity`.

    Returns
    -------
    float
        The similarity score, or 0.0 when either string has no word in the
        model vocabulary (n_similarity cannot score an empty word list).
    """
    # sorted() turns the de-duplicated sets into lists with a stable order
    s1_terms = sorted(set(check_vocab(model, s1.split())))
    s2_terms = sorted(set(check_vocab(model, s2.split())))

    if not s1_terms or not s2_terms:
        return 0.0

    return float(model.n_similarity(s1_terms, s2_terms))

In [None]:
# calculate vector similarity with particular model
def vec_sim(v1, v2, model):
    """Cosine similarity between the summed word vectors of two strings.

    Each string is split on single spaces; the vectors of the tokens found in
    `model` are added up, and the two sums are compared with
    `1 - scipy.spatial.distance.cosine`.

    Parameters
    ----------
    v1, v2 : str
        Space-separated texts.
    model : mapping from token to vector (e.g. gensim `KeyedVectors`)
        Looked up with `model[token]`; a `KeyError` means the token is
        skipped. If the model exposes `vector_size` it is used as the vector
        length, otherwise 300 is assumed.

    Returns
    -------
    float or int
        The similarity, or 0 when either string has no known token (its sum
        is the zero vector) or the result is NaN.
    """
    # Local import: the notebook only imports `spatial` in a later cell.
    from scipy import spatial

    dim = getattr(model, 'vector_size', 300)

    summed = []
    for text in (v1, v2):
        total = np.zeros(dim)
        for token in text.split(' '):
            try:
                total += model[token]
            except KeyError:
                # token not in the model: contributes nothing
                continue
        summed.append(total)

    # Cosine is undefined for an all-zero vector; report "no similarity".
    if not summed[0].any() or not summed[1].any():
        return 0

    similarity = 1 - spatial.distance.cosine(summed[0], summed[1])

    # np.isnan is the real NaN test (`x is 'nan'` is never true)
    return 0 if np.isnan(similarity) else similarity

In [None]:
# find the cosine similarity of two vectors
def cos_sim(v1, v2):
    """Smoothed cosine-style similarity of two vectors.

    Computes dot(v1, v2) / ||v1|| / sqrt(dot(v2, v2) + 0.1).

    Note that 0.1 is added under the square root for `v2` only, so the result
    is slightly below the true cosine and is not symmetric in its arguments;
    the term keeps the second divisor non-zero when `v2` is all zeros.
    """
    numerator = np.dot(v1, v2)
    v1_norm = np.sqrt(np.dot(v1, v1))
    v2_norm_smoothed = np.sqrt(np.dot(v2, v2) + .1)

    return numerator / v1_norm / v2_norm_smoothed

#### **a. Find 100 most similar titles based on Word2Vec similarity**
---

In [None]:
if __name__ == '__main__':

  # position of the reference article in apple_titles
  dex = 888

  # one similarity score per title, in the same order as apple_titles
  apple_calc = [
    calc_sim(str(apple_titles[dex]), str(candidate), model_w2v)
    for candidate in apple_titles
  ]

#### **b. Prints those titles in a descending order of similarity scores**
---

In [None]:
apple_dict = {'title': apple_titles, 'similarity': apple_calc}

# pandas frames are ordered with sort_values(); orderBy()/show() are PySpark
# DataFrame methods and do not exist on pandas.DataFrame.
apple_df = pd.DataFrame(apple_dict).sort_values('similarity', ascending=False)

# Top 100 titles, highest similarity first; to_string() prints full titles
# without column-width truncation.
print(apple_df.head(100).to_string())

## **C. Write a Pyspark program, which:**
---

### **1. Loads your previously obtained dataset of Webhose news articles into a Spark dataframe**
---

In [None]:
!pip install pyspark

In [None]:
from pyspark import SparkContext
from pyspark.sql import SQLContext

# getOrCreate() reuses the running context, so this cell can be re-executed;
# calling SparkContext() a second time in the same kernel raises an error.
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

In [None]:
from scipy import spatial
from nltk.stem.wordnet import WordNetLemmatizer

from pyspark.mllib.linalg import Vector, Vectors
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, Word2Vec, Word2VecModel

In [None]:
# Read the Webhose export into a Spark DataFrame.
# NOTE(review): "header" and "delimiter" look like CSV reader options —
# confirm whether they have any effect on .json().
article_df = (
    sqlContext.read
    .option("header", "true")
    .option("delimiter", ",")
    .option("inferSchema", "true")
    .json("/content/drive/MyDrive/webhose_apple.json")
)

In [None]:
# Keep only the two columns used downstream
article_data = article_df.select('index', 'title')

In [None]:
# Positions of the fields copied out of each row (0 = index, 1 = title)
article_columns = [0, 1]

# Convert each Row to a plain list and drop records whose title is missing
article_rdd = (
    article_data.select('*').rdd
    .map(lambda record: [record[pos] for pos in article_columns])
    .filter(lambda fields: fields[1] is not None)
)

article_df = sqlContext.createDataFrame(article_rdd, ['index', 'title'])

### **2. Cleans + tokenizes article bodies using RegexTokenizer + Stopword remover functions**
---

In [None]:
# Load the small English spaCy pipeline with the parser, tagger and NER
# components disabled; text_cleanup reads token lemmas from this pipeline.
nlp = en_core_web_sm.load( disable=['parser', 'tagger','ner'] )

In [None]:
def cleanup_pretokenize(text):
    """Normalise raw text before it is handed to the tokenizer.

    Steps, in order:
      1. remove every run of non-space characters that starts with "http";
      2. expand / strip common English contractions and possessives;
      3. replace "-", "/", "(" and ")" with a space and "%" with " percent ".

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        The cleaned text (whitespace is not collapsed).
    """
    # Local import: `re` is not imported by any of the notebook's import cells.
    import re

    text = re.sub(r'http\S+', '', text)

    # Applied sequentially, so order matters. "you're"/"You're" need no entry
    # of their own: the generic "'re" rule has already expanded them.
    replacements = (
        ("'s", " "),
        ("n't", " not "),
        ("'ve", " have "),
        ("'re", " are "),
        ("I'm", " I am "),
        ("-", " "),
        ("/", " "),
        ("(", " "),
        (")", " "),
        ("%", " percent "),
    )
    for old, new in replacements:
        text = text.replace(old, new)

    return text

In [None]:
# NLTK WordNet lemmatizer instance
lmtzr = WordNetLemmatizer()

In [None]:
def text_cleanup(row):
    """Replace the text at position 2 of `row` with its cleaned lemmas.

    The text is stripped, lower-cased, passed through `cleanup_pretokenize`
    and the spaCy pipeline `nlp`; only lemmas that are purely alphabetic and
    longer than three characters are kept, joined by single spaces.

    `row` is modified in place (so it must support item assignment) and is
    also returned.
    """
    cleaned = cleanup_pretokenize(row[2].strip().lower())

    kept = []
    for tok in nlp(cleaned):
        lemma = tok.lemma_
        if lemma.isalpha() and len(lemma) > 3:
            kept.append(lemma)

    row[2] = ' '.join(kept)
    return row

In [None]:
# gaps=False: the pattern matches the tokens themselves (runs of word
# characters) rather than the separators. The pattern is a raw string so
# that "\w" is not treated as an (invalid) Python string escape.
regexTokenizer = RegexTokenizer(
    gaps=False,
    pattern=r'\w+',
    inputCol='description',
    outputCol='tokens',
)

# Drops stop words from the token lists produced above
swr = StopWordsRemover(inputCol='tokens', outputCol='tokens_sw_removed')

In [None]:
# article_df only carries 'index' and 'title', while the tokenizer reads a
# 'description' column; expose the title text under that name so the
# transform has an input column (the 'title' column is kept for later cells).
df_tokens = regexTokenizer.transform(
    article_df.withColumn('description', article_df['title'])
)
desc_swr = swr.transform(df_tokens)
desc_swr.show(3)

#desc_swr_half = desc_swr.limit(50000)
#desc_swr_half.show(3)
#desc_swr.write.saveAsTable('desc_swr', mode = 'overwrite')

### **3. Trains a Word2Vec model based on the output column produced in prior step**
---

In [None]:
# `Word2Vec` here is the pyspark.ml.feature estimator (imported after, and
# therefore shadowing, the gensim class of the same name).
word2vec = Word2Vec(
    vectorSize=300,
    minCount=5,
    inputCol='tokens_sw_removed',
    outputCol='wordvectors',
)

# Fit on the stop-word-filtered tokens, then attach one vector per article
model = word2vec.fit(desc_swr)
wordvectors = model.transform(desc_swr)
#wordvectors.select('wordvectors').show(1, truncate = True)

article_desc = wordvectors.select('index', 'title', 'wordvectors').rdd.toDF()
article_desc.show(5)

In [None]:
#chunk = article_desc.filter(lambda r: r[1]>=0 and r[1]<1000).collect()
#chunk = article_desc.collect()

# Pull up to 50000 (index, title, wordvectors) rows into a local list
chunk = article_desc.take(50000)

In [None]:
# 20 nearest neighbours of "apple" according to the trained Spark model
synonyms = model.findSynonyms("apple", 20)
synonyms.show()

### **4. Implements any sample search query and produces matching article titles**
---

In [None]:
# Free-text query that the article titles are ranked against
SEARCH_QUERY = "Apple Music"

In [None]:
# One-row frame so the query passes through the same tokenizer and
# stop-word stages as the articles
query_df = sc.parallelize([(1, SEARCH_QUERY)]).toDF(['index', 'description'])
query_tok = regexTokenizer.transform(query_df)
query_swr = swr.transform(query_tok)
query_swr.show()

# Keep only the vector itself: first column of the single result row
query_vec = model.transform(query_swr).select('wordvectors').first()[0]
query_vec

In [None]:
# Score every collected article against the query vector. The helper is
# named `cos_sim`; no `cossim` exists in this notebook.
sim_rdd = sc.parallelize(
    [(row[0], row[1], float(cos_sim(query_vec, row[2]))) for row in chunk]
)

# Name the columns directly and rank by similarity, best match first
sim_df = sqlContext.createDataFrame(sim_rdd, ['index', 'title', 'similarity']) \
                   .orderBy("similarity", ascending=False)

sim_df.show(42, truncate=False)