In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession, SQLContext
spark = SparkSession.builder.appName("spark-nlp") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5")\
    .getOrCreate()

In [2]:
sc = spark.sparkContext
spark

In [3]:
import nltk
nltk.download('stopwords')
nltk.download('words')

import nltk
# get the list of stopwords from nltk
from nltk.corpus import stopwords

eng_stopwords = stopwords.words('english')
eng_stopwords.append('xxxx')
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer, 
                                LemmatizerModel, StopWordsCleaner)
from pyspark.ml import Pipeline

documentAssembler = DocumentAssembler() \
    .setInputCol('text') \
    .setOutputCol('document')

tokenizer = Tokenizer() \
    .setInputCols(['document']) \
    .setOutputCol('token')

# note normalizer defaults to changing all words to lowercase.
# Use .setLowercase(False) to maintain input case.
normalizer = Normalizer() \
    .setInputCols(['token']) \
    .setOutputCol('normalized') \
    .setLowercase(True)

# note that lemmatizer needs a dictionary.
lemmatizer = LemmatizerModel.pretrained() \
    .setInputCols(['normalized']) \
    .setOutputCol('lemma') \

stopwords_cleaner = StopWordsCleaner() \
    .setInputCols(['lemma']) \
    .setOutputCol('clean_lemma') \
    .setCaseSensitive(False) \
    .setStopWords(eng_stopwords)

# finisher converts tokens to human-readable output
finisher = Finisher() \
    .setInputCols(['clean_lemma']) \
    .setCleanAnnotations(False)

pipeline = Pipeline() \
    .setStages([
        documentAssembler,
        tokenizer,
        normalizer,
        lemmatizer,
        stopwords_cleaner,
        finisher
    ])

[nltk_data] Downloading package stopwords to /home/hadoop/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/hadoop/nltk_data...
[nltk_data]   Package words is already up-to-date!


lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [251]:
wiki = sc.wholeTextFiles("s3a://zihe-public/articles/*")

In [252]:
import re

def get_title(x):
    pattern = "\<doc\sid\=\"(\d+)\"(.*)title\=\"(.*)\"\>"
    pattern_re = re.compile(pattern)
    matches = pattern_re.search(x)
    if not matches:
        return ""
    return matches[3]

def get_content(x):
    pattern = "\<doc\sid\=\"(\d+)\"(.*)title\=\"(.*)\"\>\\n(.*?)\\n{2}"
    pattern_re = re.compile(pattern)
    matches = pattern_re.search(x)
    if not matches:
        return ""
    idx = matches.end(0)
    return x[idx:]

def get_id(x):
    pattern = "\<doc\sid\=\"(\d+)\"(.*)title\=\"(.*)\"\>"
    pattern_re = re.compile(pattern)
    matches = pattern_re.search(x)
    if not matches:
        return ""
    return matches[1]

In [253]:
text = wiki.flatMap(lambda x: (x[1].split('</doc>'))).map(lambda x : (get_id(x), get_title(x), get_content(x)))
text = text.toDF(["id","title","text"])
text.show()

+---+--------------------+--------------------+
| id|               title|                text|
+---+--------------------+--------------------+
| 12|           Anarchism|Anarchism is a po...|
| 25|              Autism|Autism is a devel...|
| 39|              Albedo|Albedo () (, mean...|
|290|                   A|A or a is the fir...|
|303|             Alabama|Alabama () is a s...|
|305|            Achilles|In Greek mytholog...|
|307|     Abraham Lincoln|Abraham Lincoln (...|
|308|           Aristotle|Aristotle (; "Ari...|
|309|An American in Paris|An American in Pa...|
|316|Academy Award for...|The Academy Award...|
|324|      Academy Awards|The Academy Award...|
|330|             Actrius|Actresses (Catala...|
|332|     Animalia (book)|Animalia is an il...|
|334|International Ato...|International Ato...|
|336|            Altruism|Altruism is the p...|
|339|            Ayn Rand|Ayn Rand (; born ...|
|340|        Alain Connes|Alain Connes (; b...|
|344|          Allan Dwan|Allan Dwan (3 

In [254]:
cleaned = pipeline.fit(text).transform(text)
cleaned.show()

+---+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| id|               title|                text|            document|               token|          normalized|               lemma|         clean_lemma|finished_clean_lemma|
+---+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| 12|           Anarchism|Anarchism is a po...|[[document, 0, 34...|[[token, 0, 8, An...|[[token, 0, 8, an...|[[token, 0, 8, an...|[[token, 0, 8, an...|[anarchism, polit...|
| 25|              Autism|Autism is a devel...|[[document, 0, 43...|[[token, 0, 5, Au...|[[token, 0, 5, au...|[[token, 0, 5, au...|[[token, 0, 5, au...|[autism, developm...|
| 39|              Albedo|Albedo () (, mean...|[[document, 0, 17...|[[token, 0, 5, Al...|[[token, 0, 5, al...|[[token, 0, 5, al...

In [255]:
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF
from pyspark.mllib.feature import Normalizer
from pyspark.ml.linalg import Vectors

In [256]:
documents = cleaned.select('finished_clean_lemma').rdd.flatMap(list)
documentId = cleaned.select('id').rdd.flatMap(list)

hashingTF = HashingTF(100000)
tf = hashingTF.transform(documents)

In [None]:
tf.cache()
idf = IDF(minDocFreq=1).fit(tf)

In [157]:
tfidf = idf.transform(tf)

In [217]:
normalizer = Normalizer()
normalizedtfidf=normalizer.transform(tfidf)

In [223]:
keyword = "mathematics"

In [224]:
keywordTF = hashingTF.transform([keyword.lower()])
keywordHashValue = int(keywordTF.indices[0])

In [225]:
keywordRelevance = normalizedtfidf.map(lambda x: x[keywordHashValue])
zippedResults = keywordRelevance.zip(documentId)

In [226]:
keywordRelevance.top(10)

[0.010993429132272574,
 0.005807206658798934,
 0.005286009919697647,
 0.004883073523233428,
 0.003461760271393371,
 0.003270489891544955,
 0.003161939544531214,
 0.003029951350310274,
 0.002756716441453091,
 0.0026975349989878774]

In [244]:
print(zippedResults.max())

(0.010993429132272574, '334')


In [250]:
cleaned.createOrReplaceTempView("table_df")
out = sqlContext.sql("""SELECT title FROM table_df WHERE id == 334""")
out.show(truncate = False)

+-------------------------+
|title                    |
+-------------------------+
|International Atomic Time|
+-------------------------+

