### read as rdd

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("502hw4").getOrCreate()

In [7]:
data = spark.sparkContext.textFile("s3a://zihe-public/articles/*")

In [28]:
top100 = data.top(100)

### Read as dataframe

In [9]:
df = spark.read.text("s3a://zihe-public/articles/*")

In [13]:
df.first()

Row(value='<doc id="1105982" url="https://en.wikipedia.org/wiki?curid=1105982" title="Electrical wiring in the United Kingdom">')

In [14]:
df.printSchema()

root
 |-- value: string (nullable = true)



### Read all text files

In [18]:
wiki = spark.sparkContext.wholeTextFiles("s3a://zihe-public/articles/*")

In [19]:
pages = wiki.flatMap(lambda x :(x[1].split('</doc>')))

In [21]:
pages.top(1) # manually stopped after 5 minutes

KeyboardInterrupt: 

In [33]:
top100

['𨔍之彼嗇斯豐',
 '𦢳殺𦢳終𠬠俸旂。',
 '𤾓𢆥𥪞𡎝𠊛些',
 '𤀘𨖲！衝葩𣾼過雹𤑫。',
 '𤀘𨖲！人民沔南英雄！',
 '𣦆戈沒局𣷭橷',
 '𡦂才𡦂命窖羅恄饒',
 '𡗶撑悁貝𦟐紅打慳',
 '𠬠𣳔些𤷍嫩滝凭鑕.',
 '\U0001fa53 U+1FA53 BLACK CHESS KNIGHT-BISHOP',
 '\U0001fa52 U+1FA52 BLACK CHESS KNIGHT-ROOK',
 '\U0001fa51 U+1FA51 BLACK CHESS KNIGHT-QUEEN',
 '\U0001fa50 U+1FA50 WHITE CHESS KNIGHT-BISHOP<br>',
 '\U0001fa4f U+1FA4F WHITE CHESS KNIGHT-ROOK<br>',
 '\U0001fa4e U+1FA4E WHITE CHESS KNIGHT-QUEEN<br>',
 '🥉 100m Freestyle S10',
 '🥈 200m Individual Medley SM10',
 '🥈 100m Breaststroke SB9',
 '🥈 100m Breaststroke S10',
 '🥈 100m Backstroke S10',
 '🥇 World Champion 100m backstroke S10',
 '🥇 Paralympic Champion 100m Breaststroke SB9',
 '🥇 European Champion 200m Individual Medley SM10 ,European Record',
 '🥇 European Champion 100m Butterfly S10',
 '🚫 = Ruled out ---',
 '🌟Aizawl F.C. (INDIA- I league )',
 '🇮🇳 Fattyabad 🇮🇳',
 '🇪🇹 El shalom Blessing Synagogue ( Ethiopia)',
 '𝜕 is described as the set of sets that have the property that the motion vectors of an object are conserved. 𝜕 can

In [34]:
AA00 = spark.sparkContext.textFile("s3a://zihe-public/articles/AA/wiki_00")

In [36]:
sentences = AA00.top(100)

In [38]:
sentences[90:100] # list of strings

['Under the pretext of a slight to their consul, the French invaded and captured Algiers in 1830. Historian Ben Kiernan wrote on the French conquest of Algeria: "By 1875, the French conquest was complete. The war had killed approximately 825,000 indigenous Algerians since 1830." French losses from 1831–51 were 92,329 dead in the hospital and only 3,336 killed in action. The population of Algeria, which stood at about 2.9 million in 1872, reached nearly 11 million in 1960. French policy was predicated on "civilizing" the country. The slave trade and piracy in Algeria ceased following the French conquest. The conquest of Algeria by the French took some time and resulted in considerable bloodshed. A combination of violence and disease epidemics caused the indigenous Algerian population to decline by nearly one-third from 1830 to 1872. During this period, a small but influential French-speaking indigenous elite was formed, made up of Berbers, mostly Kabyles. As a consequence, French govern

In [69]:
sentences_all = AA00.collect()

In [78]:
sentences_all[0:10]

['<doc id="12" url="https://en.wikipedia.org/wiki?curid=12" title="Anarchism">',
 'Anarchism',
 '',
 'Anarchism is a political philosophy and movement that rejects all involuntary, coercive forms of hierarchy. It radically calls for the abolition of the state which it holds to be undesirable, unnecessary and harmful.',
 '',
 "The timeline of anarchism stretches back to prehistory when people lived in anarchistic societies long before the establishment of formal states, kingdoms or empires. With the rise of organised hierarchical bodies, skepticism towards authority also rose, but it was not until the 19th century that a self-conscious political movement was formed. During the latest half of 19th and the first decades of 20th century, the anarchist movement flourished in most parts of the world and had a significant role in worker's struggles for emancipation. Various branches of anarchism were espoused during those times. Anarchists took part in several revolutions, most notably in the

### Train word2vec

In [81]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 
import re

def clean_tokenize(sentence):  
    '''
    Takes in a long string of text, split them into words, 
    remove stop words and useless words, convert the rest to 
    lower case and return a list of words.
    '''
    useless = ['a', "the", "is", "by"] 
    stops = list(set(stopwords.words('english')))
    ignore = useless + stops
    out = []
    for i in range(len(sentence)):
        if re.match("^\<doc\sid=*", sentence[i]):
            pass
        else:
            words = re.sub("[^\w]", " ",  sentence[i]).split()    
            cleaned_text = [w.lower() for w in words if w not in ignore]
            out.append(cleaned_text)
    out = [item for sublist in out for item in sublist]
    return(out)

bag = clean_tokenize(sentences)
bag[0:10]

[nltk_data] Downloading package stopwords to /home/hadoop/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['émile',
 'cohl',
 'fantasmagorie',
 '1908',
 'oldest',
 'known',
 'example',
 'became',
 'known',
 'traditional']

In [82]:
len(bag)

5173

In [67]:
import gensim, logging
from gensim.models import Word2Vec

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

model = gensim.models.Word2Vec(min_count=1, workers=4)
model.build_vocab([bag], progress_per=200000)
model.train([bag], epochs=model.epochs, total_examples = model.corpus_count)
model.save('wikiw2v.model')

2020-04-26 01:24:31,549 : INFO : collecting all words and their counts
2020-04-26 01:24:31,549 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-04-26 01:24:31,551 : INFO : collected 2841 word types from a corpus of 5173 raw words and 1 sentences
2020-04-26 01:24:31,552 : INFO : Loading a fresh vocabulary
2020-04-26 01:24:31,556 : INFO : effective_min_count=1 retains 2841 unique words (100% of original 2841, drops 0)
2020-04-26 01:24:31,557 : INFO : effective_min_count=1 leaves 5173 word corpus (100% of original 5173, drops 0)
2020-04-26 01:24:31,564 : INFO : deleting the raw counts dictionary of 2841 items
2020-04-26 01:24:31,565 : INFO : sample=0.001 downsamples 18 most-common words
2020-04-26 01:24:31,565 : INFO : downsampling leaves estimated 5094 word corpus (98.5% of prior 5173)
2020-04-26 01:24:31,569 : INFO : estimated required memory for 2841 words and 100 dimensions: 3693300 bytes
2020-04-26 01:24:31,570 : INFO : resetting layer weights
2020-04-2

In [68]:
model['example']

  """Entry point for launching an IPython kernel.


array([-4.9345559e-03,  2.8157511e-03, -2.8440407e-03, -2.8837807e-03,
       -1.7939520e-04, -3.3549110e-03,  1.6653471e-03, -1.3558514e-03,
        3.8675396e-04,  4.1990993e-03,  3.1809363e-04,  4.2467280e-03,
       -4.5813671e-03, -4.2068455e-03, -3.7316575e-03, -4.4005788e-03,
        3.6935061e-03,  1.7932577e-03, -4.7004703e-03, -1.2004443e-03,
       -4.8649386e-03,  1.0510389e-03, -8.4934472e-05, -3.8813849e-03,
        2.8940702e-05,  3.9807744e-03,  5.5749784e-04,  4.7089765e-03,
       -1.0215463e-03, -3.4138227e-03,  3.8320108e-03, -2.2564363e-03,
       -2.9475163e-03, -3.2811519e-03,  6.3652243e-04, -2.8253873e-03,
        2.6116604e-03, -1.4962563e-03,  3.5571421e-03, -4.4515328e-03,
        3.5854334e-03,  1.6904001e-03,  4.4565718e-03, -5.0842832e-03,
        1.3151071e-03, -4.5135012e-03, -3.8599975e-03, -6.3840888e-04,
       -2.3547511e-03, -1.5899342e-03, -4.2506210e-03,  2.5588400e-03,
       -1.1868946e-03,  1.0762188e-03,  7.4906473e-04, -1.3178798e-03,
      

### Topic models attempt

In [16]:
from gensim.models import TfidfModel, LsiModel
from gensim.corpora import Dictionary, WikiCorpus, MmCorpus
from gensim import similarities
from gensim import utils
import time
import sys
import logging
import os

unable to import 'smart_open.gcs', disabling that module


In [17]:
dictionary = Dictionary()
wiki = WikiCorpus(dump_file, dictionary=dictionary)

NameError: name 'dump_file' is not defined

In [None]:
print('Parsing Wikipedia to build Dictionary...')    
dictionary.add_documents(wiki.get_texts(), prune_at=None) 
keep_words = 100000 
wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=keep_words)
wiki.dictionary.save_as_text('./data/dictionary.txt.bz2')