In [1]:
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# Reuse an already-running context/session when there is one, so that
# re-running this cell in a live kernel does not fail with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()

In [2]:
import pandas as pd
# Three pre-tokenized toy documents; each becomes one row of the 'texts' column.
tokenized_docs = [
    ['I', 'like', 'playing', 'basketball'],
    ['I', 'like', 'coding'],
    ['I', 'like', 'machine', 'learning', 'very', 'much'],
]
pdf = pd.DataFrame({'texts': tokenized_docs})

# Hand the pandas frame to Spark and print it without truncating long cells.
df = spark.createDataFrame(pdf)
df.show(truncate=False)

+----------------------------------------+
|texts                                   |
+----------------------------------------+
|[I, like, playing, basketball]          |
|[I, like, coding]                       |
|[I, like, machine, learning, very, much]|
+----------------------------------------+



## Ngrams and collocations
Transform the tokenized texts into 2-gram, 3-gram, and 4-gram collocations.

In [4]:
from pyspark.ml.feature import NGram
from pyspark.ml import Pipeline

In [6]:
# One NGram transformer per n; each reads 'texts' and writes to '<n>-grams'.
ngrams = [
    NGram(n=n, inputCol='texts', outputCol='{}-grams'.format(n))
    for n in (2, 3, 4)
]

In [7]:
# Chain the three NGram stages into a single Pipeline.
pipeline = Pipeline(stages=ngrams)

In [8]:
# Fit the pipeline on df and apply it to the same DataFrame; the result
# carries the '2-grams', '3-grams' and '4-grams' output columns.
texts_grams = pipeline.fit(df).transform(df)

In [10]:
# Display the 2-gram column in full (truncate=False keeps long cells intact).
texts_grams.select('2-grams').show(truncate=False)

+------------------------------------------------------------------+
|2-grams                                                           |
+------------------------------------------------------------------+
|[I like, like playing, playing basketball]                        |
|[I like, like coding]                                             |
|[I like, like machine, machine learning, learning very, very much]|
+------------------------------------------------------------------+



In [11]:
# Display the 3-gram column in full (truncate=False keeps long cells intact).
texts_grams.select('3-grams').show(truncate=False)

+----------------------------------------------------------------------------------+
|3-grams                                                                           |
+----------------------------------------------------------------------------------+
|[I like playing, like playing basketball]                                         |
|[I like coding]                                                                   |
|[I like machine, like machine learning, machine learning very, learning very much]|
+----------------------------------------------------------------------------------+



In [12]:
# Display the 4-gram column in full; the three-token row ('I like coding')
# is too short for a 4-gram and therefore shows an empty list.
texts_grams.select('4-grams').show(truncate=False)

+---------------------------------------------------------------------------------+
|4-grams                                                                          |
+---------------------------------------------------------------------------------+
|[I like playing basketball]                                                      |
|[]                                                                               |
|[I like machine learning, like machine learning very, machine learning very much]|
+---------------------------------------------------------------------------------+



In [13]:
## Access corpora from the NLTK package
### The Gutenberg corpus
#### Get file ids in the Gutenberg corpus

In [14]:
from nltk.corpus import gutenberg

In [16]:
import nltk

In [17]:
# Download the Gutenberg corpus package into the local nltk_data directory.
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/liziwei/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


True

In [18]:
# Collect the file ids (one per text) available in the Gutenberg corpus.
guten_fileids = gutenberg.fileids()

In [20]:
# Echo the collected file ids as the cell output.
guten_fileids

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

#### Absolute path of a file

In [22]:
# Resolve the on-disk location of the first file id ('austen-emma.txt').
gutenberg.abspath(guten_fileids[0])

FileSystemPathPointer('/Users/liziwei/nltk_data/corpora/gutenberg/austen-emma.txt')

#### Raw text

In [26]:
# Preview only the first 200 characters of the raw text to keep the output short.
gutenberg.raw(guten_fileids[0])[:200]

'[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; an'

#### Sentences of a specific file

In [28]:
# Sentences of the first file, each given as a list of word tokens.
gutenberg.sents(guten_fileids[0])

[['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']'], ['VOLUME', 'I'], ...]

In [29]:
# Number of sentences in the first file.
len(gutenberg.sents(guten_fileids[0]))

7752

### Loading custom corpus

In [32]:
from nltk.corpus import PlaintextCorpusReader

# Root directory of the local corpus and the regex selecting its files.
corpus_root = '/Users/liziwei/Desktop/bigdata/learningSpark/data'
# '.*' matches every file under the root, including non-text entries
# such as '.DS_Store' and the '.crc' files.
fileid_pattern = '.*'
corpus_data = PlaintextCorpusReader(corpus_root, fileid_pattern)

#### Files in the corpus data

In [34]:
# List every file id the reader picked up under the corpus root.
# NOTE(review): 'data_fieids' looks like a typo for 'data_fileids'; the name
# is left as-is in case other cells reference it.
data_fieids = corpus_data.fileids()
data_fieids

['.DS_Store',
 'Advertising.csv',
 'Credit.csv',
 'WineData.csv',
 'airquality.csv',
 'churn-bigml-20.csv',
 'churn-bigml-80.csv',
 'cuse_binary.csv',
 'horseshoe_crab.csv',
 'hsb2.csv',
 'hsb2_modified.csv',
 'iris.csv',
 'kaggle-titanic-gender_submission.csv',
 'kaggle-titanic-test.csv',
 'kaggle-titanic-train.csv',
 'mtcars.csv',
 'prostate.csv',
 'saved-mtcars/.part-00000-1bbfe035-9f3f-4242-b1f2-be740ac7b5fb-c000.csv.crc',
 'saved-mtcars/_SUCCESS',
 'saved-mtcars/part-00000-1bbfe035-9f3f-4242-b1f2-be740ac7b5fb-c000.csv',
 'saved-twitter/.part-00000.crc',
 'saved-twitter/_SUCCESS',
 'saved-twitter/part-00000',
 'titanic/gender_submission.csv',
 'titanic/test.csv',
 'titanic/train.csv',
 'twitter.txt']

#### Raw text in twitter.txt

In [35]:
# Raw contents of twitter.txt as a single string.
# NOTE(review): this displays the whole file; the Gutenberg example above
# slices with [:200] to keep the output short — consider doing the same here.
corpus_data.raw('twitter.txt')

'Fresh install of XP on new computer. Sweet relief! fuck vista\t1018769417\t1.0\nWell. Now I know where to go when I want my knives. #ChiChevySXSW http://post.ly/RvDl\t10284216536\t1.0\n"Literally six weeks before I can take off ""SSC Chair"" off my email. Its like the torturous 4th mile before everything stops hurting."\t10298589026\t1.0\nMitsubishi i MiEV - Wikipedia, the free encyclopedia - http://goo.gl/xipe Cutest car ever!\t109017669432377344\t1.0\n\'Cheap Eats in SLP\' - http://t.co/4w8gRp7\t109642968603963392\t1.0\nTeenage Mutant Ninja Turtle art is never a bad thing... http://bit.ly/aDMHyW\t10995492579\t1.0\nNew demographic survey of online video viewers: http://bit.ly/cx8b7I via @KellyOlexa\t11713360136\t1.0\nhi all - i\'m going to be tweeting things lookstat at the @lookstat twitter account. please follow me there\t1208319583\t1.0\nHoly carp, no. That movie will seriously suffer for it. RT @MouseInfo: Anyone excited for The Little Mermaid in 3D?\t121330835726155776\t1.0\n"Di

#### Words and sentences in twitter.txt

In [36]:
# Word tokens of twitter.txt.
corpus_data.words(fileids='twitter.txt')

['Fresh', 'install', 'of', 'XP', 'on', 'new', ...]

In [37]:
# Number of word tokens in twitter.txt.
len(corpus_data.words(fileids='twitter.txt'))

253

In [38]:
# Sentences of twitter.txt, each given as a list of word tokens.
corpus_data.sents(fileids='twitter.txt')

[['Fresh', 'install', 'of', 'XP', 'on', 'new', 'computer', '.'], ['Sweet', 'relief', '!'], ...]

In [39]:
# Number of sentences in twitter.txt.
len(corpus_data.sents(fileids='twitter.txt'))

14

### WordNet
The `nltk.corpus.wordnet.synsets()` function loads all synsets with a given lemma and part-of-speech tag.

In [41]:
# Download the WordNet corpus package into the local nltk_data directory.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/liziwei/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [42]:
from nltk.corpus import wordnet
# Inspect the synsets lookup method (not called here; the cell just shows its repr).
wordnet.synsets

<bound method WordNetCorpusReader.synsets of <WordNetCorpusReader in '/Users/liziwei/nltk_data/corpora/wordnet'>>

In [43]:
# One row per WordNet synset of the lemma 'car', identified by its name
# (e.g. 'car.n.01'). Uses the public Synset.name() accessor rather than
# reaching into the private `_name` attribute.
pdf = pd.DataFrame({
        'car_synsets': [synset.name() for synset in wordnet.synsets('car')]
    })

In [44]:
# Convert the synset-name frame to a Spark DataFrame.
# NOTE(review): this rebinds `df`, which earlier held the tokenized-texts DataFrame.
df = spark.createDataFrame(pdf)

In [45]:
# Print the synset names as a Spark table.
df.show()

+--------------+
|   car_synsets|
+--------------+
|      car.n.01|
|      car.n.02|
|      car.n.03|
|      car.n.04|
|cable_car.n.01|
+--------------+



### Get lemma names given a synset

In [47]:
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from nltk.corpus import wordnet

def lemma_names_from_synset(x):
    """Return the lemma names of the WordNet synset identified by ``x``.

    ``x`` is handed to ``wordnet.synset`` unchanged (for example
    ``'car.n.02'``); the result is whatever that synset's
    ``lemma_names()`` call returns.
    """
    return wordnet.synset(x).lemma_names()

# Try the function on a single synset id.
lemma_names_from_synset('car.n.02')

['car', 'railcar', 'railway_car', 'railroad_car']