In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import *

import pandas as pd

from collections import Counter
from itertools import islice
from os import chdir
import re
from urllib.parse import urlsplit, urlunsplit

In [2]:
# Work from the output directory: the relative paths used by later cells
# ("./corpus" for input, "sentences" for output) are written relative to the
# current working directory.
# NOTE(review): hard-coded absolute path — adjust for the machine this runs on.
chdir('/mnt/tmpfs/output')

In [3]:
# Spark session sized for a machine with 256 cores and 1TB of memory.
# Spark runs in local mode and is allowed to use all the available resources.
# On a machine with less memory, lower spark.executor.memory and
# spark.driver.memory, and raise spark.default.parallelism and
# spark.sql.shuffle.partitions to reduce the memory demand.

spark = (
    SparkSession.builder
    .config("spark.executor.memory", "1000g")
    .config("spark.driver.memory", "1000g")
    .config("spark.local.dir", "/mnt/vol1/tmp")  # scratch space for Spark's local files
    .getOrCreate()
)
sc = spark.sparkContext

In [4]:
# Spark UI URL, rewritten so that the host part reads "localhost"
# (whatever follows the host in the netloc, e.g. ":port", is kept).

_ui = urlsplit(sc.uiWebUrl)
sparkUrlParts = [
    _ui.scheme,
    re.sub('^[^:]*', 'localhost', _ui.netloc),  # replace everything before the first ':'
    _ui.path,
    _ui.query,
    _ui.fragment,
]
sparkUrl = urlunsplit(sparkUrlParts)

print(sparkUrl, sc.defaultParallelism)

http://localhost:4041 256


In [5]:
# Load the whole corpus from ./corpus as CSV, using the first line as the header.

df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("./corpus")
)

# Number of partitions Spark split the input into.
df.rdd.getNumPartitions()

253

In [6]:
# Data validation (disabled; uncomment to count rows with a missing title or text):
# print("Missing title", df.filter(df.title.isNull()).count(), "Missing text", df.filter(df.text.isNull()).count())

# Only the `text` column is used from here on, so drop the rest to keep memory use minimal.
df = df.select("text")

In [7]:
# Split the corpus text on punctuation (and spaces) to find all the sentences.
# `punc` is a regex character class listing the separators.
punc = "[ 　,.\u2000-\u206F\u3002\uff1f\uff01\uff0c\u3001\uff1b\uff1a\u201c\u201d\u2018\u2019\uff08\uff09\u300a\u300b\u3008\u3009\u3010\u3011\u300e\u300f\u300c\u300d\ufe43\ufe44\u3014\u3015\u2026\u2014\uff5e\ufe4f\uffe5\"']"

regexTokenizer = RegexTokenizer(
    inputCol="text",
    outputCol="sentence",
    pattern=punc,
    gaps=True,            # the pattern matches the separators, not the tokens
    minTokenLength=2,     # drop fragments shorter than two characters
    toLowercase=False,
)

# One row per sentence; explode() names the resulting column `col`.
# cache() returns the same DataFrame, so it can be chained onto the definition.
sentence_df = regexTokenizer.transform(df).select(explode("sentence")).cache()

# count() is an action, so it materializes the cached data before it is reused.
sentence_df.count()

sentence_df.show()

+---------------------------------------+
|                                    col|
+---------------------------------------+
|                               選舉快訊|
|                       左派鐵票排山倒海|
|                               陳太勢危|
|                                 本報訊|
|             今日舉行的立法會港島區補選|
|                               形勢緊湊|
|                       截至下午3:30為止|
|                                   有27|
|                            51%選民投票|
|  雖然今日上半天投票率高於上月舉行的...|
|      但較2004年立法會選舉同時間的投...|
|   部份為陳方安生拉票的泛民主派成員指出|
|                 上半日投票率略高於預期|
|         很大程度是左派製造的動員票所致|
|                   現時陳太形勢仍然危急|
|                 未投票的選民應盡快行動|
|                 對抗左派排山倒海的鐵票|
|前任政務司司長陳方安生今晨7時許步出家門|
|                           一身鮮紅衣裳|
|         在子女及公民黨黨魁余若薇陪同下|
+---------------------------------------+
only showing top 20 rows



In [10]:
# Write the sentences as header-less CSV, repartitioned into 5 partitions,
# into the "sentences" directory (relative to the current working directory).
# mode("overwrite") makes the cell safe to re-run: with the default save mode
# the write raises an error as soon as "sentences" already exists, so the
# notebook could not be executed twice without deleting the output by hand.
(
    sentence_df
    .repartition(5)
    .write
    .mode("overwrite")
    .option("header", False)
    .csv("sentences")
)