In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import *

import pandas as pd

from collections import Counter
from itertools import islice
from os import chdir
import re
from urllib.parse import urlsplit, urlunsplit

In [3]:
# Spark session sized for a very large single machine (the author reports
# 256 cores / 1TB memory). On a smaller machine, lower spark.executor.memory
# and spark.driver.memory, and raise spark.default.parallelism and
# spark.sql.shuffle.partitions to reduce the memory demand.
# NOTE(review): no master is set here, so running in local mode relies on the
# environment's default master -- confirm.

spark = (
    SparkSession.builder
    .config("spark.executor.memory", "1000g")
    .config("spark.driver.memory", "1000g")
    .config("spark.local.dir", "/mnt/vol1/tmp")  # scratch space for shuffle/spill files
    .getOrCreate()
)
sc = spark.sparkContext

In [4]:
# Spark UI url, with the host part swapped for "localhost" (any ":port" suffix
# is kept) so the printed link can be opened on this machine.

sparkUrlParts = [
    # Element 1 of the split result is the network location (host[:port]).
    re.sub('^[^:]*', 'localhost', part) if index == 1 else part
    for index, part in enumerate(urlsplit(sc.uiWebUrl))
]
sparkUrl = urlunsplit(sparkUrlParts)

print(sparkUrl, sc.defaultParallelism)

http://localhost:4040 256


In [5]:
# Load every csv file under ../corpus into one DataFrame; the first line of
# each file is treated as the header.

df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("../corpus")
)

# Displayed as the cell output: number of partitions the input was split into.
df.rdd.getNumPartitions()

253

In [6]:
# Optional data validation (disabled):
# print("Missing title", df.filter(df.title.isNull()).count(), "Missing text", df.filter(df.text.isNull()).count())

# Only the `text` column is used downstream; drop everything else to keep
# minimal data in memory.
df = df.select("text")

In [7]:
# Tokenize the corpus into single characters: with gaps=False the pattern
# describes the tokens themselves, and "." matches one character at a time.
# NOTE(review): presumably "." does not match line terminators here, so
# newlines are dropped from the token stream -- confirm.

regexTokenizer = RegexTokenizer(
    inputCol="text",
    outputCol="char",
    pattern=".",
    gaps=False,
    minTokenLength=1,
    toLowercase=False,
)
char_df = regexTokenizer.transform(df).select("char")

# Every gen_ngram(n) call reads char_df again, so mark it for caching.
char_df.cache()
# char_df.show()

DataFrame[char: array<string>]

In [8]:
# Helper to generate and count all character ngrams of a given length.

# Regex character class: any ngram containing one of these characters is dropped.
# It covers control characters, ASCII/Latin-1 symbols, letters and digits
# (U+0021-U+00FF), general punctuation (U+2000-U+206F) and common CJK /
# full-width punctuation. The space U+0020 is deliberately NOT listed, because
# NGram joins the single-character tokens with spaces.
# The first range ends at U+001F (not U+0019) so that the control characters
# U+001A-U+001F are rejected as well.
char_blacklist = "[\u0000-\u001F\u0021-\u00FF\u2000-\u206F\u3002\uff1f\uff01\uff0c\u3001\uff1b\uff1a\u201c\u201d\u2018\u2019\uff08\uff09\u300a\u300b\u3008\u3009\u3010\u3011\u300e\u300f\u300c\u300d\ufe43\ufe44\u3014\u3015\u2026\u2014\uff5e\ufe4f\uffe5\"']"

def gen_ngram(n, min_count=10):
    """Count every character ngram of length ``n`` in ``char_df``.

    Args:
        n: Number of characters per ngram.
        min_count: Minimum number of occurrences an ngram needs to be kept.
            Only applied when ``n > 1``; single characters are always kept.

    Returns:
        A DataFrame with columns ``ngrams`` (the ngram, its characters
        separated by spaces) and ``count`` (number of occurrences). Nothing is
        computed or written here; the result is a lazy Spark plan.
    """
    ngram_gen = NGram(n=n, inputCol="char", outputCol="ngrams_list")
    ngram_df = (
        ngram_gen.transform(char_df)
        # One row per ngram occurrence.
        .select(explode('ngrams_list').alias('ngrams'))
        # Remove ngrams with blacklisted chars before the (expensive) aggregation.
        .filter(~col('ngrams').rlike(char_blacklist))
        .groupBy('ngrams')
        .count()
    )

    if n > 1:
        # Rare multi-character ngrams are dropped to keep the result small.
        ngram_df = ngram_df.filter(ngram_df['count'] >= min_count)

    return ngram_df

In [9]:
# Generate ngrams of every length from 1 to ngram_max_len and union the
# per-length counts into a single DataFrame, most frequent first.

ngram_result_df = None
ngram_max_len = 16

for n in range(1, ngram_max_len + 1):
    print("Generating " + str(n) + "-ngram...")
    ngram_n_df = gen_ngram(n)
    # Identity test: `== None` would go through the object's equality
    # operator, which is not a reliable "is it unset" check.
    if ngram_result_df is None:
        ngram_result_df = ngram_n_df
    else:
        ngram_result_df = ngram_result_df.unionByName(ngram_n_df)

ngram_result_df = ngram_result_df.orderBy(col("count").desc())
# Mark the result for caching; it is used by more than one later cell.
ngram_result_df.cache()

Generating 1-ngram...
Generating 2-ngram...
Generating 3-ngram...
Generating 4-ngram...
Generating 5-ngram...
Generating 6-ngram...
Generating 7-ngram...
Generating 8-ngram...
Generating 9-ngram...
Generating 10-ngram...
Generating 11-ngram...
Generating 12-ngram...
Generating 13-ngram...
Generating 14-ngram...
Generating 15-ngram...
Generating 16-ngram...


DataFrame[ngrams: string, count: bigint]

In [10]:
# Start the Spark calculation.
# The cells above only define the plan; count() is the first call that makes
# Spark actually run it. The value shown is the number of distinct ngrams kept.
ngram_result_df.count()

21757817

In [11]:
# Collect the result into Pythonland for processing.
# collect() returns every row as a Python object on the driver, so the whole
# result (about 21.7 million rows in the recorded run) must fit in driver memory.
ngram_result_rows = ngram_result_df.collect()

In [12]:
# Build a Counter mapping each ngram (as a plain string) to its count.

ngram_result_dict = Counter()

for row in ngram_result_rows:
    # NGram puts a space between the single-character tokens, so the real
    # characters sit at the even positions.
    compact = row["ngrams"][::2]
    # A space left after compaction means one of the tokens itself was a
    # space; such ngrams are not wanted.
    if ' ' not in compact:
        ngram_result_dict[compact] = row['count']

# Every ngram, shortest first (stable sort, so ties keep insertion order).
all_ngrams = sorted(ngram_result_dict, key=len)

In [18]:
# The ngram counts contain many redundant substrings, e.g. for the string ABC, its substrings AB and BC are also present.
# Remove such a substring when it occurs less than 1.5 times as often as the longer ngram, i.e. when it mostly appears as part of that ngram.

def window(seq, n=2):
    """Yield every run of ``n`` consecutive items of ``seq``, joined into a string.

        s -> s0+s1+...+s[n-1], s1+s2+...+sn, ...

    ``seq`` may be any iterable of strings. Nothing is yielded when it holds
    fewer than ``n`` items.

    NOTE(review): this name also exists in pyspark.sql.functions, which the
    notebook wildcard-imports; this definition replaces it from here on.
    """
    source = iter(seq)
    current = tuple(islice(source, n))
    if len(current) != n:
        # Input ran out before the first full window could be filled.
        return
    yield "".join(current)
    for item in source:
        # Slide by one: drop the oldest item, append the newest.
        current = (*current[1:], item)
        yield "".join(current)
        
# Walk the ngrams shortest first. For each one of length >= 3 that is still
# present, look at its two substrings that are one character shorter.
for ngram in all_ngrams:
    ngram_count = ngram_result_dict.get(ngram, 0)
    # Skip ngrams already pruned (count 0) and ones too short to prune with.
    if ngram_count <= 0 or len(ngram) < 3:
        continue

    for piece in window(ngram, len(ngram) - 1):
        piece_count = ngram_result_dict.get(piece, 0)
        # A piece that is absent (or already removed) has nothing to delete.
        # Otherwise drop it when it is less than 1.5x as frequent as the
        # longer ngram containing it.
        if piece_count != 0 and piece_count / ngram_count < 1.5:
            del ngram_result_dict[piece]


In [19]:
# Dump the result into a csv file, most frequent ngram first.

output_path = "apple_ngram_1_16.csv"
# The ngrams contain non-ASCII characters, so the encoding is fixed to UTF-8
# instead of depending on the platform's default locale encoding.
with open(output_path, "w", encoding="utf-8") as f:
    f.write("ngram,count\n")
    for (k, v) in ngram_result_dict.most_common():
        # Skip ngrams seen fewer than 100 times, and the ngrams of maximum
        # length (no longer ngram exists to prune them against).
        if v < 100 or len(k) == ngram_max_len: continue
        f.write(k + "," + str(v) + "\n")