In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import *

import pandas as pd

from collections import Counter
from itertools import islice
from os import chdir
import re
from urllib.parse import urlsplit, urlunsplit

import math

In [2]:
# Spark session sized for a very large host: the values below were tuned on a
# machine with 256 cores and 1TB of memory, and the author ran Spark in local
# mode (no master is set explicitly here).
# On a smaller machine, lower spark.executor.memory and spark.driver.memory, and
# raise spark.default.parallelism and spark.sql.shuffle.partitions so that each
# task needs less memory.

spark_settings = {
    "spark.executor.memory": "1000g",
    "spark.driver.memory": "1000g",
    "spark.local.dir": "/mnt/vol1/tmp",  # scratch space for shuffle/spill files
}

spark_builder = SparkSession.builder
for conf_key, conf_value in spark_settings.items():
    spark_builder = spark_builder.config(conf_key, conf_value)

spark = spark_builder.getOrCreate()
sc = spark.sparkContext

In [3]:
# Spark UI address, rewritten so the host part reads "localhost" while the port
# (everything from the first ':' onwards) is left untouched.

ui_scheme, ui_netloc, *ui_rest = urlsplit(sc.uiWebUrl)
sparkUrlParts = [ui_scheme, re.sub('^[^:]*', 'localhost', ui_netloc), *ui_rest]
sparkUrl = urlunsplit(sparkUrlParts)

print(sparkUrl, sc.defaultParallelism)

http://localhost:4040 256


In [4]:
# Load every CSV file under ../corpus into one DataFrame, treating the first
# line of each file as the header.

corpus_reader = spark.read.format("csv").option("header", "true")
df = corpus_reader.load("../corpus")

# Display how many partitions the input was split into.
df.rdd.getNumPartitions()

253

In [5]:
# Optional data validation (disabled): report how many rows lack a title or a text.
# print("Missing title", df.filter(df.title.isNull()).count(), "Missing text", df.filter(df.text.isNull()).count())

# Only the text column is used from here on; dropping the rest keeps memory use low.
text_col = df.text
df = df.select(text_col)

In [6]:
# Tokenize the corpus into single characters.
# With gaps=False the pattern describes the tokens themselves (not the
# separators), so "." yields one token per matched character; case is preserved.

regexTokenizer = RegexTokenizer(
    inputCol="text",
    outputCol="char",
    pattern=".",
    gaps=False,
    minTokenLength=1,
    toLowercase=False,
)

# cache() returns the same DataFrame, so it can be chained onto the projection.
char_df = regexTokenizer.transform(df).select("char").cache()

# count() is an action: it runs the tokenization. Note that it returns the
# number of rows in char_df.
char_count = char_df.count()
# char_df.show()

In [32]:
# Minimum occurrence count an n-gram needs in order to be kept.
# 10 ** (log10(x) / 2) is mathematically the square root of x.
# NOTE(review): char_count comes from char_df.count(), i.e. it is the number of
# rows in char_df rather than a total number of characters -- confirm that this
# is the intended base for the threshold.
half_log10 = math.log(char_count, 10) / 2
char_threshold = math.pow(10, half_log10)
char_threshold

1443.8739557177414

In [33]:
# Regex character class used by gen_ngram (defined below): any n-gram containing
# one of these characters is discarded before counting.
# The class lists the ranges \u0000-\u0019, \u0021-\u00FF and \u2000-\u206F,
# followed by a list of individual code points and the two ASCII quote characters.
# U+0020 (space) is not part of any range -- presumably because the n-gram strings
# are space-joined, so matching a space would reject every n-gram with n > 1; confirm.
# NOTE(review): the first range ends at \u0019 (hex), so U+001A-U+001F are not
# covered; confirm whether \u001F was intended.
char_blacklist = "[\u0000-\u0019\u0021-\u00FF\u2000-\u206F\u3002\uff1f\uff01\uff0c\u3001\uff1b\uff1a\u201c\u201d\u2018\u2019\uff08\uff09\u300a\u300b\u3008\u3009\u3010\u3011\u300e\u300f\u300c\u300d\ufe43\ufe44\u3014\u3015\u2026\u2014\uff5e\ufe4f\uffe5\"']"
# NOTE(review): the original note here read "del" followed by an ideographic space
# (U+3000) -- presumably that character was deleted from the blacklist; confirm.
def gen_ngram(n):
    """Count the character n-grams of length ``n`` found in ``char_df``.

    Reads the notebook globals ``char_df`` (rows holding a ``char`` token
    array), ``char_blacklist`` (regex of disallowed characters) and
    ``char_threshold`` (minimum occurrence count).

    Args:
        n: Number of consecutive character tokens per n-gram.

    Returns:
        A DataFrame with columns ``ngrams`` (the n-gram as produced by
        ``NGram``, i.e. its tokens joined with spaces) and ``count``, restricted
        to n-grams that contain no blacklisted character and that occur at
        least ``char_threshold`` times. Nothing is computed until an action is
        run on the result.
    """
    counts = (
        NGram(n=n, inputCol="char", outputCol="ngrams_list")
        .transform(char_df)
        # One output row per n-gram occurrence.
        .select(explode('ngrams_list').alias('ngrams'))
        # Drop n-grams that contain any blacklisted character.
        .where(~col('ngrams').rlike(char_blacklist))
        .groupBy('ngrams')
        .count()
    )

    # Keep only the n-grams that are frequent enough.
    return counts.where(counts['count'] >= char_threshold)

In [34]:
# Generate n-grams for every length from 1 to ngram_max_len and stack the
# per-length count tables into a single DataFrame.

ngram_result_df = None
ngram_max_len = 6

for n in range(1, ngram_max_len + 1):
    print("Generating " + str(n) + "-ngram...")
    ngram_n_df = gen_ngram(n)
    # Identity test: `is None` is the idiomatic check and does not depend on how
    # the DataFrame class implements `==`.
    if ngram_result_df is None:
        ngram_result_df = ngram_n_df
    else:
        ngram_result_df = ngram_result_df.unionByName(ngram_n_df)

# ngram_result_df = ngram_result_df.orderBy(col("count").desc())
# Mark the combined result for caching; it is reused by the following cells.
ngram_result_df.cache()

Generating 1-ngram...
Generating 2-ngram...
Generating 3-ngram...
Generating 4-ngram...
Generating 5-ngram...
Generating 6-ngram...


DataFrame[ngrams: string, count: bigint]

In [35]:
# Start the Spark calculation: count() is an action, so it forces the n-gram
# pipeline built above to run. The displayed value is the number of
# distinct n-grams that were kept.
ngram_result_df.count()

128554

In [36]:
# Collect the result into Python land for processing: collect() brings every
# row of ngram_result_df back to the driver as a list of Row objects.
ngram_result_rows = ngram_result_df.collect()

In [37]:
# Turn the collected rows into a Counter mapping n-gram -> occurrence count.
# The empty string '' accumulates the sum of all single-character counts.

ngram_result_dict = Counter()

for row in ngram_result_rows:
    # NGram joins the single-character tokens with one space each, so the
    # characters themselves sit at the even positions of the string.
    ngrams_str = row.ngrams[::2]

    # Skip n-grams that contain a space ...
    if ' ' in ngrams_str:
        continue
    # ... or any character whose code point is below 256.
    if any(ord(ch) < 256 for ch in ngrams_str):
        continue

    occurrences = row['count']
    ngram_result_dict[ngrams_str] = occurrences

    # For a single character the parent prefix is the empty string; add the
    # count to that running total.
    if len(ngrams_str) == 1:
        ngram_result_dict[''] += occurrences

In [79]:
# Calculate frequency from count.
#
#   ngram_prob_dict[s]       -- conditional frequency: count(s) / count(s[:-1])
#   ngram_total_prob_dict[s] -- product of the conditional frequencies along the
#                               prefix chain of s (the empty string has value 1)

# An n-gram whose conditional frequency is below this is dropped; single
# characters are always kept.
MIN_CONDITIONAL_FREQ = 0.00002
# Absolute tolerance within which an n-gram and its suffix are considered to
# have the same total probability.
SUFFIX_PROB_TOLERANCE = 1e-6

ngram_prob_dict = dict()
ngram_total_prob_dict = dict()
ngram_total_prob_dict[''] = 1

# Visit shorter n-grams first: a child can only be scored once its parent prefix
# has a total probability. sorted() is stable, so n-grams of equal length keep
# their original relative order.
for ngram_str in sorted(ngram_result_dict, key=len):
    if ngram_str == '':
        continue
    parent_str = ngram_str[:-1]
    parent_freq = ngram_result_dict[parent_str]
    parent_total_prob = ngram_total_prob_dict.get(parent_str, 0)
    # Skip n-grams whose parent is unknown or was itself dropped.
    if parent_freq == 0 or parent_total_prob == 0:
        continue
    freq = ngram_result_dict[ngram_str] / parent_freq

    if freq >= MIN_CONDITIONAL_FREQ or len(ngram_str) == 1:
        ngram_prob_dict[ngram_str] = freq
        ngram_total_prob_dict[ngram_str] = parent_total_prob * freq

# Remove suffixes: when an n-gram of length >= 3 has (within tolerance) the same
# total probability as the n-gram obtained by dropping its first character, that
# shorter n-gram is removed from ngram_prob_dict. ngram_total_prob_dict is left
# untouched.
# The counter is deliberately not called `count`, which would overwrite the
# `count` function brought in by `from pyspark.sql.functions import *`.
removed_suffix_count = 0
for ngram_str, ngram_prob in ngram_total_prob_dict.items():
    if len(ngram_str) <= 2:
        continue
    ngram_suffix = ngram_str[1:]
    suffix_prob = ngram_total_prob_dict.get(ngram_suffix, 0)
    if abs(ngram_prob - suffix_prob) <= SUFFIX_PROB_TOLERANCE:
        # pop() instead of del: the same suffix can be matched by several longer
        # n-grams, or may never have been kept, and del would raise KeyError.
        if ngram_prob_dict.pop(ngram_suffix, None) is not None:
            removed_suffix_count += 1

# print(removed_suffix_count)

ngram_prob_dict[''] = 1

14348


In [80]:
# Dump the result into a csv file with the columns
#   ngram      -- the n-gram itself
#   freq       -- its conditional frequency (ngram_prob_dict)
#   total_freq -- its total probability (ngram_total_prob_dict)
# The empty-string root entry is not written.

output_path = "6gram-prob-nosym-lite.csv"
# The n-grams are non-ASCII text, so the encoding is fixed explicitly instead of
# relying on the platform default.
with open(output_path, "w", encoding="utf-8") as f:
    f.write("ngram,freq,total_freq\n")
    for ngram_str, freq in ngram_prob_dict.items():
        if ngram_str == '':
            continue
        f.write(",".join((ngram_str, str(freq), str(ngram_total_prob_dict[ngram_str]))) + "\n")