# Sentence Mining in a Serbian Corpus

Demonstration of how pyspark can load streaming XML data, lightly parse it, and filter the results.

We produce examples of the possessive pronouns of Serbian for several cases, allowing students to study all forms of their declension.

In [1]:
import pyspark
from pyspark.sql.types import StringType
import time
from itertools import islice
from tqdm.notebook import tqdm
import numpy as np
import codecs
import glob

The first 100,000 lines of the corpus are included in this repo (License: CC BY-SA 4.0)

Download additional data from https://www.clarin.si/repository/xmlui/handle/11356/1063

If you do, then to speed things up you can turn off processing of the entire multi-gigabyte corpus and instead look only at the first 100,000 lines by setting the following flag to `False`:

In [2]:
# True: read every matched corpus file in full.
# False: read only the first `chunk_size` lines of the first corpus file.
PROCESS_ENTIRE_CORPUS = True

In [3]:
# glob returns matches in arbitrary order; sort them so that corpus_files[0]
# (the file sampled when the whole corpus is not processed) is always the same.
corpus_files = sorted(glob.glob('data/srWaC*.xml'))
print(corpus_files)

# Settings for processing smaller chunk of data
num_of_th = 48
repartition_size = num_of_th * 4
# Number of lines taken from the first corpus file in sample mode
chunk_size = 100000

# Passed to quickparse as max_len: tokens beyond this count are not collected
MAX_WORDS_IN_SENTENCE = 6

['data/srWaC1.1.01.xml']


Start the pyspark cluster (locally)

In [4]:
from pyspark.sql import SparkSession
from pyspark import SparkContext


# Use the builder so this cell can be re-run: getOrCreate() hands back the
# already-running session instead of trying to construct a second
# SparkContext (which pyspark refuses to do within one process).
spark = SparkSession.builder.master("local[20]").getOrCreate()
# The context backing the session; used later for sc.parallelize(...)
sc = spark.sparkContext

22/04/02 13:54:59 WARN Utils: Your hostname, MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.0.102 instead (on interface en0)
22/04/02 13:54:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/02 13:54:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Create the rdd

In [None]:
%%time
# small_chunk records which of the two input shapes file_rdd has, so that the
# later cells know whether they are holding an RDD or a DataFrame.
small_chunk = not PROCESS_ENTIRE_CORPUS
if small_chunk:
    # Sample mode: pull the first chunk_size lines of the first file to the
    # driver, then redistribute them as an RDD.
    first_file_lines = spark.read.text(corpus_files[0], wholetext=False)
    file_chunk = first_file_lines.take(chunk_size)
    file_rdd = sc.parallelize(file_chunk)
else:
    # Full mode: one DataFrame row per line, across every corpus file.
    file_rdd = spark.read.text(corpus_files, wholetext=False)

In [None]:
# Parses the xml stream to accumulate lines corresponding to the same sentence
def quickparse(part, max_len=None):
    """Group the token lines of one partition into per-sentence records.

    Lines between a ``<s>`` line and the following ``</s>`` line are treated
    as tokens of one sentence; ``<g/>`` lines are ignored. Each token line is
    split on tabs into fields, and the sentence is emitted column-wise: the
    i-th output string is the i-th field of every token joined with ``|``.

    Parameters
    ----------
    part : iterable
        Records supporting ``record['value']``, each holding one line of text.
    max_len : int or None
        If given, stop collecting tokens for a sentence once this many have
        been read. The sentence is still emitted at ``</s>``, so longer
        sentences come out truncated to their first ``max_len`` tokens.

    Yields
    ------
    list of str
        One ``|``-joined string per token field. Sentences with fewer than
        three collected tokens, or whose tokens do not all have the same
        number of fields, are skipped.
    """
    collecting = False
    accumulator = []

    for record in part:
        line = record['value']

        if line == '<s>':
            collecting = True
            accumulator = []
            continue

        if line == '</s>':
            collecting = False
            if len(accumulator) < 3:
                continue
            # A token with a different number of tab-separated fields cannot
            # be transposed into columns; drop the sentence rather than
            # emitting misaligned columns or failing the whole job.
            if len({len(fields) for fields in accumulator}) != 1:
                continue
            # zip(*rows) transposes tokens x fields into fields x tokens
            yield ['|'.join(column) for column in zip(*accumulator)]
            continue

        if collecting and line != '<g/>':
            accumulator.append(line.split('\t'))
            if max_len is not None and len(accumulator) >= max_len:
                # Enough tokens: ignore the rest of this sentence
                collecting = False

Create a dataframe with columns for the different word forms: text, reformat, lemma, and language properties. Display the first 10 sentences that contain at least one possessive pronoun (in the nominative or accusative case).

In [None]:
# In sample mode file_rdd is already an RDD; otherwise it is a DataFrame and
# its underlying RDD has to be taken before mapPartitions can be used.
line_rdd = file_rdd if small_chunk else file_rdd.rdd
df = line_rdd.mapPartitions(
    lambda part: quickparse(part, max_len=MAX_WORDS_IN_SENTENCE)
).toDF(['text', 'reformat', 'lemma', 'feats'])


# Keep sentences with a tag of the form "Ps" + three characters + case n or a.
# The class is [na], not [n|a]: inside a character class "|" is a literal, and
# "|" is also the separator between tags in the feats column, so the old
# pattern could match across a tag boundary.
df1 = df.filter(df['feats'].rlike(r'Ps\w\w\w[na]')).cache()


df1.show(10)

## Sentence Generation

We want to collect many sentences for language practice. Let's enumerate the different cases and genders we want to consider. We will also filter the sentences for known bad words.

In [None]:
# The word list is kept rot13-encoded in the source (presumably so the words
# do not appear in clear text) and decoded once here.
bad_words_rot13 = [
    'cravf',
    'xhenp',
    'wrongv',
    'cvčxn',
    'xyvgbevf',
    'oenqnivpn',
    'qbwxr',
    'oybiwbo',
    'fabšnw',
    'bqwrovgv',
    'hzhxavgv',
]
bad_words = [codecs.decode(encoded, 'rot13') for encoded in bad_words_rot13]

In [None]:
# One filter per (lemma, case, gender) combination, all singular.
# Order: lemmas in the sequence below; within a lemma, accusative before
# nominative; within a case, masculine, neuter, feminine.
property_filters = [
    {'lemma': lemma, 'Case': case, 'Gender': gender, 'Number': 'Sing'}
    for lemma in ('njen', 'moj', 'tvoj', 'njegov', 'naš', 'vaš', 'njihov')
    for case in ('Acc', 'Nom')
    for gender in ('Masc', 'Neut', 'Fem')
]

In [None]:
# rebuild a readable sentence from a '|'-joined sentence record
def pprint(x):
    """Return the sentence text of record ``x`` as a single string.

    ``x['text']`` and ``x['feats']`` are split on ``|`` and walked in step.
    The first token is taken as is; every later token is preceded by a space
    unless its feature tag is ``'Z'`` (presumably punctuation), in which case
    it is attached directly to the preceding text. Tokens without a matching
    feature tag are dropped.
    """
    tokens = x['text'].split('|')
    tags = x['feats'].split('|')

    pieces = [tokens[0]]
    for token, tag in zip(tokens[1:], tags[1:]):
        pieces.append(token if tag == 'Z' else ' ' + token)

    return ''.join(pieces)

In [None]:
%%time 

TO_MINE = 20 # return at most this number of sentences

pbar = tqdm(property_filters)
for pf in pbar:
    print(pf)
    lemma = pf['lemma']
    # The tag uses the lower-cased first letter of each property value
    case = pf['Case'][0].lower()
    gender = pf['Gender'][0].lower()
    number = pf['Number'][0].lower()
    ffeat = r'Ps' + r'\w' + gender + number + case
    # The lemma column holds the sentence's lemmas joined by '|'. Anchor the
    # pattern on those separators so that e.g. 'naš' only matches a token
    # whose lemma is exactly 'naš', not any lemma that merely contains it.
    flemma = r'(^|\|)' + lemma + r'(\||$)'

    # NOTE: the two filters look at separate columns, so they are not
    # guaranteed to be satisfied by the same token of the sentence.
    df2 = df1.filter(df1['feats'].rlike(ffeat))
    df3 = df2.filter(df2['lemma'].rlike(flemma))

    taken = df3.take(TO_MINE)
    # dict.fromkeys drops duplicate rows while keeping the order in which
    # they were taken, so the printed output is stable between runs.
    for row in dict.fromkeys(taken):
        disallowed = any(b in row['lemma'] for b in bad_words)
        if not disallowed:
            print('\t', pprint(row))
        else:
            print('bad words found!', pprint(row))

Not every case/gender/number is present in the first 100,000 lines.