#### **Dependencies**
_____

In [9]:
import re
import json
import csv

from pyspark import SparkConf, SparkContext

#### **Constants**
_____

In [2]:
DATASET  = 'resources/sample2.txt'
# A word is a run of ASCII letters, digits, '-', '_' or apostrophes, optionally
# wrapped in single quotes. The hyphen is listed first in the class so it is a
# literal rather than a range operator, and the upper-case range is A-Z (a range
# of A-z would also admit the punctuation characters [ \ ] ^ and backtick).
REG_EXPR = "'?([-_a-zA-Z0-9']+)'?"

#### **Create Spark Session**
_____

In [3]:
# Single-process local master; the app name is what shows up in the Spark UI/logs.
conf = SparkConf().setMaster('local').setAppName('Assignment 2')

# getOrCreate returns the already-running context if there is one, so executing
# this cell a second time does not fail with "Cannot run multiple SparkContexts
# at once". Note: when a context already exists, `conf` is not re-applied to it.
sc = SparkContext.getOrCreate(conf=conf)

#### **Load input file**
____

In [4]:
# Read DATASET through the SparkContext; later cells treat each element of this
# RDD as one line of text.
rdd = sc.textFile(DATASET)

#### **Word Split Function**
____

In [5]:
def split_word(content):
    """Return every substring of ``content`` that matches ``REG_EXPR``.

    Matches are returned in the order they occur. Each item is the full text
    of the match (group 0), not only the pattern's captured group. An empty
    list is returned when nothing matches.
    """
    word_regex = re.compile(REG_EXPR)
    return [hit.group(0) for hit in word_regex.finditer(content)]

#### **Count word occurrences and representations**
____

In [6]:
# stats maps a lower-cased word to a record holding:
#   "occurrences"     - how many times the word was seen (case-insensitive)
#   "representations" - the distinct spellings seen, in first-seen order
stats = {}
total_words = 0

non_blank_lines = rdd.filter(lambda line: len(line.strip()) > 0)

# Tokenise on the cluster, then tally on the driver.
for token in non_blank_lines.flatMap(split_word).collect():
    total_words += 1

    key = token.strip().lower()
    entry = stats.get(key)

    if entry is None:
        # First sighting of this word: start its record.
        stats[key] = {
            "occurrences"     : 1,
            "representations" : [token],
        }
        continue

    entry['occurrences'] += 1
    if token not in entry['representations']:
        entry['representations'].append(token)


#### **Frequency Analysis**
____

In [7]:
# Add each word's share of all counted tokens to its record, then dump the
# whole table as indented JSON.
for entry in stats.values():
    share = entry['occurrences'] / total_words
    entry['percentage'] = float(share)

report = json.dumps(stats, indent=4)
print(report)

{
    "word": {
        "occurrences": 4,
        "representations": [
            "word",
            "Word",
            "WoRd"
        ],
        "percentage": 0.8
    },
    "apple": {
        "occurrences": 1,
        "representations": [
            "apple"
        ],
        "percentage": 0.2
    }
}


#### **Export Results as a CSV file**
____

In [37]:
# Column order of the exported file.
field_names = ['word', 'percentage', 'occurrences', 'representations']

# newline='' hands line-ending control to the csv module; without it, platforms
# that translate '\n' on write end up with an empty line after every row.
with open("output.csv", "w", newline='') as f:
    writer = csv.DictWriter(f, fieldnames=field_names)

    writer.writeheader()

    for k, v in stats.items():
        row = {
            'word'            : k,
            'percentage'      : v['percentage'],
            'occurrences'     : v['occurrences'],
            # Flatten the list of spellings into one comma-separated cell;
            # the csv writer quotes the cell when it contains commas.
            'representations' : ",".join(v['representations'])
        }

        writer.writerow(row)
    
    
    

#### **Shutdown SparkSession**
____

In [42]:
# Stop the SparkContext created earlier in this notebook.
sc.stop()