#### **Dependencies**
_____

In [1]:
import re
import json
import csv

from pyspark import SparkConf, SparkContext
from collections import Counter

#### **Constants**
_____

In [2]:
DATASET  = '../resources/sample2.txt'
REG_EXPR = "'?([_-a-zA-z0-9']+)'?"

#### **Create Spark Session**
_____

In [3]:
conf = SparkConf().setMaster('local').setAppName('Assignment 2')
sc = SparkContext(conf=conf)

#### **Load input file**
____

In [4]:
rdd = sc.textFile(DATASET)

#### **Mapping Functions**
____

In [5]:
def split_word(content):
    pattern = re.compile(r'{}'.format(REG_EXPR))
    matches = pattern.finditer(content)
    words   = []
    
    for match in matches:
        words.append(match.group(0))
    
    return words

#### **Count word occurrences and representations**
____

In [55]:
#for word in rdd.flatMap(split_word).collect():
#    print (word)
#    break
#rdd.collect()

#rdd.flatMap(split_word).collect()

#print (Counter(rdd.flatMap(split_word).take(1)))

#print(Counter("Adam Acosta"))

#print(type(rdd.flatMap(split_word).take(1)))

#print(Counter("Adam Acosta abcd"))



### Solution
#words = rdd.flatMap(split_word)
#total_words = words.count()
#words.map(lambda item: (item.strip().lower(), item)).countByKey()
#words.map(lambda item: (item.strip().lower(), item)).reduceByKey(lambda k,v: k+","+v).collect()



5

In [6]:
# filter all words inside the document
words = rdd.flatMap(split_word)
total_words = words.count()


# foreach word
for record in words.map(lambda item: (item.strip().lower(), item)).reduceByKey(lambda k,v: k+","+v).collect():
    #print(record)
    print(f"Key    : {record[0]}")
    print(f"Values : {record[1]}")
    print(f"Size :  {len(record[1].split(','))}")
    print(f"Percentage : {len(record[1].split(','))/total_words}")
    print(f"Type of value : {type(record[1])}")
    print(f"Set of value : {set(record[1].split(','))}")
    print("")



Key    : word
Values : word,Word,word,WoRd,word,word,word
Size :  7
Percentage : 0.875
Type of value : <class 'str'>
Set of value : {'Word', 'word', 'WoRd'}

Key    : apple
Values : apple
Size :  1
Percentage : 0.125
Type of value : <class 'str'>
Set of value : {'apple'}



In [7]:
words.flatMap(lambda word: word).map(lambda char: (char.lower(), char)).reduceByKey(lambda k,v: k+","+v).collect()

[('w', 'w,W,w,W,w,w,w'),
 ('o', 'o,o,o,o,o,o,o'),
 ('r', 'r,r,r,R,r,r,r'),
 ('d', 'd,d,d,d,d,d,d'),
 ('a', 'a'),
 ('p', 'p,p'),
 ('l', 'l'),
 ('e', 'e')]

In [10]:
words.flatMap(lambda word: word).count()

33

In [37]:
stats = {}
total_words = 0

#for words in rdd.filter(lambda line: len(line.strip()) > 0).flatMap(split_word).collect():
for words in rdd.flatMap(split_word).collect():
    #for word in words:
        
    total_words = total_words + 1

    word_key = words.strip().lower()

    if word_key in stats:
        stats[word_key]['occurrences'] = stats[word_key]['occurrences'] + 1

        if not words in stats[word_key]['representations']:
            stats[word_key]['representations'].append(words)

    else:
        representations = []            
        representations.append(words) 

        record = {
            "occurrences"     : 1, 
            "representations" : representations, 
        }

        stats[word_key] = record


#### **Frequency Analysis**
____

In [38]:
for word in stats:
    stats[word]['percentage'] = float(stats[word]['occurrences'] / total_words)

print(json.dumps(stats,indent=4))  

{
    "this": {
        "occurrences": 4,
        "representations": [
            "This"
        ],
        "percentage": 0.15384615384615385
    },
    "is": {
        "occurrences": 5,
        "representations": [
            "is"
        ],
        "percentage": 0.19230769230769232
    },
    "line": {
        "occurrences": 5,
        "representations": [
            "line"
        ],
        "percentage": 0.19230769230769232
    },
    "1": {
        "occurrences": 1,
        "representations": [
            "1"
        ],
        "percentage": 0.038461538461538464
    },
    "2": {
        "occurrences": 1,
        "representations": [
            "2"
        ],
        "percentage": 0.038461538461538464
    },
    "3": {
        "occurrences": 1,
        "representations": [
            "3"
        ],
        "percentage": 0.038461538461538464
    },
    "5": {
        "occurrences": 1,
        "representations": [
            "5"
        ],
        "percentage": 0.038461538461

#### **Export Results as a CSV file**
____

In [37]:
field_names = ['word', 'percentage', 'occurrences', 'representations']

with open("output.csv", "w") as f:
    writer = csv.DictWriter(f, fieldnames=field_names)
    
    writer.writeheader()
    
    for k,v in stats.items():
        row = {
            'word'            : k,
            'percentage'      : v['percentage'],
            'occurrences'     : v['occurrences'],
            'representations' : ",".join(v['representations'])
        }
        #{'occurrences': 4, 'representations': ['word', 'Word', 'WoRd'], 'percentage': 0.8}
        #print(v)
        
        #row = {'word' : k}
        #row.update(v)
        
        writer.writerow(row)
    
    
    

#### **Shutdown SparkSession**
____

In [42]:
sc.stop()

In [11]:
sample_dict = {
    'name' : 'some name',
    'age' : 100
}

In [13]:
print(dir(sample_dict))

['__class__', '__contains__', '__delattr__', '__delitem__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', 'clear', 'copy', 'fromkeys', 'get', 'items', 'keys', 'pop', 'popitem', 'setdefault', 'update', 'values']


In [17]:
print(sample_dict.__dir__())

['__repr__', '__hash__', '__getattribute__', '__lt__', '__le__', '__eq__', '__ne__', '__gt__', '__ge__', '__iter__', '__init__', '__len__', '__getitem__', '__setitem__', '__delitem__', '__contains__', '__new__', '__sizeof__', 'get', 'setdefault', 'pop', 'popitem', 'keys', 'items', 'values', 'update', 'fromkeys', 'clear', 'copy', '__doc__', '__str__', '__setattr__', '__delattr__', '__reduce_ex__', '__reduce__', '__subclasshook__', '__init_subclass__', '__format__', '__dir__', '__class__']


In [19]:
file_name="/tmp/result.txt"

file_name.split(".")

['/tmp/result', 'txt']

In [27]:
"_word_stats_".join(list(file_name.split(".")[0]))

'/_word_stats_t_word_stats_m_word_stats_p_word_stats_/_word_stats_r_word_stats_e_word_stats_s_word_stats_u_word_stats_l_word_stats_t'

In [25]:
file_name.split(".")

'/tmp/result'

In [30]:
file_name.split(".")[0].concat("_word_stats")

AttributeError: 'str' object has no attribute 'concat'

SyntaxError: invalid syntax (<ipython-input-32-c46f1db6c875>, line 1)

In [40]:
"".join(file_name.split(".")[0], "_word_stats.", file_name.split(".")[1])

TypeError: join() takes exactly one argument (3 given)