In [4]:
import pyspark.sql.functions as func
from pyspark.sql.types import StringType, DoubleType, ArrayType
import string
import re
import math

### Get sample data

In [None]:
%%time
df = spark.read.format('json').load('hdfs://orion11:11001/reddit/*/*')
print(df.count())

In [None]:
# Draw a sample without replacement using fraction 0.1 and persist it to HDFS as JSON.
# The fixed seed makes the sample reproducible when the notebook is re-run.
sampleDF = df.sample(False, 0.1, seed=42)
sampleDF.write.format('json').save('hdfs://orion11:11001/sampled_reddit')

In [1]:
%%time
sampleDF = sqlContext.read.json("hdfs://orion11:11001/sampled_reddit/*")
sampleDF.cache()
print(sampleDF.count())

309199315
CPU times: user 82.6 ms, sys: 41.7 ms, total: 124 ms
Wall time: 7min 58s


In [None]:
# Sample the first sample again (fraction 0.1, without replacement) and persist the result.
# The fixed seed makes the sample reproducible when the notebook is re-run.
sampleDFv2 = sampleDF.sample(False, 0.1, seed=42)
sampleDFv2.write.format('json').save('hdfs://orion11:11001/sampled_reddit_v2')

In [1]:
%%time
sampleDFv2 = spark.read.format('json').load('hdfs://orion11:11001/sampled_reddit_v2/*')
sampleDFv2.cache()
print(sampleDFv2.count())

30907764
CPU times: user 14.3 ms, sys: 6.51 ms, total: 20.9 ms
Wall time: 1min 15s


In [2]:
# Sample the second sample again (fraction 0.1, without replacement) and persist the result.
# The fixed seed makes the sample reproducible when the notebook is re-run.
sampleDFv3 = sampleDFv2.sample(False, 0.1, seed=42)
sampleDFv3.write.format('json').save('hdfs://orion11:11001/sampled_reddit_v3')

In [2]:
%%time
sampleDFv3 = spark.read.format('json').load('hdfs://orion11:11001/sampled_reddit_v3/*')
sampleDFv3.cache()
print(sampleDFv3.count())

3090865
CPU times: user 3.01 ms, sys: 2.14 ms, total: 5.15 ms
Wall time: 13.9 s


### Filter out comments generated by bots and comments that were deleted

In [2]:
%%time
botExpr = "[bB][oO][tT]"

filteredComment = (sampleDF
                   .filter(~(sampleDF.body.like("[deleted]") 
                             | sampleDF.body.like('[removed]') 
                             | sampleDF.author.rlike(botExpr) 
                             | sampleDF.author.like("[deleted]")
                            )
                          )
                  )
sampleDF.unpersist()
filteredComment.cache()
print(filteredComment.count())

286117107
CPU times: user 48.6 ms, sys: 27.5 ms, total: 76.1 ms
Wall time: 4min 45s


### Look for comments that might contain a country name
To guess whether a comment may mention a country, we check if the comment contains the substring __*m from*__ or __*come from*__, for example:   
* "I'm from Vietnam"  
* "I come from Turkey"  

We then check whether the word right after the keyword is a country name by using the __pycountry__ library. If it is a country, we add it to a set to make sure that we do not generate duplicates for that user.  
After that, we select only the users who stated their country and keep them in a DataFrame to use later.

In [37]:
import re
import pycountry

def preProcessBody(text):
    """Normalize a comment body for keyword matching.

    Lowercases ``text``, replaces every run of characters other than ASCII
    letters with a single space, and trims leading/trailing whitespace.
    """
    lettersOnly = re.sub("[^A-Za-z]+", " ", text.lower())
    return lettersOnly.strip()

def getCountryKeyWord(text):
    """Return the country names that follow "m from" / "come from" in a comment.

    The text is normalized with ``preProcessBody``, split on the keywords, and
    the first word after each keyword is looked up in ``pycountry`` by ``name``
    and by ``official_name``. Only that single word is checked, so multi-word
    country names are not detected.

    Args:
        text: raw comment body, or None.

    Returns:
        A list of the distinct matching words (lowercase, in unspecified
        order); an empty list when nothing matches or ``text`` is None.
    """
    # A null body (possible when this runs as a UDF on a nullable column)
    # has nothing to match; return early instead of raising.
    if text is None:
        return []

    text = preProcessBody(text)
    pattern = re.compile('(m from)|(come from)')
    # Because the pattern has capturing groups, split() keeps the matched
    # keywords in its output (plus None/'' placeholders, dropped here), so
    # each keyword is immediately followed by the text that came after it.
    pieces = [piece for piece in pattern.split(text) if piece]

    result = set()
    for keyword, following in zip(pieces, pieces[1:]):
        if not pattern.match(keyword):
            continue
        words = following.split()
        if not words:
            continue
        firstWord = words[0]
        candidate = firstWord.capitalize()
        if (pycountry.countries.get(name=candidate)
                or pycountry.countries.get(official_name=candidate)):
            result.add(firstWord)
    return list(result)
            
# Spark UDF wrapper around getCountryKeyWord; declared to return an array of strings.
getCountryKeyWordUdf = func.udf(getCountryKeyWord, ArrayType(StringType()))
        

In [51]:
%%time
authorCountries = ((filteredComment.withColumn("countries", getCountryKeyWordUdf(filteredComment.body)))
 .filter(func.size(func.col('countries'))>0)
 .select('author','countries'))

CPU times: user 4.09 ms, sys: 222 µs, total: 4.31 ms
Wall time: 36.6 ms


In [58]:
def mergeCountries(countriesList):
    """Flatten a list of country lists into a list of distinct country names.

    The order of the returned list is unspecified.
    """
    uniqueCountries = {country for countries in countriesList for country in countries}
    return list(uniqueCountries)

# Spark UDF wrapper around mergeCountries; declared to return an array of strings.
mergeCountriesUdf = func.udf(mergeCountries, ArrayType(StringType()))

In [59]:
# One row per author holding the de-duplicated union of every country list
# collected for that author.
# The explicit alias replaces the auto-generated column name
# "mergeCountries(collect_list(countries, 0, 0))" and keeps a `countries`
# column on the result, so this self-reassigning cell can be re-run.
authorCountries = (authorCountries.groupBy('author')
                   .agg(mergeCountriesUdf(func.collect_list(authorCountries.countries))
                        .alias('countries')))
authorCountries.show()

+-----------------+---------------------------------------------+
|           author|mergeCountries(collect_list(countries, 0, 0))|
+-----------------+---------------------------------------------+
|            24man|                                     [norway]|
|        AdowTatep|                                     [brazil]|
|         Aggron82|                                     [kuwait]|
|         Anatummy|                                     [france]|
|    AnnoyingSwede|                                     [sweden]|
|        Astarmoth|                                     [mexico]|
|      Bart_olomeo|                                    [belgium]|
|        Bennators|                                     [canada]|
|      BerntBrakar|                                     [norway]|
| BetweenTheLayers|                                  [singapore]|
| BlueBarracudaBro|                                     [canada]|
|         Bohne_13|                                    [germany]|
|     Boru

In [41]:
# Spot check: number of filtered comments written by one specific author.
filteredComment.where(func.col('author') == 'spoleto').count()

472

In [36]:
# Sanity check of the keyword extractor on a comment that mentions both Belgium and Vietnam.
getCountryKeyWord("I'm from Belgium I'll insult that farce for a sport as many names as I damn well please. I'm from Vietnam")

['i ', 'm from', ' belgium i ll insult that farce for a sport as many names as i damn well please i ', 'm from', ' vietnam']
m from
 belgium i ll insult that farce for a sport as many names as i damn well please i 
m from
 vietnam


['belgium']

### Calculate the sentiment score for an author across all subreddits

In [None]:
import nltk # be sure to have stopwords installed for this using nltk.download_shell()
import string
from pyspark.sql import functions as func
from pyspark.sql import types as types

# Timing note from the original run: this cell takes about 4 minutes (199923 records for soccer).
# Download the VADER lexicon used by the sentiment analyzer.
nltk.downloader.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# VADER must be installed and its lexicon downloaded (see above) before the analyzer is created.
sid = SentimentIntensityAnalyzer()

def calculatue_score(listBody, minComments=100):
    """Sum the "compound" sentiment scores of a list of comment bodies.

    Each comment is cleaned with ``pre_process`` and scored with the
    module-level ``sid`` analyzer; the per-comment ``"compound"`` values are
    added together.

    Args:
        listBody: list of comment bodies, or None.
        minComments: the list must contain strictly more than this many
            comments to be scored (default 100).

    Returns:
        float: the summed compound score, or 0.0 when ``listBody`` is None or
        has ``minComments`` comments or fewer.
    """
    if listBody is None or len(listBody) <= minComments:
        # Always return a float: the UDF built from this function is declared
        # with FloatType, and a Python int does not match that declared type.
        return 0.0

    total = 0.0
    for message in listBody:
        # NOTE(review): pre_process is not defined in the visible part of this
        # notebook (only preProcessBody is) - confirm it exists before running.
        message = pre_process(message)
        # polarity_scores fails if the VADER lexicon has not been installed.
        total += sid.polarity_scores(message)["compound"]
    return total
    
# Spark UDF wrapper around calculatue_score; declared to return FloatType.
calculatue_score_udf = func.udf(calculatue_score, types.FloatType())