# 103590450 四資四 馬茂源

In [116]:
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql.utils import AnalysisException
from pyspark.ml.feature import RegexTokenizer
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import os, math, time

In [2]:
# Spark configuration for this homework: single local master, app name "hw1".
# To target the standalone cluster instead, use
# 'spark://10.100.5.182:7077' as the master URL.
conf = SparkConf()
conf.setMaster("local")
conf.setAppName("hw1")

In [3]:
# Obtain the SparkContext for this notebook. getOrCreate() reuses the
# already-running context when this cell is executed again, so `sc` and
# `sql_sc` are always bound afterwards and unrelated ValueErrors are no
# longer silently swallowed.
sc = SparkContext.getOrCreate(conf=conf)
# SQL entry point used below for reading the CSV files into DataFrames.
sql_sc = SQLContext(sc)

In [8]:
# CSV file names used in this notebook, keyed by data source.
# The three social-feedback platforms map to a list of per-topic files;
# 'news' maps to a single file name (a plain string, not a list).
files = dict(
    fb=[
        'Facebook_Economy.csv',
        'Facebook_Obama.csv',
        'Facebook_Palestine.csv',
        'Facebook_Microsoft.csv',
    ],
    google=[
        'GooglePlus_Obama.csv',
        'GooglePlus_Palestine.csv',
        'GooglePlus_Economy.csv',
        'GooglePlus_Microsoft.csv',
    ],
    linkedin=[
        'LinkedIn_Microsoft.csv',
        'LinkedIn_Palestine.csv',
        'LinkedIn_Obama.csv',
        'LinkedIn_Economy.csv',
    ],
    news='News_Final.csv',
)

### In news data, count the words in the two fields ‘Title’ and ‘Headline’ separately, and list the most frequent words by term frequency in descending order: in total, per day, and per topic

* IDLink (numeric): Unique identifier of news items
* Title (string): Title of the news item according to the official media sources
* Headline (string): Headline of the news item according to the official media sources
* Source (string): Original news outlet that published the news item
* Topic (string): Query topic used to obtain the items in the official media sources
* PublishDate (timestamp): Date and time of the news items' publication
* SentimentTitle (numeric): Sentiment score of the text in the news items' title
* SentimentHeadline (numeric): Sentiment score of the text in the news items' headline
* Facebook (numeric): Final value of the news items' popularity according to the social media source Facebook
* GooglePlus (numeric): Final value of the news items' popularity according to the social media source Google+
* LinkedIn (numeric): Final value of the news items' popularity according to the social media source LinkedIn

In [18]:
# Load the news CSV into a DataFrame; the first line of the file is the header.
news = sql_sc.read.csv(path=files['news'], header=True, sep=',')

In [118]:
# Keep only the columns needed for the word-frequency task, then preview them.
news_data = news.select('title', 'headline', 'topic', 'publishDate')
news_data.show()

+--------------------+--------------------+---------+-------------------+
|               title|            headline|    topic|        publishDate|
+--------------------+--------------------+---------+-------------------+
|Obama Lays Wreath...|Obama Lays Wreath...|    obama|2002-04-02 00:00:00|
|A Look at the Hea...|Tim Haywood, inve...|  economy|2008-09-20 00:00:00|
|Nouriel Roubini: ...|Nouriel Roubini, ...|  economy|2012-01-28 00:00:00|
|Finland GDP Expan...|Finland's economy...|  economy|2015-03-01 00:06:00|
|Tourism, govt spe...|Tourism and publi...|  economy|2015-03-01 00:11:00|
|Intellitec Soluti...|Over 100 attendee...|microsoft|2015-03-01 00:19:00|
| Monday, 29 Feb 2016|RAMALLAH, Februar...|palestine|2016-02-28 14:03:00|
|Obama, stars pay ...|First lady Michel...|    obama|2015-03-01 00:45:00|
|Fire claims more ...|A Hancock County ...|palestine|2015-03-01 01:20:00|
|Microsoft's new W...|New Delhi, Feb.29...|microsoft|2015-03-01 01:32:00|
|Microsoft Project...|Microsoft may ha

In [119]:
def wordTokenizer(data, columns):
    """Add a ``<column>_tokens`` column for every name in *columns*.

    Each listed column is run through a ``RegexTokenizer`` configured with
    the pattern ``\\W`` and a minimum token length of 2; the result is stored
    in a new column named after the input column with a ``_tokens`` suffix.

    NOTE(review): with the tokenizer's other settings left at their defaults
    the pattern presumably acts as a delimiter and tokens are lowercased,
    which matches the sample output shown in this notebook -- confirm against
    the RegexTokenizer documentation.

    :param data: DataFrame containing the columns to tokenize.
    :param columns: iterable of column names to tokenize.
    :return: *data* with one extra ``_tokens`` column per input column.
    """
    tokenizers = [
        RegexTokenizer(inputCol=name,
                       outputCol=name + '_tokens',
                       pattern='\\W',
                       minTokenLength=2)
        for name in columns
    ]
    tokenized = data
    # Apply the tokenizers one after another, in the order of *columns*.
    for tokenizer in tokenizers:
        tokenized = tokenizer.transform(tokenized)
    return tokenized

In [120]:
# Tokenize both text fields; the original 'title' and 'headline' columns are
# kept alongside the new '*_tokens' columns.
news_data = wordTokenizer(news_data, ('title', 'headline'))
# Disabled alternatives that would drop the raw text columns:
# news_data = news_data.drop('title', 'headline')
# news_data = news_data.select('title_tokens', 'headline_tokens',
#                              'topic',  'publishDate')

In [122]:
# Reduce publishDate to its first 10 characters (the "YYYY-MM-DD" part of
# values such as "2015-03-01 00:06:00") so rows can be grouped per day.
# Column.substr is 1-based and yields null for a null input, whereas the
# previous Python UDF (tmp[:10]) raised a TypeError on null publish dates.
news_data = news_data.withColumn('publishDate',
                                 news_data.publishDate.substr(1, 10))

In [123]:
# Preview the tokenized data (first 20 rows, long cell values truncated).
news_data.show(n=20, truncate=True)

+--------------------+--------------------+---------+-----------+--------------------+--------------------+
|               title|            headline|    topic|publishDate|        title_tokens|     headline_tokens|
+--------------------+--------------------+---------+-----------+--------------------+--------------------+
|Obama Lays Wreath...|Obama Lays Wreath...|    obama| 2002-04-02|[obama, lays, wre...|[obama, lays, wre...|
|A Look at the Hea...|Tim Haywood, inve...|  economy| 2008-09-20|[look, at, the, h...|[tim, haywood, in...|
|Nouriel Roubini: ...|Nouriel Roubini, ...|  economy| 2012-01-28|[nouriel, roubini...|[nouriel, roubini...|
|Finland GDP Expan...|Finland's economy...|  economy| 2015-03-01|[finland, gdp, ex...|[finland, economy...|
|Tourism, govt spe...|Tourism and publi...|  economy| 2015-03-01|[tourism, govt, s...|[tourism, and, pu...|
|Intellitec Soluti...|Over 100 attendee...|microsoft| 2015-03-01|[intellitec, solu...|[over, 100, atten...|
| Monday, 29 Feb 2016|RAMALL

### In social feedback data, calculate the average popularity of each news item by hour and by day, respectively (for each platform)

### In news data, calculate the sum and average sentiment score of each topic, respectively

### From subtask (1), for the top-100 frequent words per topic in titles and headlines, calculate their co-occurrence matrices (100x100), respectively. Each entry in the matrix will contain the co-occurrence frequency in all news titles and headlines, respectively