In [1]:
# standard library
import math
import os
import urllib.parse as urlparse
from urllib.parse import urlencode
from urllib.request import urlopen

# web api
from bs4 import BeautifulSoup as soup

# sentiment analysis / plotting
from pylab import rcParams
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# spark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext

from pyspark.sql.types import StringType, IntegerType, ArrayType, FloatType, TimestampType, DateType
from pyspark.sql.functions import udf, dense_rank, desc, asc
from pyspark.sql.functions import mean as _mean, min as _min, max as _max, sum as _sum, count as _count, datediff, to_date
from pyspark.sql.functions import to_utc_timestamp, unix_timestamp, lit, datediff, col, date_format

from pyspark.sql import functions as F, Row
from pyspark.sql.functions import collect_set
from pyspark.sql.window import Window
# Set the default figure size through the 'figure.figsize' rcParam (width 15, height 8).
rcParams['figure.figsize'] = 15, 8

In [2]:
def print_blanks(n=10):
    """Write ``n`` blank separators to standard output.

    One separator is a newline character followed by ``print``'s own line
    terminator, so each unit of ``n`` produces two line breaks.
    """
    # Build the whole run of line breaks once instead of printing in a loop.
    print('\n\n' * n, end='')

def create_spark_context(master_ip='127.0.0.1'):
    """Connect to a standalone Spark master and hand back its entry points.

    Args:
        master_ip: Host of the Spark master; port 7077 is always used.

    Returns:
        A ``(spark, sc)`` tuple: the Hive-enabled ``SparkSession`` obtained
        through ``getOrCreate()`` and that session's ``sparkContext``.
    """
    master_url = 'spark://{}:7077'.format(master_ip)
    builder = SparkSession.builder.master(master_url).enableHiveSupport()
    session = builder.getOrCreate()
    return (session, session.sparkContext)

# Connect using the default master address; `spark` is the session the later cells read data with.
spark, sc = create_spark_context()


In [11]:
# Location and field delimiter of the raw sales extract.
input_filename = '../ressources/data/financial_sells_100000.csv'
csv_separator = ','

# Load the CSV into a Spark DataFrame, treating the first line as the header.
init_flat_data = (
    spark.read
    .option("sep", csv_separator)
    .csv(input_filename, header=True)
)

# One row per distinct (company_id, company_name) pair.
company_names = (
    init_flat_data
    .select('company_id', 'company_name')
    .distinct()
)

In [13]:
def get_company_news(company_name):
    """Fetch the Google News business RSS feed filtered by a company name.

    Args:
        company_name: Company name sent as the ``q`` query parameter.

    Returns:
        The BeautifulSoup result set holding every ``<item>`` element of the
        feed parsed as XML.

    Raises:
        Any error raised by ``urlopen`` or by reading the response is
        propagated to the caller; the response is closed first.
    """
    base_url = 'https://news.google.com/news/rss/headlines/section/topic/BUSINESS?'
    parameters = {
        'q': company_name,  # query phrase
        'pageSize': 2,
        # NOTE(review): 'en' used as a parameter *name* looks like a typo for a
        # language option; kept unchanged so the request stays the same - confirm.
        'en': 'en',
    }
    url = base_url + urlencode(parameters)

    # The context manager closes the HTTP response even when read() raises,
    # which the previous explicit close() call did not guarantee.
    with urlopen(url) as rss_reader:
        rss_data = rss_reader.read()

    soup_page = soup(rss_data, "xml")
    return soup_page.find_all("item")


def analyse_company_reputation(company_id, company_name, _analyzer):
    """Score a company's news headlines and derive a reputation flag.

    Every headline returned by ``get_company_news(company_name)`` is scored
    with ``_analyzer.polarity_scores`` and the 'neg', 'pos' and 'neu' scores
    are summed. The flag is picked from the integer part of the
    negative/positive ratio.

    Args:
        company_id: Identifier copied unchanged into the result.
        company_name: Name used to query the news feed; copied into the result.
        _analyzer: Object exposing ``polarity_scores(text)`` that returns a
            mapping with 'neg', 'pos' and 'neu' entries.

    Returns:
        dict with the keys 'company_id', 'company_name', 'negative',
        'positive', 'neutre' (summed scores, always floats), 'flag_gravity'
        (int index into the flag scale), 'flag' and 'neg_tweets' (headlines
        whose 'neg' score is above 0.2). When no headline carries any
        positive or negative score, 'flag_gravity' stays 0 and 'flag' stays
        'No Flag'.
    """
    flags = ['green', 'orange', 'red', 'red', 'red']
    max_gravity = len(flags) - 1

    global_result = {
        'company_id': company_id,
        'company_name': company_name,
        # Floats from the start so the value type is the same whether or not
        # any headline was found.
        'negative': 0.0,
        'positive': 0.0,
        'neutre': 0.0,
        'flag_gravity': 0,
        'flag': 'No Flag',
        'neg_tweets': [],
    }

    # Accumulate the sentiment of every headline.
    for news in get_company_news(company_name):
        headline = news.title.text
        scores = _analyzer.polarity_scores(headline)
        global_result['negative'] += scores['neg']
        global_result['positive'] += scores['pos']
        global_result['neutre'] += scores['neu']

        if scores['neg'] > 0.2:
            global_result['neg_tweets'].append(headline)

    negative = global_result['negative']
    positive = global_result['positive']

    if positive > 0:
        # Clamp so a ratio of 5 or more cannot index past the flag scale.
        gravity = min(int(negative / positive), max_gravity)
    elif negative > 0:
        # Only negative sentiment: worst level, without dividing by zero.
        gravity = max_gravity
    else:
        # No sentiment signal at all: keep the 'No Flag' defaults.
        gravity = None

    if gravity is not None:
        global_result['flag_gravity'] = gravity
        global_result['flag'] = flags[gravity]

    return global_result

In [14]:
# One sentiment analyser instance, referenced by the map lambda below.
sentimental_analyser = SentimentIntensityAnalyzer()

# UDF that joins an array of strings into a single '|'-separated string.
def_array2str = udf(lambda items: '|'.join(items), StringType())

# Score every company, build a DataFrame from the resulting rows, then
# flatten the list of negative headlines into one string column.
result_reputation = (
    company_names
    .rdd
    .map(
        lambda company: Row(
            **analyse_company_reputation(
                company.company_id, company.company_name, sentimental_analyser
            )
        )
    )
    .toDF()
    .withColumn('neg_tweets', def_array2str('neg_tweets'))
)

result_reputation.show(1)

+----------+--------------------+-----+------------+--------------------+------------------+------+-----------------+
|company_id|        company_name| flag|flag_gravity|          neg_tweets|          negative|neutre|         positive|
+----------+--------------------+-----+------------+--------------------+------------------+------+-----------------+
|        32|Ping An Insurance...|green|           0|Hong Kong shares ...|2.6679999999999997|87.903|9.427999999999999|
+----------+--------------------+-----+------------+--------------------+------------------+------+-----------------+
only showing top 1 row



In [6]:
#result_reputation.coalesce(1) \
#    .write   \
#    .format("csv") \
#    .mode('overwrite') \
#    .option("header", "true") \
#    .save('./ressources/data/4_sentimental_analysis_output')

In [17]:
# Save the reputation table to the PostgreSQL database through JDBC.
# "overwrite" is passed as the write mode for the target table.
mode = "overwrite"
table_name = 'algo_sentimental_analysis'
url = "jdbc:postgresql://127.0.0.1:5432/financial_opportunities"
properties = {
    # Credentials are read from the environment (PGUSER / PGPASSWORD) so real
    # secrets never have to be stored in the notebook; the fallbacks are the
    # values that were previously hard-coded here.
    "user": os.environ.get("PGUSER", "zouhairhajji"),
    "password": os.environ.get("PGPASSWORD", ""),
    "driver": "org.postgresql.Driver"
}
(
    result_reputation
    .write
    .jdbc(url=url, table=table_name, mode=mode, properties=properties)
)