In [189]:
import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import udf
from pyspark.sql.functions import *
from pyspark.sql.window import Window

import os
import socket
import hashlib
import string

def createMd5(text):
    """Return the hexadecimal MD5 digest of ``text`` encoded as UTF-8.

    Args:
        text: The string to hash, or ``None``.

    Returns:
        A 32-character lowercase hex string, or ``None`` when ``text`` is
        ``None`` (so a null input yields a null result instead of raising
        ``AttributeError`` on ``None.encode``).
    """
    if text is None:
        return None
    return hashlib.md5(text.encode('utf-8')).hexdigest()
# Spark UDF wrapper around createMd5; the result column is a string.
md5Udf = udf(lambda value: createMd5(value), returnType=StringType())

def clean_lower(text):
    """Delete punctuation, lowercase, and collapse whitespace in ``text``.

    Characters in the punctuation set are removed outright (not replaced by a
    space), so ``"COVID-19"`` becomes ``"covid19"``. Any run of whitespace is
    collapsed to a single space and leading/trailing whitespace is dropped.

    Args:
        text: The string to normalise, or ``None``.

    Returns:
        The normalised string, or ``None`` when ``text`` is ``None`` (so a
        null input yields a null result instead of raising
        ``AttributeError``).
    """
    if text is None:
        return None
    # NOTE(review): the opening typographic quote ‘ is in the set but the
    # closing one ’ is not — confirm whether that is intended.
    sentence = text.translate(str.maketrans('', '', '!"#$%&\'()*+,./:;<=>?@[\\]^`{|}~-_”“«»‘')).lower()
    # str.split() with no argument splits on any whitespace run.
    return " ".join(sentence.split())
# Spark UDF wrapper around clean_lower; the result column is a string.
cleanLowerUdf = udf(lambda value: clean_lower(value), returnType=StringType())

def get_site_from_url(text):
    """Return the third ``/``-separated segment of ``text``.

    For a URL of the form ``scheme://host/path`` that segment is ``host``.

    Args:
        text: The URL string, or ``None``.

    Returns:
        The segment at index 2 after splitting on ``/``, or ``None`` when
        ``text`` is ``None`` or has fewer than three segments (previously
        these cases raised ``AttributeError`` / ``IndexError``).
    """
    if text is None:
        return None
    parts = text.split("/")
    if len(parts) < 3:
        return None
    return parts[2]
# Spark UDF wrapper around get_site_from_url; the result column is a string.
getUrl = udf(lambda value: get_site_from_url(value), returnType=StringType())

# Set before the SparkSession is created in the next cell.
os.environ["PYSPARK_SUBMIT_ARGS"] = "pyspark-shell"

In [2]:
# Resolve the 'minio' host name to an IP address for the S3A endpoint URL.
minio_ip = socket.gethostbyname('minio')

# Read the object-store credentials from the environment when they are set,
# consistent with how the Postgres password is obtained later in this
# notebook. The previous literals remain as fallbacks so existing setups
# keep working unchanged.
minio_access_key = os.environ.get('MINIO_ACCESS_KEY', 'changeme1234')
minio_secret_key = os.environ.get('MINIO_SECRET_KEY', 'changeme1234')

spark = (
    SparkSession.builder
    .appName("Python Spark S3")
    .config('spark.hadoop.fs.s3a.endpoint', 'http://' + minio_ip + ':9000')
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    .config('spark.hadoop.fs.s3a.path.style.access', 'true')
    .config('spark.hadoop.fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
    .getOrCreate()
)

In [3]:
# Explicit schema for the article JSON files: every field is a string except
# publish_date, which is read as a timestamp.
st = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in [
        ("abstract", StringType()),
        ("authors", StringType()),
        ("image", StringType()),
        ("metadata", StringType()),
        ("publish_date", TimestampType()),
        ("text", StringType()),
        ("title", StringType()),
        ("url", StringType()),
    ]
])

In [4]:
# Read every JSON file under the bucket prefix with the explicit schema;
# timestamp fields are parsed with the dd-MM-yyyy pattern.
df_news_covid_mexico = (
    spark.read
    .schema(st)
    .option("timestampFormat", "dd-MM-yyyy")
    .json("s3a://news/covid_mexico/*.json")
)

In [21]:
# Number of rows loaded from the JSON files, before any filtering.
df_news_covid_mexico.count()

1086

In [6]:
# Confirm the loaded frame carries the explicit schema defined above.
df_news_covid_mexico.printSchema()

root
 |-- abstract: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- image: string (nullable = true)
 |-- metadata: string (nullable = true)
 |-- publish_date: timestamp (nullable = true)
 |-- text: string (nullable = true)
 |-- title: string (nullable = true)
 |-- url: string (nullable = true)



In [190]:
# Derive the working frame: article_id is the MD5 of the URL, clean_text is the
# normalised article body and site is the host segment of the URL.
# NOTE(review): the filter refers to `text`, which is not in the select list;
# this relies on Spark resolving it from the source frame.
df_news_covid_mexico_date_text = (
    df_news_covid_mexico
    .select(
        md5Udf("url").alias("article_id"),
        "title",
        "url",
        "publish_date",
        cleanLowerUdf("text").alias("clean_text"),
        getUrl("url").alias("site"),
    )
    .filter("length(text) >= 2")
)

In [202]:
# Preview the first 15 derived rows (columns truncated by default).
df_news_covid_mexico_date_text.show(n=15)

+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+
|          article_id|               title|                 url|       publish_date|          clean_text|                site|
+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+
|b0e8ddae7b3f65a6a...|Aumenta a 16 fall...|http://www.enfoqu...|2020-03-29 00:00:00|el director gener...|www.enfoqueinform...|
|4c60ef934b04e2787...|Ya van dos gobern...|https://www.unotv...|2020-03-29 00:00:00|hasta el momento ...|       www.unotv.com|
|1a7b0873214d63a17...|López Obrador dic...|https://www.infor...|2020-03-29 00:00:00|el presidente and...|   www.informador.mx|
|cdd17bbabcd40bb3b...|Coronavirus en Mé...|https://www.milen...|2020-03-29 00:00:00|fernando damián m...|     www.milenio.com|
|cf44530bae9e92f21...|Han muerto 20 per...|https://www.milen...|2020-03-29 00:00:00|milenio digital y...|     w

In [145]:
# Show the first 15 titles without truncating the column.
df_news_covid_mexico_date_text.select("title").show(n=15, truncate=False)

+--------------------------------------------------------------+
|title                                                         |
+--------------------------------------------------------------+
|Aumenta a 16 fallecimientos por COVID-19 en México            |
|Ya van dos gobernadores de México contagiados de COVID-19     |
|López Obrador dice que se hará prueba de COVID-19 si tiene ...|
|Coronavirus en México, noticias del 29 de marzo               |
|Han muerto 20 personas por Covid-19 en México; suman 993 ...  |
|Ciudad de México arma kits para casos de COVID-19             |
|Coronavirus en México: Adán Augusto López, gobernador de ...  |
|Mantiene BC 20 casos de Covid-19 confirmados                  |
|El gobernador de Coahuila reporta 16 médicos y enfermeras ... |
|Coronavirus en México: fueron aislados tres policías de la ...|
|Más de 26 mil contagiados con Covid-19 en México, estima ...  |
|Salud estima que hay 26519 casos de COVID-19 en México        |
|Fases y consejos sobre e

In [201]:
# Number of rows remaining after the length(text) >= 2 filter.
df_news_covid_mexico_date_text.count()

851

In [131]:
# JDBC target used by both write cells; the password is taken from the
# SHARED_PASSWORD environment variable (KeyError if it is not set).
url = "jdbc:postgresql://postgres/shared"
mode = "overwrite"
properties = dict(
    user="shared",
    password=os.environ['SHARED_PASSWORD'],
)

In [203]:
# Persist the derived article frame to Postgres using the shared JDBC settings.
df_news_covid_mexico_date_text.write.jdbc(
    url=url,
    table="tb_news_covid_mexico_date_text",
    mode=mode,
    properties=properties,
)

In [170]:
# One row per (article, word): split clean_text on whitespace, explode the
# resulting array, and keep only tokens longer than one character.
# The pattern is a raw string so that \s reaches the regex engine verbatim;
# "\s" in a normal string literal is an invalid escape sequence.
df_news_covid_mexico_palabras = (
    df_news_covid_mexico_date_text
    .select(
        "article_id",
        "publish_date",
        explode(split(df_news_covid_mexico_date_text.clean_text, r"\s")).alias("palabra"),
    )
    .where(length('palabra') > 1)
)

In [171]:
# Preview the first 30 exploded word rows.
df_news_covid_mexico_palabras.show(n=30)

+--------------------+-------------------+-------------+
|          article_id|       publish_date|      palabra|
+--------------------+-------------------+-------------+
|b0e8ddae7b3f65a6a...|2020-03-29 00:00:00|           el|
|b0e8ddae7b3f65a6a...|2020-03-29 00:00:00|     director|
|b0e8ddae7b3f65a6a...|2020-03-29 00:00:00|      general|
|b0e8ddae7b3f65a6a...|2020-03-29 00:00:00|           de|
|b0e8ddae7b3f65a6a...|2020-03-29 00:00:00|epidemiología|
|b0e8ddae7b3f65a6a...|2020-03-29 00:00:00|         josé|
|b0e8ddae7b3f65a6a...|2020-03-29 00:00:00|         luis|
|b0e8ddae7b3f65a6a...|2020-03-29 00:00:00|       alomía|
|b0e8ddae7b3f65a6a...|2020-03-29 00:00:00|      zegarra|
|b0e8ddae7b3f65a6a...|2020-03-29 00:00:00|      informó|
|b0e8ddae7b3f65a6a...|2020-03-29 00:00:00|          que|
|b0e8ddae7b3f65a6a...|2020-03-29 00:00:00|           la|
|b0e8ddae7b3f65a6a...|2020-03-29 00:00:00|        cifra|
|b0e8ddae7b3f65a6a...|2020-03-29 00:00:00|           de|
|b0e8ddae7b3f65a6a...|2020-03-2

In [172]:
#https://sigdelta.com/blog/word-count-in-spark-with-a-pinch-of-tf-idf/
# Occurrences of each word within each article, most frequent first.
(
    df_news_covid_mexico_palabras
    .groupBy('article_id', 'palabra', 'publish_date')
    .count()
    .orderBy('count', ascending=False)
    .show(n=25)
)

+--------------------+-------+-------------------+-----+
|          article_id|palabra|       publish_date|count|
+--------------------+-------+-------------------+-----+
|cdd17bbabcd40bb3b...|     de|2020-03-29 00:00:00|  331|
|cdd17bbabcd40bb3b...|       |2020-03-29 00:00:00|  331|
|2b7a3fb53b9905a69...|     de|2020-03-14 00:00:00|  278|
|8d6050c8ce801550d...|     de|2020-03-28 00:00:00|  238|
|63113c50ad7d9bba9...|     de|2020-04-05 00:00:00|  235|
|cedc44bfb6180a9ac...|     de|2020-04-26 00:00:00|  229|
|30d9911177d838db9...|     de|2020-04-09 00:00:00|  224|
|30d9911177d838db9...|    que|2020-04-09 00:00:00|  200|
|8d6050c8ce801550d...|       |2020-03-28 00:00:00|  182|
|cdd17bbabcd40bb3b...|     la|2020-03-29 00:00:00|  182|
|fc9909b4476318e19...|     de|2020-03-31 00:00:00|  180|
|2b7a3fb53b9905a69...|       |2020-03-14 00:00:00|  175|
|cdd17bbabcd40bb3b...|     en|2020-03-29 00:00:00|  155|
|b936e66ab964cd3c4...|     de|2020-01-05 00:00:00|  151|
|97fb634e001c433a0...|     de|2

In [180]:
#https://sigdelta.com/blog/word-count-in-spark-with-a-pinch-of-tf-idf-continued/
# Term frequency per (article, word).
#   n_w: occurrences of the word in the article
#   n_d: sum of n_w over all words of the same article (window by article_id)
#   tf : n_w / n_d
# `sum` and `count` here are the Spark column functions brought in by the
# star import of pyspark.sql.functions, not the Python builtins.
w = Window.partitionBy(df_news_covid_mexico_palabras['article_id'])

article_tf = (
    df_news_covid_mexico_palabras
    .groupBy('article_id', 'palabra', 'publish_date')
    .agg(
        count('*').alias('n_w'),
        sum(count('*')).over(w).alias('n_d'),
        (count('*') / sum(count('*')).over(w)).alias('tf'),
    )
    .orderBy('n_w', ascending=False)
    .cache()
)

article_tf.show(truncate=15)

+---------------+-------+---------------+---+----+---------------+
|     article_id|palabra|   publish_date|n_w| n_d|             tf|
+---------------+-------+---------------+---+----+---------------+
|cdd17bbabcd4...|     de|2020-03-29 0...|331|4646|0.0712440809...|
|cdd17bbabcd4...|       |2020-03-29 0...|331|4646|0.0712440809...|
|2b7a3fb53b99...|     de|2020-03-14 0...|278|3206|0.0867124142...|
|8d6050c8ce80...|     de|2020-03-28 0...|238|3243|0.0733888374...|
|63113c50ad7d...|     de|2020-04-05 0...|235|3176|0.0739924433...|
|cedc44bfb618...|     de|2020-04-26 0...|229|3105|0.0737520128...|
|30d9911177d8...|     de|2020-04-09 0...|224|4172|0.0536912751...|
|30d9911177d8...|    que|2020-04-09 0...|200|4172|0.0479386385...|
|8d6050c8ce80...|       |2020-03-28 0...|182|3243|0.0561208757...|
|cdd17bbabcd4...|     la|2020-03-29 0...|182|4646|0.0391734825...|
|fc9909b44763...|     de|2020-03-31 0...|180|2506|0.0718276137...|
|2b7a3fb53b99...|       |2020-03-14 0...|175|3206|0.0545851528

In [181]:
# Inverse document frequency per (word, article).
#   c_d: number of distinct article_id values in the word frame
#   i_d: number of (palabra, article_id, publish_date) groups that share the
#        same palabra (window by palabra)
#   idf: log(c_d / i_d)
w = Window.partitionBy('palabra')

# Materialised eagerly: this triggers a Spark job and yields a Python int.
c_d = df_news_covid_mexico_palabras.select('article_id').distinct().count()

article_idf = (
    df_news_covid_mexico_palabras
    .groupBy('palabra', 'article_id', 'publish_date')
    .agg(
        lit(c_d).alias('c_d'),
        count('*').over(w).alias('i_d'),
        log(lit(c_d) / count('*').over(w)).alias('idf'),
    )
    .orderBy('idf', ascending=False)
    .cache()
)

In [182]:
# Preview the 150 highest-idf rows, truncating each cell to 15 characters.
article_idf.show(n=150, truncate=15)

+---------------+---------------+---------------+---+---+---------------+
|        palabra|     article_id|   publish_date|c_d|i_d|            idf|
+---------------+---------------+---------------+---+---+---------------+
|           201’|038da904a144...|2020-01-11 0...|851|  1|6.7464121285...|
|           2136|370853b8e0eb...|2020-01-21 0...|851|  1|6.7464121285...|
|           2162|70ffaf20effa...|2020-01-02 0...|851|  1|6.7464121285...|
|            467|722156625331...|2020-02-17 0...|851|  1|6.7464121285...|
|           4821|a78b641ef212...|2020-02-06 0...|851|  1|6.7464121285...|
|            829|5b7ffe70c01c...|2020-03-26 0...|851|  1|6.7464121285...|
|          alto”|598a349ddc37...|2020-02-14 0...|851|  1|6.7464121285...|
|     anunciando|6a0f15ef9478...|2020-03-24 0...|851|  1|6.7464121285...|
|     arbitrajes|a760a8f1cf3b...|2020-01-19 0...|851|  1|6.7464121285...|
|     asistiendo|722156625331...|2020-02-17 0...|851|  1|6.7464121285...|
|         brands|2371f5980a3f...|2020-

In [183]:
# Combine tf and idf on their shared keys and add tf_idf = tf * idf.
article_tfidf = (
    article_tf
    .join(article_idf, ['article_id', 'palabra', 'publish_date'])
    .withColumn('tf_idf', col('tf') * col('idf'))
    .cache()
)

In [184]:
# Preview the 150 rows with the highest tf_idf, cells truncated to 12 characters.
(
    article_tfidf
    .orderBy('tf_idf', ascending=False)
    .show(n=150, truncate=12)
)

+------------+------------+------------+---+---+------------+---+---+------------+------------+
|  article_id|     palabra|publish_date|n_w|n_d|          tf|c_d|i_d|         idf|      tf_idf|
+------------+------------+------------+---+---+------------+---+---+------------+------------+
|8605697c7...|    palenque|2020-02-0...|  1| 12|0.0833333...|851|  1|6.7464121...|0.5622010...|
|8605697c7...|       toros|2020-02-0...|  1| 12|0.0833333...|851|  2|6.0532649...|0.5044387...|
|8605697c7...|     barbosa|2020-02-0...|  1| 12|0.0833333...|851|  7|4.8005019...|0.4000418...|
|8605697c7...|       plaza|2020-02-0...|  1| 12|0.0833333...|851| 10|4.4438270...|0.3703189...|
|8605697c7...|     anuncia|2020-02-0...|  1| 12|0.0833333...|851| 11|4.3485168...|0.3623764...|
|8605697c7...|construcción|2020-02-0...|  1| 12|0.0833333...|851| 30|3.3452147...|0.2787678...|
|fe12cc2e5...|    barriles|2020-01-0...| 10|239|0.0418410...|851|  2|6.0532649...|0.2532746...|
|70ffaf20e...|     premium|2020-01-0...|

In [186]:
# Keep, for each article, the words whose tf_idf rank is 15 or better.
# NOTE(review): rank() gives tied tf_idf values the same rank, so an article
# can keep more than 15 rows when ties straddle the cut-off — confirm this is
# intended.
w = Window.partitionBy('article_id').orderBy(col('tf_idf').desc())

article_tfidf_top_15 = (
    article_tfidf
    .withColumn('rank', rank().over(w))
    .where('rank <= 15')
    .drop('rank')
    .orderBy('article_id', 'tf_idf', 'n_w')
    .select('article_id', 'publish_date', 'palabra', 'n_w', 'tf_idf')
)

In [187]:
# Preview the first 30 top-word rows, cells truncated to 12 characters.
article_tfidf_top_15.show(n=30, truncate=12)

+------------+------------+------------+---+------------+
|  article_id|publish_date|     palabra|n_w|      tf_idf|
+------------+------------+------------+---+------------+
|0000210f7...|2020-02-0...|  campamento|  2|0.0284859...|
|0000210f7...|2020-02-0...|      zuñiga|  2|0.0284859...|
|0000210f7...|2020-02-0...|   conjuntos|  2|0.0284859...|
|0000210f7...|2020-02-0...|     ascenso|  3|0.0285060...|
|0000210f7...|2020-02-0...|     academy|  2|0.0317478...|
|0000210f7...|2020-02-0...| selecciones|  2|0.0317478...|
|0000210f7...|2020-02-0...|     octavos|  2|0.0317478...|
|0000210f7...|2020-02-0...|     fenifut|  2|0.0317478...|
|0000210f7...|2020-02-0...|       tyron|  2|0.0317478...|
|0000210f7...|2020-02-0...|     jicaral|  2|0.0317478...|
|0000210f7...|2020-02-0...|       sub17|  2|0.0317478...|
|0000210f7...|2020-02-0...|     managua|  2|0.0317478...|
|0000210f7...|2020-02-0...|     acevedo|  2|0.0317478...|
|0000210f7...|2020-02-0...|       costa|  4|0.0357832...|
|0000210f7...|

In [198]:
# Attach the source site to each top word by joining back to the article
# frame on (article_id, publish_date).
article_tfidf_top_15_site = (
    article_tfidf_top_15
    .join(df_news_covid_mexico_date_text, ['article_id', 'publish_date'])
    .select('article_id', 'publish_date', 'site', 'palabra', 'n_w', 'tf_idf')
)

In [199]:
# Preview the first 15 rows of the site-enriched top words.
article_tfidf_top_15_site.show(n=15)

+--------------------+-------------------+-------------------+-----------+---+-------------------+
|          article_id|       publish_date|               site|    palabra|n_w|             tf_idf|
+--------------------+-------------------+-------------------+-----------+---+-------------------+
|0000210f7e639a06c...|2020-02-06 00:00:00|www.laprensa.com.ni| campamento|  2|0.02848595269653378|
|0000210f7e639a06c...|2020-02-06 00:00:00|www.laprensa.com.ni|  conjuntos|  2|0.02848595269653378|
|0000210f7e639a06c...|2020-02-06 00:00:00|www.laprensa.com.ni|     zuñiga|  2|0.02848595269653378|
|0000210f7e639a06c...|2020-02-06 00:00:00|www.laprensa.com.ni|    ascenso|  3| 0.0285060841939141|
|0000210f7e639a06c...|2020-02-06 00:00:00|www.laprensa.com.ni|    academy|  2|0.03174782178152176|
|0000210f7e639a06c...|2020-02-06 00:00:00|www.laprensa.com.ni|    fenifut|  2|0.03174782178152176|
|0000210f7e639a06c...|2020-02-06 00:00:00|www.laprensa.com.ni|    octavos|  2|0.03174782178152176|
|0000210f7

In [200]:
# Persist the per-article top words to Postgres using the shared JDBC settings.
article_tfidf_top_15_site.write.jdbc(
    url=url,
    table="tb_news_covid_mexico_palabras_top_tfidf",
    mode=mode,
    properties=properties,
)