In [2]:
import findspark

In [3]:
findspark.init('/spark')

In [4]:
from pyspark.sql import SparkSession as SS

In [5]:
spark = SS.builder.appName('newscombination').getOrCreate()

In [6]:
news_category_df = spark.read.json('News_Category_Dataset.json')

In [7]:
news_category_df.show(5)

+---------------+-------------+----------+--------------------+--------------------+--------------------+
|        authors|     category|      date|            headline|                link|   short_description|
+---------------+-------------+----------+--------------------+--------------------+--------------------+
|Melissa Jeltsen|        CRIME|2018-05-26|There Were 2 Mass...|https://www.huffi...|She left her husb...|
|  Andy McDonald|ENTERTAINMENT|2018-05-26|Will Smith Joins ...|https://www.huffi...|Of course it has ...|
|     Ron Dicker|ENTERTAINMENT|2018-05-26|Hugh Grant Marrie...|https://www.huffi...|The actor and his...|
|     Ron Dicker|ENTERTAINMENT|2018-05-26|Jim Carrey Blasts...|https://www.huffi...|The actor gives D...|
|     Ron Dicker|ENTERTAINMENT|2018-05-26|Julianna Margulie...|https://www.huffi...|The "Dietland" ac...|
+---------------+-------------+----------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [8]:
from pyspark.sql.functions import col

In [9]:
news_category_df.groupBy("category") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+--------------+-----+
|      category|count|
+--------------+-----+
|      POLITICS|32739|
| ENTERTAINMENT|14257|
|HEALTHY LIVING| 6694|
|  QUEER VOICES| 4995|
|      BUSINESS| 4254|
|        SPORTS| 4167|
|        COMEDY| 3971|
|       PARENTS| 3955|
|  BLACK VOICES| 3858|
| THE WORLDPOST| 3664|
|         WOMEN| 3490|
|         CRIME| 2893|
|         MEDIA| 2815|
|    WEIRD NEWS| 2670|
|         GREEN| 2622|
|        IMPACT| 2602|
|     WORLDPOST| 2579|
|      RELIGION| 2556|
|         STYLE| 2254|
|    WORLD NEWS| 2177|
+--------------+-----+
only showing top 20 rows



In [10]:
politics_news = news_category_df.filter("category = 'POLITICS'").select('headline','category')

In [11]:
politics_news = politics_news.withColumnRenamed('headline','text').withColumnRenamed('category','label')

In [12]:
uci_news = spark.read.csv('uci-news-aggregator.csv',inferSchema=True, header=True)

In [13]:
uci_news.show(5)

+---+--------------------+--------------------+-----------------+--------+--------------------+-------------------+-------------+
| ID|               TITLE|                 URL|        PUBLISHER|CATEGORY|               STORY|           HOSTNAME|    TIMESTAMP|
+---+--------------------+--------------------+-----------------+--------+--------------------+-------------------+-------------+
|  1|Fed official says...|http://www.latime...|Los Angeles Times|       b|ddUyU0VZz0BRneMio...|    www.latimes.com|1394470370698|
|  2|Fed's Charles Plo...|http://www.livemi...|         Livemint|       b|ddUyU0VZz0BRneMio...|   www.livemint.com|1394470371207|
|  3|US open: Stocks f...|http://www.ifamag...|     IFA Magazine|       b|ddUyU0VZz0BRneMio...|www.ifamagazine.com|1394470371550|
|  4|Fed risks falling...|http://www.ifamag...|     IFA Magazine|       b|ddUyU0VZz0BRneMio...|www.ifamagazine.com|1394470371793|
|  5|Fed's Plosser: Na...|http://www.moneyn...|        Moneynews|       b|ddUyU0VZz0BRneMi

In [14]:
uci_news.groupBy("CATEGORY") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+--------------------+------+
|            CATEGORY| count|
+--------------------+------+
|                   e|152127|
|                   b|115935|
|                   t|108237|
|                   m| 45616|
|                null|   516|
|         Us Magazine|    31|
|    Contactmusic.com|    20|
|           GossipCop|    20|
|         Complex.com|    12|
|            CBS News|    12|
|The Hollywood Gossip|    11|
|            HipHopDX|    11|
|  HeadlinePlanet.com|    10|
| We Got This Covered|    10|
|             Gamepur|     8|
|   WorstPreviews.com|     7|
|          TooFab.com|     7|
|Consequence of Sound|     7|
|            Wetpaint|     7|
|        The Escapist|     6|
+--------------------+------+
only showing top 20 rows



In [15]:
uci_news = uci_news.select('TITLE','CATEGORY')

In [16]:
uci_news = uci_news.withColumnRenamed('TITLE','text').withColumnRenamed('CATEGORY','label')

In [17]:
uci_news.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+--------------------+------+
|               label| count|
+--------------------+------+
|                   e|152127|
|                   b|115935|
|                   t|108237|
|                   m| 45616|
|                null|   516|
|         Us Magazine|    31|
|    Contactmusic.com|    20|
|           GossipCop|    20|
|            CBS News|    12|
|         Complex.com|    12|
|            HipHopDX|    11|
|The Hollywood Gossip|    11|
| We Got This Covered|    10|
|  HeadlinePlanet.com|    10|
|             Gamepur|     8|
|          TooFab.com|     7|
|Consequence of Sound|     7|
|   WorstPreviews.com|     7|
|            Wetpaint|     7|
|        The Escapist|     6|
+--------------------+------+
only showing top 20 rows



In [18]:
uci_news = uci_news.filter("label = 'e' or label = 'b' or label = 't' or label = 'm'")

In [19]:
uci_news.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+-----+------+
|label| count|
+-----+------+
|    e|152127|
|    b|115935|
|    t|108237|
|    m| 45616|
+-----+------+



In [20]:
import functools 

In [21]:
def unionAll(dfs):
    return functools.reduce(lambda df1,df2: df1.union(df2.select(df1.columns)), dfs) 

In [22]:
uci_news.printSchema()

root
 |-- text: string (nullable = true)
 |-- label: string (nullable = true)



In [23]:
politics_news.printSchema()

root
 |-- text: string (nullable = true)
 |-- label: string (nullable = true)



In [24]:
combined_news = unionAll([uci_news,politics_news])

In [25]:
combined_news.printSchema()

root
 |-- text: string (nullable = true)
 |-- label: string (nullable = true)



In [26]:
combined_news.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+--------+------+
|   label| count|
+--------+------+
|       e|152127|
|       b|115935|
|       t|108237|
|       m| 45616|
|POLITICS| 32739|
+--------+------+



In [27]:
entertainment_news = combined_news.filter("label = 'e'")

In [28]:
business_news = combined_news.filter("label = 'b'")

In [29]:
science_and_technology_news = combined_news.filter("label = 't'")

In [30]:
health_news = combined_news.filter("label = 'm'")

In [31]:
entertainment_news.createOrReplaceTempView('e')

In [32]:
entertainment_news = spark.sql("select text,'ENTERTAINMENT' as label from e")

In [33]:
business_news.createOrReplaceTempView('b')

In [34]:
business_news = spark.sql("select text,'BUSINESS' as label from b")

In [35]:
science_and_technology_news.createOrReplaceTempView('t')

In [36]:
science_and_technology_news = spark.sql("select text,'SCIENCETECHNOLOGY' as label from t")

In [37]:
health_news.createOrReplaceTempView('m')

In [38]:
health_news = spark.sql("select text,'HEALTH' as label from m")

In [39]:
news_with_corrected_labels = unionAll([health_news,entertainment_news,science_and_technology_news,business_news,politics_news])

In [40]:
news_with_corrected_labels.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+-----------------+------+
|            label| count|
+-----------------+------+
|    ENTERTAINMENT|152127|
|         BUSINESS|115935|
|SCIENCETECHNOLOGY|108237|
|           HEALTH| 45616|
|         POLITICS| 32739|
+-----------------+------+



In [41]:
training, test = news_with_corrected_labels.randomSplit([0.7,0.3])

In [42]:
training.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+-----------------+------+
|            label| count|
+-----------------+------+
|    ENTERTAINMENT|106631|
|         BUSINESS| 81271|
|SCIENCETECHNOLOGY| 75802|
|           HEALTH| 32017|
|         POLITICS| 22962|
+-----------------+------+



In [43]:
test.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+-----------------+-----+
|            label|count|
+-----------------+-----+
|    ENTERTAINMENT|45496|
|         BUSINESS|34664|
|SCIENCETECHNOLOGY|32435|
|           HEALTH|13599|
|         POLITICS| 9777|
+-----------------+-----+



In [44]:
training_list = training.collect()

In [45]:
test_list = test.collect()

In [46]:
for item in training_list:
    label = '__label__'+item['label']
    label = label.replace(' ', '')
    text = item['text']
    the_string = label+' '+text+'\n'
    with open("combinednews.train", "a") as myfile:
        myfile.write(the_string)

In [47]:
for item in test_list:
    label = '__label__'+item['label']
    label = label.replace(' ', '')
    text = item['text']
    the_string = label+' '+text+'\n'
    with open("combinednews.test", "a") as myfile:
        myfile.write(the_string)

In [48]:
###### this combination is just the combination of uci news and politics from the other set
#### acuracy: 90%

In [49]:
news_category_df.groupBy("category") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+--------------+-----+
|      category|count|
+--------------+-----+
|      POLITICS|32739|
| ENTERTAINMENT|14257|
|HEALTHY LIVING| 6694|
|  QUEER VOICES| 4995|
|      BUSINESS| 4254|
|        SPORTS| 4167|
|        COMEDY| 3971|
|       PARENTS| 3955|
|  BLACK VOICES| 3858|
| THE WORLDPOST| 3664|
|         WOMEN| 3490|
|         CRIME| 2893|
|         MEDIA| 2815|
|    WEIRD NEWS| 2670|
|         GREEN| 2622|
|        IMPACT| 2602|
|     WORLDPOST| 2579|
|      RELIGION| 2556|
|         STYLE| 2254|
|    WORLD NEWS| 2177|
+--------------+-----+
only showing top 20 rows



In [50]:
sports_news = news_category_df.filter("category = 'SPORTS'")

In [51]:
sports_news = sports_news.select('category','headline')

In [52]:
sports_news = sports_news.withColumnRenamed('category','label').withColumnRenamed('headline','text')

In [53]:
sports_news = sports_news.select('text','label')

In [54]:
labeled_news_with_sports = unionAll([news_with_corrected_labels,sports_news])

In [55]:
labeled_news_with_sports.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+-----------------+------+
|            label| count|
+-----------------+------+
|    ENTERTAINMENT|152127|
|         BUSINESS|115935|
|SCIENCETECHNOLOGY|108237|
|           HEALTH| 45616|
|         POLITICS| 32739|
|           SPORTS|  4167|
+-----------------+------+



In [56]:
training2, test2 = labeled_news_with_sports.randomSplit([0.7,0.3])

In [57]:
training2_list = training2.collect()

In [58]:
test2_list = test2.collect()

In [59]:
for item in training2_list:
    label = '__label__'+item['label']
    label = label.replace(' ', '')
    text = item['text']
    the_string = label+' '+text+'\n'
    with open("combinednews2.train", "a") as myfile:
        myfile.write(the_string)

In [60]:
for item in test2_list:
    label = '__label__'+item['label']
    label = label.replace(' ', '')
    text = item['text']
    the_string = label+' '+text+'\n'
    with open("combinednews2.test", "a") as myfile:
        myfile.write(the_string)

Now we use fasttext:
------------------------------
./fasttext supervised -input /path/to/combinednews2.train -output model_combinednews2 <br/>
Read 3M words <br/>
Number of words:  159993 <br/>
Number of labels: 6 <br/>
Lets see how good our model is!
-----------------------------------------------
./fasttext test model_combinednews2.bin /path/to/combinednews2.test <br/>
N	137477 <br/>
P@1	0.92 <br/>
R@1	0.92 <br/>
Number of examples: 137477 <br/>
./fasttext test model_combinednews2.bin /path/to/combinednews2.train <br/>
N	321344 <br/>
P@1	0.961 <br/>
R@1	0.961 <br/>
Number of examples: 321344 <br/>

In [61]:
import fastText

In [62]:
model = fastText.load_model('model_combinednews.bin')

In [69]:
prediction = model.predict(['Rescuing The Rescuers: Stranded Syrian White Helmets Evacuated By Israel','The Real Hero Of The British Open Is Eddie Pepperell, Who Played His Final Round Hungover'])

In [70]:
print(prediction)

([['__label__POLITICS'], ['__label__SPORTS']], array([[0.92183721],
       [0.54315639]]))
