In [48]:
# Imports and notebook-wide setup (all dependencies are declared in this cell).
import pandas as pd
import numpy as np
import scipy as sp
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline
import matplotlib.pyplot as plt

# findspark.init() is called before the pyspark imports below so that the
# local Spark installation can be imported as a regular Python package.
import findspark
findspark.init()

# NOTE(review): the wildcard import hides which pyspark.sql names are really
# used (SparkSession is the only one referenced in the cells visible here);
# consider explicit imports once the rest of the notebook has been checked.
from pyspark.sql import *
import pyspark.sql.functions as sf
import re

# Silences every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings from pandas / pyspark;
# consider restricting the filter to the specific category that is noisy.
import warnings
warnings.filterwarnings('ignore')

import seaborn as sns 
from scipy.stats import pearsonr

In [49]:
# Spark entry points shared by every cell below:
#   spark - the SparkSession (an already-active session is reused if one exists)
#   sc    - the SparkContext owned by that session
spark = (
    SparkSession
    .builder
    .getOrCreate()
)
sc = spark.sparkContext

In [50]:
# Load the per-country table stored in filter_country.parquet
# (path is relative to the notebook's working directory).
country_filter = spark.read.format("parquet").load("filter_country.parquet")

In [51]:
# Preview the table: at most 20 rows, long cell values truncated
# (these are the defaults of DataFrame.show, spelled out explicitly).
country_filter.show(n=20, truncate=True)

+-------+------+
|country| count|
+-------+------+
|     IN|318680|
|     JM| 24705|
|     AU|169433|
|     PK| 45024|
|     SG| 32116|
|     NG| 37824|
|     TZ|  7044|
|     CA|433496|
|     US|483595|
|     GB|471916|
|     MY| 29945|
|     ZA|155345|
|     LK|  8295|
|     BD| 12225|
|     HK|  5498|
|     NZ| 77504|
|     GH| 18439|
|     IE|145216|
|     KE| 23016|
|     PH| 65937|
+-------+------+



In [52]:
# Load the main article-level dataset from data.parquet
# (path is relative to the notebook's working directory).
data = spark.read.format("parquet").load("data.parquet")

In [53]:
# Display the StructType describing the columns of `data`
# (shown as the cell's output value because it is the last expression).
# NOTE(review): `data.printSchema()` would print the same information as an
# indented tree, which is easier to read than the one-line repr.
data.schema

StructType(List(StructField(textID,IntegerType,true),StructField(#words,IntegerType,true),StructField(date,DateType,true),StructField(country,StringType,true),StructField(website,StringType,true),StructField(url,StringType,true),StructField(title,StringType,true)))

In [54]:
# Preview the first rows of `data`: at most 20 rows, long cell values
# truncated (the DataFrame.show defaults, spelled out explicitly).
# NOTE(review): the `date` column in the rendered output reads 0010-01-01,
# which looks like a mis-parsed year upstream - confirm before using dates.
data.show(n=20, truncate=True)

+-------+------+----------+-------+--------------------+--------------------+--------------------+
| textID|#words|      date|country|             website|                 url|               title|
+-------+------+----------+-------+--------------------+--------------------+--------------------+
|1334669|   334|0010-01-01|     US|        The Next Web|http://thenextweb...|Believe it or not...|
|1334671|   493|0010-01-01|     US|     People Magazine|http://www.people...|INSIDE STORY: The...|
|1334672|  1255|0010-01-01|     US|San Francisco Chr...|http://www.sfgate...|Biblical scholar'...|
|1334673|   695|0010-01-01|     US|                 CNN|http://www.cnn.co...|What you need to ...|
|1334674|   724|0010-01-01|     US|       MedPage Today|http://www.medpag...|Lack of Sleep Lin...|
|1334676|  1418|0010-01-01|     US|Cracked.com (sati...|http://www.cracke...|5 Real Buried Tre...|
|1334678|  1247|0010-01-01|     US|      Common Dreams |http://www.common...|The Real Top Ten ...|
|1334679| 

In [55]:
# Load filter_urls.parquet and preview it (at most 20 rows, truncated cells).
# NOTE(review): the rows rendered below carry the same country/count values as
# the filter_country.parquet preview earlier in the notebook - confirm that
# this file is not an accidental copy of that table.
country_urls = spark.read.format("parquet").load("filter_urls.parquet")
country_urls.show(n=20, truncate=True)

+-------+------+
|country| count|
+-------+------+
|     IN|318680|
|     JM| 24705|
|     AU|169433|
|     PK| 45024|
|     IE|145216|
|     HK|  5498|
|     SG| 32116|
|     GH| 18439|
|     PH| 65937|
|     ZA|155345|
|     NG| 37824|
|     US|483595|
|     MY| 29945|
|     LK|  8295|
|     CA|433496|
|     TZ|  7044|
|     NZ| 77504|
|     GB|471916|
|     KE| 23016|
|     BD| 12225|
+-------+------+



In [56]:
# Load filter_sources.parquet and preview it; per the rendered output the
# table has one row per (country, website) pair with a `count` column.
# Only the first 20 rows are printed, with long website names truncated.
country_sources = spark.read.format("parquet").load("filter_sources.parquet")
country_sources.show(n=20, truncate=True)

+-------+--------------------+-----+
|country|             website|count|
+-------+--------------------+-----+
|     US|            Benzinga|   19|
|     US|      24/7 Wall St. |   62|
|     US|Film School Rejects |   79|
|     US|McCook Daily Gazette|    4|
|     US|        HTCPedia.com|    1|
|     US| Pro Wrestling Torch|   13|
|     US|       The Batavian |   32|
|     US|Broomfield Enterp...|    2|
|     US|Du Quoin Evening ...|    2|
|     US|          MovieViral|    2|
|     US|    Access Hollywood|  115|
|     US|    Sun Star Courier|   84|
|     US|Motorcycle Cruise...|   15|
|     US|   Assembly Magazine|    2|
|     US|         Labor Notes|    7|
|     US|           ADDitude |   11|
|     US|Shakin The Southl...|   14|
|     US|University of Roc...|   48|
|     US|   Metal Underground|    3|
|     US|Charleston Post C...|  166|
+-------+--------------------+-----+
only showing top 20 rows



In [57]:
# Load filter_words.parquet and preview it; per the rendered output the
# table has a `country` column and a `words_per_country` column.
# At most 20 rows are printed, with long cell values truncated.
country_words = spark.read.format("parquet").load("filter_words.parquet")
country_words.show(n=20, truncate=True)

+-------+-----------------+
|country|words_per_country|
+-------+-----------------+
|     AU|        114973759|
|     PK|         29278292|
|     IN|        175954781|
|     JM|         16254579|
|     NZ|         39611915|
|     BD|          7463981|
|     KE|         10773652|
|     NG|         31525542|
|     IE|         81855784|
|     ZA|         93694663|
|     CA|        303942942|
|     GB|        323357618|
|     MY|         18090037|
|     SG|         13646677|
|     TZ|          3967575|
|     PH|         35888251|
|     US|        430419515|
|     LK|          5832367|
|     GH|         11884521|
|     HK|          4068451|
+-------+-----------------+

