In [1]:
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# Create the Spark session for this notebook (or reuse one that is already
# running) and keep a handle on its SparkContext for the RDD API.
spark = (
    SparkSession.builder
    .appName("News")
    .getOrCreate()
)
sc = spark.sparkContext
# Only the last bare expression of a cell is rendered as its output.
spark
sc

In [2]:
# Data source: the GDELT dataset published at
#   https://aws.amazon.com/public-datasets/gdelt/
# Column headers are listed at
#   http://gdeltproject.org/data/lookups/CSV.header.dailyupdates.txt

# Read every events/2016* object as an RDD of text lines
# (73,385,698 records when this notebook was run).
lines = sc.textFile(
    "s3://gdelt-open-data/events/2016*"
)

In [3]:
# Break every raw line into its fields. The split here is on tabs; pass a
# different delimiter to split() if the input uses another separator.
parts = lines.map(lambda row: row.split('\t'))

In [5]:
# Convert RDD into DataFrame
from urllib.request import urlopen

In [48]:
# Fetch the GDELT column-header file over HTTP on the driver and expose it as an
# RDD of lines. sc.textFile() is meant for Hadoop-supported file systems (HDFS,
# local, S3, ...); because it is lazy, pointing it at an http:// URL only fails
# once an action runs. urlopen (imported in an earlier cell) downloads the small
# header file directly, and parallelize() keeps `html` an RDD of strings.
with urlopen("http://gdeltproject.org/data/lookups/CSV.header.dailyupdates.txt") as response:
    html = sc.parallelize(response.read().decode("utf-8").splitlines())

In [50]:
# Split each header line on tabs to get the individual column names.
columns = html.map(lambda header_line: header_line.split('\t'))

In [73]:
# Name the 58 fields, in file order, and turn the RDD into a DataFrame.
df = parts.toDF(schema=[
    # Event identifier and date fields
    "GLOBALEVENTID", "SQLDATE", "MonthYear", "Year", "FractionDate",
    # Actor 1 attributes
    "Actor1Code", "Actor1Name", "Actor1CountryCode",
    "Actor1KnownGroupCode", "Actor1EthnicCode",
    "Actor1Religion1Code", "Actor1Religion2Code",
    "Actor1Type1Code", "Actor1Type2Code", "Actor1Type3Code",
    # Actor 2 attributes
    "Actor2Code", "Actor2Name", "Actor2CountryCode",
    "Actor2KnownGroupCode", "Actor2EthnicCode",
    "Actor2Religion1Code", "Actor2Religion2Code",
    "Actor2Type1Code", "Actor2Type2Code", "Actor2Type3Code",
    # Event action attributes
    "IsRootEvent", "EventCode", "EventBaseCode", "EventRootCode", "QuadClass",
    "GoldsteinScale", "NumMentions", "NumSources", "NumArticles", "AvgTone",
    # Actor 1 geography
    "Actor1Geo_Type", "Actor1Geo_FullName", "Actor1Geo_CountryCode",
    "Actor1Geo_ADM1Code", "Actor1Geo_Lat", "Actor1Geo_Long", "Actor1Geo_FeatureID",
    # Actor 2 geography
    "Actor2Geo_Type", "Actor2Geo_FullName", "Actor2Geo_CountryCode",
    "Actor2Geo_ADM1Code", "Actor2Geo_Lat", "Actor2Geo_Long", "Actor2Geo_FeatureID",
    # Action geography
    "ActionGeo_Type", "ActionGeo_FullName", "ActionGeo_CountryCode",
    "ActionGeo_ADM1Code", "ActionGeo_Lat", "ActionGeo_Long", "ActionGeo_FeatureID",
    # Record bookkeeping
    "DATEADDED", "SOURCEURL",
])

In [75]:
# Peek at the first row to check that the fields line up with the column names.
df.take(1)

[Row(GLOBALEVENTID='498554164', SQLDATE='20060103', MonthYear='200601', Year='2006', FractionDate='2006.0082', Actor1Code='BUS', Actor1Name='CORPORATION', Actor1CountryCode='', Actor1KnownGroupCode='', Actor1EthnicCode='', Actor1Religion1Code='', Actor1Religion2Code='', Actor1Type1Code='BUS', Actor1Type2Code='', Actor1Type3Code='', Actor2Code='', Actor2Name='', Actor2CountryCode='', Actor2KnownGroupCode='', Actor2EthnicCode='', Actor2Religion1Code='', Actor2Religion2Code='', Actor2Type1Code='', Actor2Type2Code='', Actor2Type3Code='', IsRootEvent='1', EventCode='051', EventBaseCode='051', EventRootCode='05', QuadClass='1', GoldsteinScale='3.4', NumMentions='6', NumSources='2', NumArticles='6', AvgTone='1.72415843232381', Actor1Geo_Type='4', Actor1Geo_FullName='American Creek, British Columbia, Canada', Actor1Geo_CountryCode='CA', Actor1Geo_ADM1Code='CA02', Actor1Geo_Lat='49.05', Actor1Geo_Long='-116', Actor1Geo_FeatureID='-560119', Actor2Geo_Type='0', Actor2Geo_FullName='', Actor2Geo_Co

In [76]:
# Total number of rows in the DataFrame.
df.count()

73385698

In [78]:
# Print each column's name, type and nullability.
df.printSchema()

root
 |-- GLOBALEVENTID: string (nullable = true)
 |-- SQLDATE: string (nullable = true)
 |-- MonthYear: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- FractionDate: string (nullable = true)
 |-- Actor1Code: string (nullable = true)
 |-- Actor1Name: string (nullable = true)
 |-- Actor1CountryCode: string (nullable = true)
 |-- Actor1KnownGroupCode: string (nullable = true)
 |-- Actor1EthnicCode: string (nullable = true)
 |-- Actor1Religion1Code: string (nullable = true)
 |-- Actor1Religion2Code: string (nullable = true)
 |-- Actor1Type1Code: string (nullable = true)
 |-- Actor1Type2Code: string (nullable = true)
 |-- Actor1Type3Code: string (nullable = true)
 |-- Actor2Code: string (nullable = true)
 |-- Actor2Name: string (nullable = true)
 |-- Actor2CountryCode: string (nullable = true)
 |-- Actor2KnownGroupCode: string (nullable = true)
 |-- Actor2EthnicCode: string (nullable = true)
 |-- Actor2Religion1Code: string (nullable = true)
 |-- Actor2Religion2Code: stri

In [79]:
# Register the DataFrame as a temporary view named "df" so that it can be
# queried with spark.sql().
df.createOrReplaceTempView("df")

In [84]:
# Look at the first five rows.
df.take(5)

[Row(GLOBALEVENTID='498554164', SQLDATE='20060103', MonthYear='200601', Year='2006', FractionDate='2006.0082', Actor1Code='BUS', Actor1Name='CORPORATION', Actor1CountryCode='', Actor1KnownGroupCode='', Actor1EthnicCode='', Actor1Religion1Code='', Actor1Religion2Code='', Actor1Type1Code='BUS', Actor1Type2Code='', Actor1Type3Code='', Actor2Code='', Actor2Name='', Actor2CountryCode='', Actor2KnownGroupCode='', Actor2EthnicCode='', Actor2Religion1Code='', Actor2Religion2Code='', Actor2Type1Code='', Actor2Type2Code='', Actor2Type3Code='', IsRootEvent='1', EventCode='051', EventBaseCode='051', EventRootCode='05', QuadClass='1', GoldsteinScale='3.4', NumMentions='6', NumSources='2', NumArticles='6', AvgTone='1.72415843232381', Actor1Geo_Type='4', Actor1Geo_FullName='American Creek, British Columbia, Canada', Actor1Geo_CountryCode='CA', Actor1Geo_ADM1Code='CA02', Actor1Geo_Lat='49.05', Actor1Geo_Long='-116', Actor1Geo_FeatureID='-560119', Actor2Geo_Type='0', Actor2Geo_FullName='', Actor2Geo_Co

## Data Cleansing

In [87]:
# These columns are loaded as strings but hold decimal values, so cast them to float.
# NOTE: reassigning `df` does not change a temp view that was registered from the
# earlier all-string DataFrame; re-register the view if SQL queries need these types.
for float_column in ("GoldsteinScale", "AvgTone", "Actor1Geo_Lat", "Actor1Geo_Long"):
    df = df.withColumn(float_column, df[float_column].cast('float'))

In [88]:
from pyspark.sql.types import IntegerType

# The mention, source and article counters are whole numbers: cast them to integer.
for count_column in ("NumMentions", "NumSources", "NumArticles"):
    df = df.withColumn(count_column, df[count_column].cast(IntegerType()))

In [None]:
# No need to remove missing values for the analysis: the event happened regardless, and there may be only one actor.
# But consider removing missing values before building models. To remove:
# df.na.drop()

In [90]:
# Count the missing values in each column.
# isnan() alone only matches floating-point NaN. In this data a missing field is
# an empty string in the string columns (see the rows displayed earlier) and is
# expected to be null in columns that were cast to a numeric type, so the test
# is chosen per column type.
from pyspark.sql.functions import isnan, when, count, col

missing_counts = []
for column_name, column_type in df.dtypes:
    is_missing = col(column_name).isNull()
    if column_type in ("float", "double"):
        # NaN is a distinct "not a number" value that only exists for floating-point types.
        is_missing = is_missing | isnan(col(column_name))
    elif column_type == "string":
        is_missing = is_missing | (col(column_name) == "")
    # when() yields a non-null marker only for missing rows; count() tallies those markers.
    missing_counts.append(count(when(is_missing, 1)).alias(column_name))

df.select(missing_counts).show()

+-------------+-------+---------+----+------------+----------+----------+-----------------+--------------------+----------------+-------------------+-------------------+---------------+---------------+---------------+----------+----------+-----------------+--------------------+----------------+-------------------+-------------------+---------------+---------------+---------------+-----------+---------+-------------+-------------+---------+--------------+-----------+----------+-----------+-------+--------------+------------------+---------------------+------------------+-------------+--------------+-------------------+--------------+------------------+---------------------+------------------+-------------+--------------+-------------------+--------------+------------------+---------------------+------------------+-------------+--------------+-------------------+---------+---------+
|GLOBALEVENTID|SQLDATE|MonthYear|Year|FractionDate|Actor1Code|Actor1Name|Actor1CountryCode|Actor1KnownGrou

In [92]:
# gdelt is also available as a Python library, so it can be installed and imported.
# Please keep using the df loaded earlier as the dataset, and do not use this library's data directly,
# because I don't think it meets Marck's requirement for a large dataset.
# Instead, you can use it to guide the analysis since it's easier to navigate.
# NOTE(review): a bare `pip install` is not Python syntax and relies on the notebook
# treating it as a magic command; `%pip install` with a pinned version would be more reproducible.
pip install gdelt

Collecting gdelt
  Downloading gdelt-0.1.10.6.1-py2.py3-none-any.whl (773 kB)
[K     |████████████████████████████████| 773 kB 40.1 MB/s eta 0:00:01
Installing collected packages: gdelt
Successfully installed gdelt-0.1.10.6
Note: you may need to restart the kernel to use updated packages.


In [93]:
import gdelt

# Create a gdelt client, requesting version 2.
gd = gdelt.gdelt(version=2)

# Search the 'events' table for the two dates given
# (presumably treated as a date range - confirm against the gdelt library docs).
results = gd.Search(
    ['2016 10 19', '2016 10 22'],
    table='events',
    coverage=True,
    translation=False,
)

In [95]:
# Preview the first ten rows of the search results.
results.head(10)

Unnamed: 0,GLOBALEVENTID,SQLDATE,MonthYear,Year,FractionDate,Actor1Code,Actor1Name,Actor1CountryCode,Actor1KnownGroupCode,Actor1EthnicCode,...,ActionGeo_Type,ActionGeo_FullName,ActionGeo_CountryCode,ActionGeo_ADM1Code,ActionGeo_ADM2Code,ActionGeo_Lat,ActionGeo_Long,ActionGeo_FeatureID,DATEADDED,SOURCEURL
0,590338235,20151020,201510,2015,2015.7945,,,,,,...,4,"Tambaram, Tamil Nadu, India",IN,IN25,70244,12.9272,80.1106,-2112548,20161019003000,http://www.thehindu.com/news/cities/chennai/de...
1,590338236,20151020,201510,2015,2015.7945,AUS,AUSTRALIAN,AUS,,,...,4,"Kuala Lumpur, Kuala Lumpur, Malaysia",MY,MY14,18585,3.16667,101.7,-2403010,20161019003000,http://www.malaysiakini.com/news/359575
2,590338237,20151020,201510,2015,2015.7945,AUS,AUSTRALIAN,AUS,,,...,4,"Ulaanbaatar, Ulaanbaatar, Mongolia",MG,MG20,123600,47.9167,106.917,-2353539,20161019003000,http://www.malaysiakini.com/news/359575
3,590338238,20151020,201510,2015,2015.7945,BUS,BANK,,,,...,5,"Tamil Nadu, Tamil Nadu, India",IN,IN25,70248,11.0,78.0,-2112557,20161019003000,http://www.thehindu.com/news/cities/chennai/de...
4,590338239,20151020,201510,2015,2015.7945,BUS,BANK,,,,...,4,"Tambaram, Tamil Nadu, India",IN,IN25,70244,12.9272,80.1106,-2112548,20161019003000,http://www.thehindu.com/news/cities/chennai/de...
5,590338240,20151020,201510,2015,2015.7945,GOVHLH,HEALTH DEPARTMENT,,,,...,4,"Delhi, Delhi, India",IN,IN07,17911,28.6667,77.2167,-2094230,20161019003000,http://www.thehindu.com/news/cities/Delhi/dcw-...
6,590338241,20151020,201510,2015,2015.7945,IND,DELHI,IND,,,...,4,"Delhi, Delhi, India",IN,IN07,17911,28.6667,77.2167,-2094230,20161019003000,http://www.thehindu.com/news/cities/Delhi/dcw-...
7,590338242,20151020,201510,2015,2015.7945,INDGOV,DELHI,IND,,,...,4,"Delhi, Delhi, India",IN,IN07,17911,28.6667,77.2167,-2094230,20161019003000,http://www.thehindu.com/news/cities/Delhi/dcw-...
8,590338243,20151020,201510,2015,2015.7945,JUDJUD,SUPERIOR COURT,,,,...,2,"Connecticut, United States",US,USCT,,41.5834,-72.7622,CT,20161019003000,http://marcellus.com/news/id/140165/court-rule...
9,590338244,20151020,201510,2015,2015.7945,MNCUSAMED,GOOGLE,USA,,,...,3,"Philadelphia, Pennsylvania, United States",US,USPA,PA101,39.9523,-75.1638,1209052,20161019003000,http://wisconsingazette.com/2016/10/18/robots-...


In [89]:
# Count the events in each QuadClass and list the most frequent classes first
# (the query groups on QuadClass, not on EventCode).
# For reference: EventCode is the raw CAMEO action code describing the action that Actor1 performed upon Actor2.
# The query runs against the "df" temporary view registered with createOrReplaceTempView.
spark.sql("""SELECT QuadClass, count(*) as count from df 
            group by QuadClass order by count desc limit 10""").show()

+---------+--------+
|QuadClass|   count|
+---------+--------+
|        1|44668851|
|        4|10784200|
|        3| 9542781|
|        2| 8389866|
+---------+--------+

