# Schema

In [55]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType, LongType, IntegerType, BooleanType

# Maps the type tag written after ':' in CSV.header.txt to a Spark type class.
# 'Integer' is widened to LongType; 'Bool' is read as IntegerType
# (presumably because the files encode flags as 0/1 -- TODO confirm).
types = {
    'Float': FloatType,
    'Integer': LongType,
    'Long': LongType,
    'Bool': IntegerType
}

# Build one nullable StructField per non-blank header line, in file order.
# A line is either "name:Type" or just "name" (untyped columns are strings).
feats = []
with open('CSV.header.txt') as header_file:
    for lineno, line in enumerate(header_file, start=1):
        line = line.strip()
        if not line:
            # A blank line would otherwise become a column with an empty name.
            continue
        if ':' in line:
            # The type tag is the last ':'-separated piece of the line.
            feat_name, type_name = line.rsplit(':', 1)
            if type_name not in types:
                raise KeyError('CSV.header.txt line {}: unknown type {!r}'.format(lineno, type_name))
            feats.append(StructField(feat_name, types[type_name](), True))
        else:
            feats.append(StructField(line, StringType(), True))

schema = StructType(feats)
feats

[StructField(GLOBALEVENTID,LongType,true),
 StructField(SQLDATE,LongType,true),
 StructField(MonthYear,LongType,true),
 StructField(Year,LongType,true),
 StructField(FractionDate,FloatType,true),
 StructField(Actor1Code,StringType,true),
 StructField(Actor1Name,StringType,true),
 StructField(Actor1CountryCode,StringType,true),
 StructField(Actor1KnownGroupCode,StringType,true),
 StructField(Actor1EthnicCode,StringType,true),
 StructField(Actor1Religion1Code,StringType,true),
 StructField(Actor1Religion2Code,StringType,true),
 StructField(Actor1Type1Code,StringType,true),
 StructField(Actor1Type2Code,StringType,true),
 StructField(Actor1Type3Code,StringType,true),
 StructField(Actor2Code,StringType,true),
 StructField(Actor2Name,StringType,true),
 StructField(Actor2CountryCode,StringType,true),
 StructField(Actor2KnownGroupCode,StringType,true),
 StructField(Actor2EthnicCode,StringType,true),
 StructField(Actor2Religion1Code,StringType,true),
 StructField(Actor2Religion2Code,StringType,

# Dataframe

In [56]:
# Load every file matched by the glob as tab-separated text, applying the
# explicit `schema` instead of letting Spark infer column types.
gdelt_glob = '/Users/mathieuclement/Downloads/gdelt/2015-2016/2016*.gz'
df = spark.read.csv(gdelt_glob, schema=schema, sep='\t')

# Show the first row as a quick check that the columns were parsed.
df.take(1)

[Row(GLOBALEVENTID=597122373, SQLDATE=20151110, MonthYear=201511, Year=2015, FractionDate=2015.8492431640625, Actor1Code=None, Actor1Name=None, Actor1CountryCode=None, Actor1KnownGroupCode=None, Actor1EthnicCode=None, Actor1Religion1Code=None, Actor1Religion2Code=None, Actor1Type1Code=None, Actor1Type2Code=None, Actor1Type3Code=None, Actor2Code='BUS', Actor2Name='BOSS', Actor2CountryCode=None, Actor2KnownGroupCode=None, Actor2EthnicCode=None, Actor2Religion1Code=None, Actor2Religion2Code=None, Actor2Type1Code='BUS', Actor2Type2Code=None, Actor2Type3Code=None, IsRootEvent=1, EventCode='042', EventBaseCode='042', EventRootCode='04', QuadClass=1, GoldsteinScale=1.899999976158142, NumMentions=6, NumSources=1, NumArticles=6, AvgTone=0.8064516186714172, Actor1Geo_Type=0, Actor1Geo_FullName=None, Actor1Geo_CountryCode=None, Actor1Geo_ADM1Code=None, Actor1Geo_Lat=None, Actor1Geo_Long=None, Actor1Geo_FeatureID=None, Actor2Geo_Type=4, Actor2Geo_FullName='Sunshine Coast, Queensland, Australia', A

In [27]:
# Total number of rows in df, i.e. across all files matched when it was loaded.
df.count()

139756517

In [28]:
# Register df under the name V_GDELT so it can be queried through spark.sql();
# an existing view with that name is replaced.
df.createOrReplaceTempView("V_GDELT")

In [61]:
# Count the rows of V_GDELT whose action location country code is "US".
# collect() returns the single result row wrapped in a list.
us_count_query = 'SELECT COUNT(ActionGeo_CountryCode) FROM V_GDELT WHERE ActionGeo_CountryCode="US"'
us_events = spark.sql(us_count_query).collect()
us_events

[Row(count(ActionGeo_CountryCode)=5161731)]

In [30]:
# Peek at up to ten events from a 10% sample (without replacement) where either
# the action location code is 'SZ' or actor 1's country code is 'CHE'.
# The two columns use different code systems for the same country -- other
# rows in this notebook pair 'SZ' with Geneva, Switzerland and 'CHE' with GENEVA.
#
# The seed is fixed so that re-running the cell draws the same sample, and the
# filter is a DataFrame expression so rows need not pass through a Python lambda.
# A NULL country code never satisfies the predicate, which matches the
# behaviour of comparing None with == in Python.
df\
    .sample(False, .1, seed=42)\
    .filter("ActionGeo_CountryCode = 'SZ' OR Actor1CountryCode = 'CHE'")\
    .take(10)

[Row(GLOBALEVENTID='597123443', SQLDATE='20161109', MonthYear='201611', Year='2016', FractionDate='2016.8466', Actor1Code='CHE', Actor1Name='GENEVA', Actor1CountryCode='CHE', Actor1KnownGroupCode=None, Actor1EthnicCode=None, Actor1Religion1Code=None, Actor1Religion2Code=None, Actor1Type1Code=None, Actor1Type2Code=None, Actor1Type3Code=None, Actor2Code='COP', Actor2Name='POLICE', Actor2CountryCode=None, Actor2KnownGroupCode=None, Actor2EthnicCode=None, Actor2Religion1Code=None, Actor2Religion2Code=None, Actor2Type1Code='COP', Actor2Type2Code=None, Actor2Type3Code=None, IsRootEvent='1', EventCode='090', EventBaseCode='090', EventRootCode='09', QuadClass='2', GoldsteinScale='-2.0', NumMentions='170', NumSources='16', NumArticles='170', AvgTone='-8.39536459861583', Actor1Geo_Type='4', Actor1Geo_FullName='Baghdad, Baghdad, Iraq', Actor1Geo_CountryCode='IZ', Actor1Geo_ADM1Code='IZ07', Actor1Geo_Lat='33.3386', Actor1Geo_Long='44.3939', Actor1Geo_FeatureID='-3103581', Actor2Geo_Type='4', Actor

In [59]:
def either_country_code(row, country1, country2):
    """Tell whether the row's two actors are exactly the given pair of countries.

    The order does not matter: actor 1 may be `country1` and actor 2
    `country2`, or the other way round.

    Args:
        row: object exposing `Actor1CountryCode` and `Actor2CountryCode`.
        country1: first country code to look for.
        country2: second country code to look for.

    Returns:
        True when both actor codes match the pair in either order, else False.
    """
    actor_pair = (row.Actor1CountryCode, row.Actor2CountryCode)
    return actor_pair in ((country1, country2), (country2, country1))

# Keep only the events whose two actors are CHE and USA (in either order) ...
che_usa_rows = df.rdd.filter(lambda row: either_country_code(row, 'CHE', 'USA'))

# ... and show the ten with the most mentions; negating the key makes
# takeOrdered, which sorts ascending, return the largest values first.
che_usa_rows.takeOrdered(10, key=lambda row: -row.NumMentions)

[Row(GLOBALEVENTID=576531412, SQLDATE=20160907, MonthYear=201609, Year=2016, FractionDate=2016.6767578125, Actor1Code='USAMED', Actor1Name='ASSOCIATED PRESS', Actor1CountryCode='USA', Actor1KnownGroupCode=None, Actor1EthnicCode=None, Actor1Religion1Code=None, Actor1Religion2Code=None, Actor1Type1Code='MED', Actor1Type2Code=None, Actor1Type3Code=None, Actor2Code='CHE', Actor2Name='GENEVA', Actor2CountryCode='CHE', Actor2KnownGroupCode=None, Actor2EthnicCode=None, Actor2Religion1Code=None, Actor2Religion2Code=None, Actor2Type1Code=None, Actor2Type2Code=None, Actor2Type3Code=None, IsRootEvent=0, EventCode='070', EventBaseCode='070', EventRootCode='07', QuadClass=2, GoldsteinScale=7.0, NumMentions=1528, NumSources=136, NumArticles=1528, AvgTone=-9.177913665771484, Actor1Geo_Type=4, Actor1Geo_FullName='Geneva, Genè, Switzerland', Actor1Geo_CountryCode='SZ', Actor1Geo_ADM1Code='SZ07', Actor1Geo_Lat=46.19559860229492, Actor1Geo_Long=6.1481099128723145, Actor1Geo_FeatureID=-2552151, Actor2Geo_

In [64]:
import re

# Two sample URLs: the first mentions "Trump" as its own word, the second only
# contains the letters inside another word ("thetrumpet").
url = 'http://www.politics.co.uk/comment-analysis/2015/01/06/comment-arms-sales-Trump-human-rights-as-uk-enters-bahrain'
url2 = 'http://www.thetrumpet.com/blabla'

# Match "trump" only when it is not glued to other lowercase letters.
# Each side accepts either a non-letter character or the edge of the string,
# so a URL that starts or ends with the word is matched as well.
# The pattern is lowercase-only: callers must lowercase the text first.
pattern = re.compile('(?:^|[^a-z])trump(?:[^a-z]|$)')
print(pattern.search(url.lower()))
print(pattern.search(url2.lower()))

# Count the events whose source URL matches `pattern` (the standalone-word
# "trump" regex compiled earlier in this cell).
# The truthiness check comes first so rows with a missing or empty SOURCEURL
# are dropped before .lower() is called on them.
df\
    .rdd\
    .filter(lambda row: bool(row.SOURCEURL) and pattern.search(row.SOURCEURL.lower()) is not None)\
    .count()

<_sre.SRE_Match object; span=(72, 79), match='-trump-'>
None
