# Schema

In [47]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType, LongType, IntegerType, BooleanType

# Maps the type tag used in the header file ("name:Tag") to a zero-argument
# factory for the matching Spark SQL type. The values are the type classes
# themselves, so types[tag]() still builds a fresh instance.
# NOTE(review): FloatType is single precision; the sample output shows rounding
# artefacts on coordinates/tones. Consider DoubleType if precision matters.
types = {
    'Float': FloatType,
    'Integer': IntegerType,
    'Long': LongType,
    # 'Bool' columns are deliberately read as integers -- presumably because
    # the data stores flags as 0/1 (TODO confirm against the header file).
    'Bool': IntegerType,
}

# Build one nullable StructField per non-blank header line.
# Lines look like "name:Tag"; a line without ':' is treated as a string column.
feats = []
with open('/Users/mathieuclement/Downloads/gdelt/CSV.header.txt') as header_file:
    for lineno, line in enumerate(header_file, start=1):
        line = line.strip()
        if not line:
            # A blank (e.g. trailing) line would otherwise become an unnamed column.
            continue

        feat_name, colon, type_name = line.partition(':')
        if not colon:
            feats.append(StructField(line, StringType(), True))
            continue

        feat_name = feat_name.strip()
        type_name = type_name.strip()
        if type_name not in types:
            # Fail with the offending line instead of a bare KeyError.
            raise ValueError(
                'CSV.header.txt line {}: unknown type {!r} for column {!r} (expected one of {})'.format(
                    lineno, type_name, feat_name, sorted(types)
                )
            )
        feats.append(StructField(feat_name, types[type_name](), True))

schema = StructType(feats)
feats

[StructField(GLOBALEVENTID,LongType,true),
 StructField(SQLDATE,LongType,true),
 StructField(MonthYear,IntegerType,true),
 StructField(Year,IntegerType,true),
 StructField(FractionDate,FloatType,true),
 StructField(Actor1Code,StringType,true),
 StructField(Actor1Name,StringType,true),
 StructField(Actor1CountryCode,StringType,true),
 StructField(Actor1KnownGroupCode,StringType,true),
 StructField(Actor1EthnicCode,StringType,true),
 StructField(Actor1Religion1Code,StringType,true),
 StructField(Actor1Religion2Code,StringType,true),
 StructField(Actor1Type1Code,StringType,true),
 StructField(Actor1Type2Code,StringType,true),
 StructField(Actor1Type3Code,StringType,true),
 StructField(Actor2Code,StringType,true),
 StructField(Actor2Name,StringType,true),
 StructField(Actor2CountryCode,StringType,true),
 StructField(Actor2KnownGroupCode,StringType,true),
 StructField(Actor2EthnicCode,StringType,true),
 StructField(Actor2Religion1Code,StringType,true),
 StructField(Actor2Religion2Code,Strin

# Dataframe

In [49]:
# Configure the reader first (explicit schema, tab as field separator), then
# load every file matching *.export.csv into one DataFrame.
gdelt_reader = spark.read.schema(schema).option('sep', '\t')
df = gdelt_reader.csv('/Users/mathieuclement/Downloads/gdelt/*.export.csv')

# Show the first row as a sanity check of the parsed columns.
df.take(1)

[Row(GLOBALEVENTID=704806527, SQLDATE=20071111, MonthYear=200711, Year=2007, FractionDate=2007.85205078125, Actor1Code=None, Actor1Name=None, Actor1CountryCode=None, Actor1KnownGroupCode=None, Actor1EthnicCode=None, Actor1Religion1Code=None, Actor1Religion2Code=None, Actor1Type1Code=None, Actor1Type2Code=None, Actor1Type3Code=None, Actor2Code='SAU', Actor2Name='SAUDI ARABIA', Actor2CountryCode='SAU', Actor2KnownGroupCode=None, Actor2EthnicCode=None, Actor2Religion1Code=None, Actor2Religion2Code=None, Actor2Type1Code=None, Actor2Type2Code=None, Actor2Type3Code=None, IsRootEvent=1, EventCode='190', EventBaseCode='190', EventRootCode='19', QuadClass=4, GoldsteinScale=-10.0, NumMentions=708, NumSources=54, NumArticles=708, AvgTone=-5.323681831359863, Actor1Geo_Type=0, Actor1Geo_FullName=None, Actor1Geo_CountryCode=None, Actor1Geo_ADM1Code=None, Actor1Geo_Lat=None, Actor1Geo_Long=None, Actor1Geo_FeatureID=None, Actor2Geo_Type=4, Actor2Geo_FullName='Riyadh, Ar Riya?, Saudi Arabia', Actor2Geo

In [50]:
# Total number of rows in df.
df.count()

15902791

In [51]:
# Register df as the temporary view "V_GDELT" so it can be queried through spark.sql.
df.createOrReplaceTempView("V_GDELT")

In [61]:
# Count the rows of V_GDELT whose ActionGeo_CountryCode equals 'US'.
# The literal is single-quoted (standard SQL): a double-quoted "US" is parsed
# as an identifier when ANSI double-quoted identifiers are enabled.
us_events = spark.sql(
    "SELECT COUNT(ActionGeo_CountryCode) FROM V_GDELT WHERE ActionGeo_CountryCode = 'US'"
).collect()
us_events

[Row(count(ActionGeo_CountryCode)=5161731)]

In [63]:
# Peek at up to 10 rows, drawn from a ~10% sample (without replacement), whose
# ActionGeo_CountryCode is 'SZ'.
# - The filter is a DataFrame column expression, so rows are not converted to
#   an RDD and passed one by one through a Python lambda.
# - The explicit seed makes the sample repeatable across runs on the same input.
(
    df
    .sample(withReplacement=False, fraction=0.1, seed=42)
    .filter(df.ActionGeo_CountryCode == 'SZ')
    .take(10)
)

[Row(GLOBALEVENTID=704856482, SQLDATE=20171108, MonthYear=201711, Year=2017, FractionDate=2017.84375, Actor1Code='CHE', Actor1Name='SWITZERLAND', Actor1CountryCode='CHE', Actor1KnownGroupCode=None, Actor1EthnicCode=None, Actor1Religion1Code=None, Actor1Religion2Code=None, Actor1Type1Code=None, Actor1Type2Code=None, Actor1Type3Code=None, Actor2Code='GOV', Actor2Name='GOVERNMENT', Actor2CountryCode=None, Actor2KnownGroupCode=None, Actor2EthnicCode=None, Actor2Religion1Code=None, Actor2Religion2Code=None, Actor2Type1Code='GOV', Actor2Type2Code=None, Actor2Type3Code=None, IsRootEvent=1, EventCode='020', EventBaseCode='020', EventRootCode='02', QuadClass=1, GoldsteinScale=3.0, NumMentions=4, NumSources=1, NumArticles=4, AvgTone=0.11976047605276108, Actor1Geo_Type=4, Actor1Geo_FullName='Bern, Bern, Switzerland', Actor1Geo_CountryCode='SZ', Actor1Geo_ADM1Code='SZ05', Actor1Geo_Lat=46.91669845581055, Actor1Geo_Long=7.466670036315918, Actor1Geo_FeatureID=-2551235, Actor2Geo_Type=4, Actor2Geo_Fu