In [1]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType, LongType, IntegerType, BooleanType

# Map the type tags used in ../CSV.header.txt onto Spark SQL type classes.
# Each value is called with no arguments to build a fresh type instance.
# NOTE(review): 'Integer' maps to LongType and 'Bool' to IntegerType --
# presumably intentional (wide ids, 0/1 flags); confirm against the data.
types = {
    'Float': FloatType,
    'Integer': LongType,
    'Long': LongType,
    'Bool': IntegerType,
}

# Build one nullable StructField per non-empty header line.
# A line is either "Name:Type" (typed column) or just "Name" (string column).
feats = []
with open('../CSV.header.txt') as header_file:
    for lineno, line in enumerate(header_file, start=1):
        line = line.strip()
        if not line:
            # A blank line (e.g. at end of file) would otherwise become a
            # column with an empty name and shift the schema.
            continue

        # Split on the first colon only, so unpacking can never fail.
        feat_name, colon, type_name = line.partition(':')
        if not colon:
            feats.append(StructField(line, StringType(), True))
            continue

        type_name = type_name.strip()
        if type_name not in types:
            raise ValueError(
                'CSV.header.txt line {}: unknown type {!r} for column {!r}'.format(
                    lineno, type_name, feat_name
                )
            )
        feats.append(StructField(feat_name.strip(), types[type_name](), True))

schema = StructType(feats)
feats

[StructField(GLOBALEVENTID,LongType,true),
 StructField(SQLDATE,LongType,true),
 StructField(MonthYear,LongType,true),
 StructField(Year,LongType,true),
 StructField(FractionDate,FloatType,true),
 StructField(Actor1Code,StringType,true),
 StructField(Actor1Name,StringType,true),
 StructField(Actor1CountryCode,StringType,true),
 StructField(Actor1KnownGroupCode,StringType,true),
 StructField(Actor1EthnicCode,StringType,true),
 StructField(Actor1Religion1Code,StringType,true),
 StructField(Actor1Religion2Code,StringType,true),
 StructField(Actor1Type1Code,StringType,true),
 StructField(Actor1Type2Code,StringType,true),
 StructField(Actor1Type3Code,StringType,true),
 StructField(Actor2Code,StringType,true),
 StructField(Actor2Name,StringType,true),
 StructField(Actor2CountryCode,StringType,true),
 StructField(Actor2KnownGroupCode,StringType,true),
 StructField(Actor2EthnicCode,StringType,true),
 StructField(Actor2Religion1Code,StringType,true),
 StructField(Actor2Religion2Code,StringType,

In [2]:
# Load every gzipped 2015 GDELT export (tab-separated, no header row) using
# the explicit schema built above instead of schema inference.
GDELT_2015_GLOB = '/home/ubuntu/data-p3-d2/2015-unzip/2015*.gz'
df = (
    spark.read
    .schema(schema)
    .option('sep', '\t')
    .csv(GDELT_2015_GLOB)
)

In [3]:
# Register the events DataFrame as the temp view "GDELT" so the cells below
# can query it with spark.sql(...).
df.createOrReplaceTempView("GDELT")

In [None]:
# Events located in Thailand. Keep this as a DataFrame (no .collect()) so it
# still has .rdd and can be registered as a temp view.
th_events = spark.sql('SELECT * FROM GDELT WHERE ActionGeo_CountryCode="TH"')
th_events.createOrReplaceTempView("TH_GDELT")

# Preview a handful of root event codes. Row attribute access is
# case-sensitive, so the column must be spelled exactly as in the schema.
# take() keeps the driver from pulling every Thai event into memory.
th_df = (
    th_events.rdd
    .map(lambda p: "EventRootCode: " + str(p.EventRootCode))
    .take(20)
)
th_df

<h2>Happiness and Sadness of the Thai Population in 2015</h2>
<p>The attributes specific to this analysis are 'AvgTone' (emotion/sentiment), 'EventRootCode' (type of event), and ActionGeo_CountryCode="TH" (events located in Thailand).</p>
<body>
    <center><h4>Sadness</h4></center>
<table>
<tr>
    <th>
        EventRootCode
    </th>
    <th>
        Event
    </th>
    <th>
        AvgTone
    </th>
</tr>
<tr>
    <td>
        18
    </td>
    <td>
        ASSAULT
    </td>
    <td> 
       -5.390966580175084 
    </td>
    </tr>
    <tr>
    <td>
        17
    </td>
    <td>
        COERCE
    </td>
    <td> 
       -4.980801102637645 
    </td>
    </tr>
       <tr>
    <td>
        19
    </td>
    <td>
        FIGHT
    </td>
    <td> 
       -4.61264282794244
    </td>
    </tr>
         <tr>
    <td>
        14
    </td>
    <td>
        PROTEST
    </td>
    <td> 
       -4.1006759595536755
    </td>
    </tr>
          <tr>
    <td>
        20
    </td>
    <td>
        ENGAGE IN UNCONVENTIONAL MASS VIOLENCE
    </td>
    <td> 
        -4.062580829729205
    </td>
    </tr>
</table>
        <center><h4>Happiness</h4></center>
<table>
<tr>
    <th>
        EventRootCode
    </th>
    <th>
        Event
    </th>
    <th>
        AvgTone
    </th>
</tr>
<tr>
    <td>
        5
    </td>
    <td>
        ENGAGE IN DIPLOMATIC COOPERATION
    </td>
    <td> 
       0.2415000501685426 
    </td>
    </tr>
    <tr>
    <td>
        3
    </td>
    <td>
        EXPRESS INTENT TO COOPERATE
    </td>
    <td> 
       -0.20847577040658494 
    </td>
    </tr>
       <tr>
    <td>
        4
    </td>
    <td>
        CONSULT
    </td>
    <td> 
       -1.024762808470614
    </td>
    </tr>
         <tr>
    <td>
        6
    </td>
    <td>
         ENGAGE IN MATERIAL COOPERATION
    </td>
    <td> 
       -1.350121166321724
    </td>
    </tr>
          <tr>
    <td>
        7
    </td>
    <td>
        PROVIDE AID
    </td>
    <td> 
        -1.4913063709509484
    </td>
    </tr>
</table>
</body>
<p>As you can see, even the event types with the highest average tone (the "happiest" ones) mostly still have negative values, so I decided to dig deeper to find which event makes people feel happiest and which one makes people feel saddest.</p>

In [4]:
# Five event root codes with the LOWEST average tone for events in Thailand.
# Keep the DataFrame in th_avgtone; .show() only prints and returns None,
# so assigning its result would leave the variable empty.
th_avgtone = spark.sql(
    'SELECT AVG(AvgTone) AS Avg_Avgtone, EventRootCode '
    'FROM GDELT '
    'WHERE ActionGeo_CountryCode = "TH" '
    'GROUP BY EventRootCode '
    'ORDER BY Avg_Avgtone '
    'LIMIT 5'
)
th_avgtone.show()

+-------------------+-------------+
|        Avg_Avgtone|EventRootCode|
+-------------------+-------------+
| -5.390966580175084|           18|
| -4.980801102637645|           17|
|  -4.61264282794244|           19|
|-4.1006759595536755|           14|
| -4.062580829729205|           20|
+-------------------+-------------+



In [5]:
# Five event root codes with the HIGHEST average tone for events in Thailand.
# Keep the DataFrame in th_avgtone; .show() only prints and returns None,
# so assigning its result would leave the variable empty.
th_avgtone = spark.sql(
    'SELECT AVG(AvgTone) AS Avg_Avgtone, EventRootCode '
    'FROM GDELT '
    'WHERE ActionGeo_CountryCode = "TH" '
    'GROUP BY EventRootCode '
    'ORDER BY Avg_Avgtone DESC '
    'LIMIT 5'
)
th_avgtone.show()

+--------------------+-------------+
|         Avg_Avgtone|EventRootCode|
+--------------------+-------------+
|  0.2415000501685426|           05|
|-0.20847577040658494|           03|
|  -1.024762808470614|           04|
|  -1.350121166321724|           06|
| -1.4913063709509484|           07|
+--------------------+-------------+



<h2>The event type with the highest single AvgTone is INVESTIGATE (EventRootCode 09), with a maximum AvgTone of 22.077923; PROVIDE AID (EventRootCode 07) peaks at 12.314225.</h2>

In [None]:
# Highest single AvgTone per event root code for events in Thailand.
# Rows come back in no particular order; scan the whole table for the maximum.
# Keep the DataFrame in max_avgtone; .show() only prints and returns None.
max_avgtone = spark.sql(
    'SELECT max(AvgTone), EventRootCode '
    'FROM GDELT '
    'WHERE ActionGeo_CountryCode = "TH" '
    'GROUP BY EventRootCode'
)
max_avgtone.show()

+------------+-------------+
|max(AvgTone)|EventRootCode|
+------------+-------------+
|   12.314225|           07|
|   7.0422535|           15|
|   15.957447|           11|
|   13.768116|           01|
|   10.638298|           16|
|   10.869565|           18|
|   12.396694|           17|
|   22.077923|           09|
|   18.023256|           05|
|   10.185185|           19|
|   13.380281|           08|
|   13.577586|           03|
|    11.61413|           02|
|   12.765958|           06|
|   3.6281178|           20|
|   11.396104|           10|
|    9.178744|           12|
|   17.171717|           04|
|    9.219858|           13|
|        10.0|           14|
+------------+-------------+



In [None]:
# Lowest single AvgTone per event root code for events in Thailand.
# Rows come back in no particular order; scan the whole table for the minimum.
# Keep the DataFrame in min_avgtone; .show() only prints and returns None.
min_avgtone = spark.sql(
    'SELECT min(AvgTone), EventRootCode '
    'FROM GDELT '
    'WHERE ActionGeo_CountryCode = "TH" '
    'GROUP BY EventRootCode'
)
min_avgtone.show()

In [None]:
# Inspect the event(s) whose AvgTone is 12.314225.
# The saved max(AvgTone) output shows AvgTone with single-precision values, so
# the literal is cast to FLOAT: a bare decimal literal makes Spark widen both
# sides to DOUBLE, where the stored float32 value no longer equals 12.314225
# and the query matches nothing.
# NOTE(review): there is no ActionGeo_CountryCode filter here, so matches from
# any country are returned -- confirm whether that is intended.
analysis = spark.sql(
    'SELECT * FROM GDELT WHERE AvgTone = CAST(12.314225 AS FLOAT)'
)
analysis.show()

Analysis
Measuring Economic Impact of Political Protest
The attributes specific to this analysis are EventRootCode = "14", representing protest,
and ActionGeo_CountryCode = "TH", representing events located in Thailand.

In [17]:
# EventRootCode "14" = protest; restrict to events located in Thailand.
# Keep the DataFrame in th_protest; .show() only prints and returns None,
# so assigning its result would leave the variable empty.
th_protest = spark.sql(
    'SELECT MonthYear, Actor1Name, Actor2Name, AvgTone, SOURCEURL '
    'FROM GDELT '
    'WHERE EventRootCode = "14" AND ActionGeo_CountryCode = "TH"'
)
th_protest.show()

+---------+-------------+----------+-----------+--------------------+
|MonthYear|   Actor1Name|Actor2Name|    AvgTone|           SOURCEURL|
+---------+-------------+----------+-----------+--------------------+
|   201509|         THAI|      THAI|       -4.8|http://www.techwo...|
|   201510|   GREENPEACE|  THAILAND| -3.8413877|http://ecowatch.c...|
|   201510|   GREENPEACE|  THAILAND| -3.8413877|http://ecowatch.c...|
|   201510|   POPULATION|      null| -1.9114689|http://thediploma...|
|   201411|        CHINA| PROTESTER|  -9.274471|http://www.newsfu...|
|   201411|        CHINA| PROTESTER|  -9.274471|http://www.newsfu...|
|   201411|     THAILAND| PROTESTER|  -9.274471|http://www.newsfu...|
|   201511|         null|GOVERNMENT|  -3.740563|http://www.whaleo...|
|   201511|    PROTESTER|      ARMY|  -3.740563|http://www.whaleo...|
|   201511|       PHUKET|   RUSSIAN|  0.6482982|http://www.thaipr...|
|   201511|MILITARY RULE|      null|-0.69124424|http://www.huffin...|
|   201411|     THAI

In [None]:
# NOTE(review): this cell contains only commented-out scratch code. `new_list`
# is not defined anywhere in the visible notebook, so it cannot run as written;
# either restore it with its inputs or delete the cell.
# from pyspark import SparkContext
# from pyspark.sql import Row

# rdd = sc.parallelize(new_list).map(lambda x: Row(feature1=x[0], feature2=x[1], pcc=x[2]))
# df_pcc = sqlContext.createDataFrame(rdd)
# df_pcc.createOrReplaceTempView("PCC")