In [1]:
from __future__ import print_function
%matplotlib inline
import matplotlib.pylab as plt
import sys, os, glob
import numpy as np
import subprocess
import datetime
import time

from ipywidgets import interact, interactive, fixed
import ipywidgets as widgets

from IPython.display import HTML
import xml.etree.ElementTree as ET
# Parse the YARN site configuration found in the directory named by HADOOP_CONF_DIR.
try:
    tree = ET.parse(os.path.join(os.environ['HADOOP_CONF_DIR'], 'yarn-site.xml'))
except (KeyError, IOError):
    # KeyError: HADOOP_CONF_DIR is not set at all;
    # IOError:  the variable is set but yarn-site.xml is missing or unreadable.
    # Both cases are reported with the same, actionable message.
    raise IOError("Can't find the yarn configuration -- is HADOOP_CONF_DIR set?")

In [2]:
# Root element of the parsed yarn-site.xml
root = tree.getroot()
# Address of the YARN resource manager web application, as configured in yarn-site.xml
yarn_web_app = root.findall("./property[name='yarn.resourcemanager.webapp.address']")[0].find('value').text
# HTML snippet linking to the YARN scheduler page; .format() fills in the
# {yarn_web_app} placeholder so that the link points at the real address
yarn_web_app_string = "If this works successfully, you can check the <a target='_blank' href='http://{yarn_web_app}'>YARN application scheduler</a> and you should see your app listed there. Clicking on the 'Application Master' link will bring up the familiar Spark Web UI. ".format(yarn_web_app=yarn_web_app)

Load the Spark configuration used when executing this notebook; it is taken from the spark_workshop repository (https://github.com/rokroskar/spark_workshop).

In [3]:
# configuration directory
os.environ['SPARK_CONF_DIR'] = os.path.realpath('../../spark_workshop/notebooks/twitter_dataframes/spark_config')

# number of cores, read from LSB_DJOB_NUMPROC (presumably set by the batch system -- confirm);
# falls back to a single core when the variable is not set
ncores = int(os.environ.get('LSB_DJOB_NUMPROC', 1))

# here we set the memory we want spark to use for the driver JVM: 0.7 GB per core.
# '%dG' truncates towards zero, so a single core would produce the unusable '0G';
# clamp the value to at least 1G.
os.environ['SPARK_DRIVER_MEMORY'] = '%dG' % max(1, int(ncores * 0.7))

# python executable; universal_newlines=True makes check_output return text on both
# Python 2 and Python 3 (os.environ rejects bytes values on Python 3)
os.environ['PYSPARK_PYTHON'] = subprocess.check_output('which python', shell=True,
                                                       universal_newlines=True).rstrip()

In [4]:
import findspark
findspark.init()

import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, HiveContext
import pyspark.sql.functions as func
from pyspark.sql import Row, Window
from pyspark.sql.types import IntegerType, ArrayType, StringType, NullType, LongType, StructField, \
                              StructType, DateType, DataType, DateConverter, DatetimeConverter, \
                              TimestampType, BooleanType

### Start context

In [5]:
# Start from a default SparkConf and open the Spark context against YARN in client mode
conf = SparkConf()
sc = SparkContext(conf=conf, master='yarn-client')

### Load twitter data

In [6]:
# HiveContext bound to the running SparkContext; used below to read the parquet data
hc = HiveContext(sc)

Read in the Parquet files containing the Twitter data (prepared by Iza and Rok).

In [7]:
%%time
# Open every parquet dataset matching the 2014_12* pattern as a single DataFrame
data = hc.read.parquet('/user/roskarr/twitter/2014_12*')

CPU times: user 8 ms, sys: 3 ms, total: 11 ms
Wall time: 36 s


### Pyspark configuration

In [8]:
# Display the Spark settings of the running context as (key, value) pairs
# (read through the context's private _conf attribute)
sc._conf.getAll()

[(u'spark.yarn.executor.memoryOverhead', u'2048'),
 (u'spark.master', u'yarn-client'),
 (u'spark.sql.parquet.mergeSchema', u'true'),
 (u'spark.driver.memory', u'140G'),
 (u'spark.yarn.am.memory', u'8g'),
 (u'spark.executor.cores', u'4'),
 (u'spark.executor.instances', u'20'),
 (u'spark.app.name', u'twitter_dataframes'),
 (u'spark.shuffle.memoryFraction', u'0.2'),
 (u'spark.storage.memoryFraction', u'0.5'),
 (u'spark.rdd.compress', u'True'),
 (u'spark.serializer.objectStreamReset', u'100'),
 (u'spark.yarn.am.cores', u'4'),
 (u'spark.yarn.isPython', u'true'),
 (u'spark.submit.deployMode', u'client'),
 (u'spark.executor.memory', u'8g')]

### Read in our list containing the keywords

In [9]:
def get_keys(path='../docs/keywords.txt'):
    """Read the keyword list from a text file holding one keyword per line.

    Everything from a '#' to the end of a line is treated as a comment;
    surrounding whitespace and blank lines are ignored.

    Parameters
    ----------
    path : str
        Location of the keyword file.

    Returns
    -------
    list of str
        The keywords in file order. The result is always a list (also for a
        file with a single keyword or none at all) and keywords are never
        truncated.

    Raises
    ------
    IOError
        If the file cannot be opened.
    """
    keys = []
    with open(path) as keyfile:
        for line in keyfile:
            # cut off a trailing comment first, then the surrounding whitespace
            key = line.split('#', 1)[0].strip()
            if key:
                keys.append(key)
    return keys

In [10]:
# Load the keywords from the default keyword file and show them
keys = get_keys()
print(keys)

['isis', 'terrorism', 'arab', 'spring', 'attack', 'god', 'christian', 'allah', 'islam', 'syria', 'refugees', 'migrants', 'africa', 'italy', 'ethiopia', 'asylum', 'unhcr', 'immigration', 'foreigners', 'crowded', 'ebola', 'guinea', 'sierra', 'leone', 'liberia', 'virus', 'epidemic', 'vaccine', 'who', 'influenza', 'flu', 'birds', 'swine', 'pig', 'bitcoin', 'rosetta', 'comet', 'higgs', 'climate', 'doomsday', 'maya', 'curiosity', 'sandy', 'hurricane', 'black', 'white', 'mandela', 'nelson', 'left', 'right', 'mh17', 'mh370', 'ukraine', 'crimea', 'russia', 'snowden', 'nsa', 'obama', 'putin', 'pope', 'unemployment', 'boston', 'marathon', 'london', 'europe', 'usa', 'philippines', 'sochi', 'olympics', 'geneva', 'apple', 'linux', 'PC', 'google', 'iphone', 'galaxy', 'watch', 'facebook', 'twitter', 'whatsapp', 'vegan', 'gluten', 'vegetarian', 'meat', 'pasta', 'banana', 'family', 'divorce', 'marriage', 'wedding', 'holidays', 'homework', 'television', 'coffee', 'tea', 'school', 'work', 'teacher', 'spor

### Custom functions performing operations on the columns

In [11]:
# Format of the 'created_at' strings in the raw tweets
TWITTER_DATE_FORMAT = '%a %b %d %H:%M:%S +0000 %Y'

# Keyword lookup set, built once: membership tests inside the UDFs below become
# hash lookups instead of a scan of the whole keyword list for every word
keys_set = frozenset(keys)

# Change the date format to YYYY-MM-DD HH:MM:SS
convert_date_string = func.udf(
    lambda date_string: datetime.datetime.strptime(date_string, TWITTER_DATE_FORMAT)
                                         .strftime('%Y-%m-%d %H:%M:%S'),
    StringType())

# Convert the date string to a date object.
# `datetime` is the module here, so the class has to be addressed as datetime.datetime
# (datetime.strptime does not exist); .date() matches the declared DateType.
datetime_udf = func.udf(
    lambda date_string: datetime.datetime.strptime(date_string, TWITTER_DATE_FORMAT).date(),
    DateType())


def new_date(col):
    """Map a date column to a unique number for every day in any year.

    The result is ``year * 1000 + day_of_year``.
    """
    return func.dayofyear(col) + func.year(col)*1000


# Convert the tweet text string to lowercase and split
text_udf = func.udf(lambda row: row.lower().split(), returnType=ArrayType(StringType()))

# Boolean containing whether a keyword appears in the text
keys_filter_udf = func.udf(lambda row: any(val in keys_set for val in row), returnType=BooleanType())

# Returns list of keys contained in text
keys_list_udf = func.udf(lambda row: [val for val in row if val in keys_set], returnType=ArrayType(StringType()))

## Filter dataframe to contain only tweets with at least one appearance of one of the keywords

Number of partitions (set very high to avoid executor failures with exit code 52, i.e. out of memory)

In [12]:
# Use 200 times the context's default parallelism as the partition count,
# so that each individual partition stays small
Npartitions = sc.defaultParallelism*200
print(Npartitions)

16000


In [13]:
# Build the keyword DataFrame: one row per tweet that contains at least one keyword,
# holding the day number of the tweet and the list of keywords found in it
keywords_df = (data.select('created_at', 'text')
                   # keep only tweets with a non-empty text
                   .filter(func.length('text') > 0)
                   # raw date string -> 'YYYY-MM-DD HH:MM:SS'
                   .withColumn('created_at', convert_date_string('created_at'))
                   # -> day number: year*1000 + day of year
                   .withColumn('created_at', new_date('created_at'))
                   # text -> list of lowercase words
                   .withColumn('text', text_udf('text'))
                   .repartition(Npartitions)
                   # keep only tweets mentioning at least one keyword
                   .filter(keys_filter_udf('text'))
                   # a tenth of the partitions for the filtered data
                   .repartition(Npartitions//10)
                   # list of the keywords found in the tweet
                   .withColumn('keys', keys_list_udf('text'))
                   .select('created_at', 'keys')
               )
# Mark the result for caching; it is reused once per keyword further down
keywords_df.cache()

DataFrame[created_at: int, keys: array<string>]

## Filter dataset by keyword, group by day, count and collect

In [16]:
# Directory receiving one CSV file per keyword; the trailing slash is required
# because the file names are built by plain string concatenation (outdir+key+'.csv')
outdir = '/cluster/home/phys/bruclaud/Documents/spuriousrelations/data/'

In [17]:
def get_csvlist(df, kkey):
    """Count, for every day, the tweets in ``df`` that mention the keyword ``kkey``.

    Parameters
    ----------
    df : DataFrame
        DataFrame with a ``created_at`` column and a ``keys`` column holding
        the list of keywords found in each tweet.
    kkey : str
        Keyword to select.

    Returns
    -------
    list of str
        One ``"<created_at>,<count>"`` line per day, ordered by ``created_at``.
    """
    # array_contains is evaluated natively by Spark, so no new Python UDF has to
    # be created and shipped to the executors for every single keyword
    daily_counts = (df.filter(func.array_contains('keys', kkey))
                      .groupBy('created_at')
                      .count()
                      # sort the (small) aggregated result so the output order is deterministic
                      .orderBy('created_at')
                      .collect())
    # format the collected rows on the driver instead of in an extra Python RDD stage
    return [",".join(map(str, row)) for row in daily_counts]

In [18]:
# Write the daily counts of the first 50 keywords, one CSV file per keyword,
# and report how long each keyword took
for key in keys[:50]:
    start = time.time()
    csvvalues = get_csvlist(keywords_df, key)
    csvfile = outdir + key + '.csv'
    np.savetxt(csvfile, np.array(csvvalues), fmt='%s')
    elapsed = time.time() - start
    print(key, ': Time: {:.1f}s'.format(elapsed))

isis : Time: 188.3s
terrorism : Time: 4.6s
arab : Time: 5.0s
spring : Time: 6.7s
attack : Time: 6.0s
god : Time: 6.3s
christian : Time: 7.2s
allah : Time: 6.1s
islam : Time: 5.4s
syria : Time: 4.2s
refugees : Time: 5.0s
migrants : Time: 3.3s
africa : Time: 5.7s
italy : Time: 6.4s
ethiopia : Time: 3.3s
asylum : Time: 5.4s
unhcr : Time: 3.1s
immigration : Time: 5.3s
foreigners : Time: 3.3s
crowded : Time: 6.1s
ebola : Time: 5.9s
guinea : Time: 3.7s
sierra : Time: 4.8s
leone : Time: 3.8s
liberia : Time: 5.0s
virus : Time: 6.4s
epidemic : Time: 7.1s
vaccine : Time: 3.7s
who : Time: 6.5s
influenza : Time: 3.3s
flu : Time: 5.4s
birds : Time: 5.4s
swine : Time: 3.7s
pig : Time: 5.0s
bitcoin : Time: 4.8s
rosetta : Time: 3.4s
comet : Time: 5.6s
higgs : Time: 13.3s
climate : Time: 15.4s
doomsday : Time: 13.7s
maya : Time: 17.4s
curiosity : Time: 16.8s
sandy : Time: 17.4s
hurricane : Time: 19.2s
black : Time: 22.0s
white : Time: 26.6s
mandela : Time: 25.5s
nelson : Time: 31.4s
left : Time: 37.1s


In [19]:
# Write the daily counts of the remaining keywords (from index 50 on),
# one CSV file per keyword, and report how long each keyword took
for key in keys[50:]:
    start = time.time()
    csvvalues = get_csvlist(keywords_df, key)
    csvfile = outdir + key + '.csv'
    np.savetxt(csvfile, np.array(csvvalues), fmt='%s')
    elapsed = time.time() - start
    print(key, ': Time: {:.1f}s'.format(elapsed))

mh17 : Time: 51.8s
mh370 : Time: 68.9s
ukraine : Time: 108.3s
crimea : Time: 277.1s
russia : Time: 3905.0s


Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: java.lang.IllegalStateException: unread block data
	at java.io.ObjectInputStream$BlockDataInputStream.setBlockDataMode(ObjectInputStream.java:2394)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1379)
	at java.io.ObjectInputStream.readArray(ObjectInputStream.java:1685)
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1341)
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:369)
	at org.apache.spark.MapOutputTracker$$anonfun$deserializeMapStatuses$2.apply(MapOutputTracker.scala:509)
	at org.apache.spark.MapOutputTracker$$anonfun$deserializeMapStatuses$2.apply(MapOutputTracker.scala:509)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1250)
	at org.apache.spark.MapOutputTracker$.deserializeMapStatuses(MapOutputTracker.scala:510)
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$newOrUsedShuffleStage(DAGScheduler.scala:355)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$org$apache$spark$scheduler$DAGScheduler$$getShuffleMapStage$1.apply(DAGScheduler.scala:286)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$org$apache$spark$scheduler$DAGScheduler$$getShuffleMapStage$1.apply(DAGScheduler.scala:285)
	at scala.collection.Iterator$class.foreach(Iterator.scala:727)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
	at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
	at scala.collection.mutable.Stack.foreach(Stack.scala:169)
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$getShuffleMapStage(DAGScheduler.scala:285)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$visit$1$1.apply(DAGScheduler.scala:389)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$visit$1$1.apply(DAGScheduler.scala:386)
	at scala.collection.immutable.List.foreach(List.scala:318)
	at org.apache.spark.scheduler.DAGScheduler.visit$1(DAGScheduler.scala:386)
	at org.apache.spark.scheduler.DAGScheduler.getParentStages(DAGScheduler.scala:398)
	at org.apache.spark.scheduler.DAGScheduler.getParentStagesAndId(DAGScheduler.scala:299)
	at org.apache.spark.scheduler.DAGScheduler.newResultStage(DAGScheduler.scala:334)
	at org.apache.spark.scheduler.DAGScheduler.handleJobSubmitted(DAGScheduler.scala:837)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1607)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:927)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:926)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:405)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.GeneratedMethodAccessor97.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:601)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
	at py4j.Gateway.invoke(Gateway.java:259)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:209)
	at java.lang.Thread.run(Thread.java:722)


In [None]:
# Persist the rows of the filtered keyword DataFrame as a pickle file
keywords_df.rdd.saveAsPickleFile('/user/bruclaud/keywords_df')

In [None]:
# Show the disk usage below /user/bruclaud/ in human-readable units
!hadoop fs -du -h /user/bruclaud/

In [None]:
# Shut down the Spark context
sc.stop()