In [1]:
import os
import sys
import socket
import re
import numpy as np
import string
from timeit import default_timer as timer
from datetime import datetime
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,dayofweek,to_timestamp,size,isnan,lower,rand, lit
import pyspark.sql.functions as F
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType

In [2]:
print('Hostname:', socket.gethostname())
try:
    spark
except NameError:
    if 'samuel' in socket.gethostname().lower():
        print('Create Local SparkSession')
        spark = SparkSession.builder.config(
        "spark.driver.host", "localhost").appName(
        "get-tweets-for-labeling").getOrCreate()
    else:
        print('Create Cluster SparkSession')
        spark = SparkSession.builder.appName(
        "get-tweets-for-labeling").getOrCreate()
spark

Hostname: Samuels-MacBook-Pro.local


In [3]:
country_code = "US"
print('Country:', country_code)

# Local
if  'samuel' in socket.gethostname().lower():
    path_to_data = os.path.join('../../data/classification',country_code)
# Cluster
else:
    path_to_data = os.path.join('/user/spf248/twitter/data/classification',country_code)
print('Path to data:',path_to_data)

Country: US
Path to data: ../../data/classification/US


In [4]:
print('Import tweets containing keywords')
filtered = spark.read.parquet(os.path.join(path_to_data,'filtered'))
filtered.cache()

Import tweets containing keywords


DataFrame[tweet_id: string, text: string, fired: boolean, hired: boolean, job: boolean, laid_off: boolean, position: boolean, quit: boolean, unemployed: boolean, work: boolean, keyword: boolean]

In [10]:
filtered.show()

+------------------+--------------------+-----+-----+-----+--------+--------+-----+----------+-----+-------+
|          tweet_id|                text|fired|hired|  job|laid_off|position| quit|unemployed| work|keyword|
+------------------+--------------------+-----+-----+-----+--------+--------+-----+----------+-----+-------+
|616716023677358080|RT @HBMONTE: Sorr...|false|false|false|   false|   false|false|     false| true|   true|
|616716984114241536|Top frustrations ...|false|false| true|   false|   false|false|     false|false|   true|
|616718917679972352|Having two jobs n...|false|false| true|   false|   false|false|     false|false|   true|
|616719991191121920|Can you recommend...|false|false| true|   false|   false|false|     false|false|   true|
|616721396559802369|★ JOB ★ #hiring #...|false|false| true|   false|   false|false|     false|false|   true|
|616721396597592064|RT @h0lymal0ley: ...|false|false| true|   false|   false|false|     false|false|   true|
|616721832536707073

In [5]:
print('Import random tweets')
random = spark.read.parquet(os.path.join(path_to_data,'random'))
random.cache()

Import random tweets


DataFrame[tweet_id: string, text: string, fired: boolean, hired: boolean, job: boolean, laid_off: boolean, position: boolean, quit: boolean, unemployed: boolean, work: boolean, keyword: boolean]

In [9]:
random.show()

+------------------+--------------------+-----+-----+-----+--------+--------+-----+----------+-----+-------+
|          tweet_id|                text|fired|hired|  job|laid_off|position| quit|unemployed| work|keyword|
+------------------+--------------------+-----+-----+-----+--------+--------+-----+----------+-----+-------+
|554436170910691328|RT @Wize_Quotes: ...|false|false|false|   false|   false|false|     false|false|  false|
|554438989349142528|@mobleydick @Emma...|false|false|false|   false|   false|false|     false|false|  false|
|554439949903491073|@Muslims_USA @net...|false|false|false|   false|   false|false|     false|false|  false|
|554440289465532416|@brintown time to...|false|false|false|   false|   false| true|     false|false|   true|
|554441346732544000|🌹🌹🌹🌹🌹🌹🌹🌹?...|false|false|false|   false|   false|false|     false|false|  false|
|554443070431703041|Put my mouth on t...|false|false|false|   false|   false|false|     false|false|  false|
|554444458717351936|RT @wek

In [6]:
print('Import scores')

schema = StructType([StructField('tweet_id', StringType(), False),
                     StructField('score', FloatType(), False),
                     StructField('target', StringType(), False)])

scores = spark.read.option('header','true').schema(schema).csv(os.path.join(path_to_data,'similarity'))
# scores = spark.read.option('header','true').schema(schema).csv(os.path.join(path_to_data,'similarity','target-*-partition-0.csv'))
scores.cache()

Import scores


DataFrame[tweet_id: string, score: float, target: string]

In [7]:
print('Drop Duplicated Scores (Random Sample Could Contain Keywords)')
scores = scores.drop_duplicates(subset=['tweet_id','target'])

Drop Duplicated Scores (Random Sample Could Contain Keywords)


In [8]:
scores.show()

+-------------------+------------+-------------------+
|           tweet_id|       score|             target|
+-------------------+------------+-------------------+
|1052224759289638913|  0.99993515|I lost my job today|
|1010299704473616386|    0.997035|I lost my job today|
|1076337943013404674|  0.99941576|I lost my job today|
|1031567199725658112|   0.9941824|I lost my job today|
|1056879962764959744|   0.9997024|I lost my job today|
|1057680248546840576|  0.99911946|I lost my job today|
|1060564260386938885|  0.99918884|I lost my job today|
|1030485111236636672|  0.99636996|I lost my job today|
|1021533103339143168|  0.99590516|I lost my job today|
|1028399598405144577|   0.9998903|I lost my job today|
|1033797306959781891|   0.9929697|I lost my job today|
|1052557329839923201|   0.9927331|I lost my job today|
|1024405878621921281|  0.99839586|I lost my job today|
|1031664490809622529|  0.97725874|I lost my job today|
|1034964594333306880|   0.9918081|I lost my job today|
|105766366

In [11]:
keywords=sorted([keyword for keyword in filtered.columns if keyword not in ['tweet_id','text','keyword']])
print('Keywords:\n')
print('\n'.join(keywords))

Keywords:

fired
hired
job
laid_off
position
quit
unemployed
work


In [9]:
targets=sorted(scores.select("target").distinct().rdd.map(lambda r: r[0]).collect())
print('Targets:\n')
print('\n'.join(targets))

Targets:

I lost my job today


In [10]:
print('Create Sample for Labeling')

schema = StructType([StructField('tweet_id', StringType(), False),
                     StructField('text', StringType(), False),
                     StructField('keyword', StringType(), False),
                     StructField('target', StringType(), False)])

tweets_for_labeling = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)
# tweets_for_labeling.cache()

Create Sample for Labeling


In [11]:
n_sample = 100
print('# sampled tweets per group:', n_sample)

# sampled tweets per group: 100


In [12]:
for keyword in keywords:
    
    print(keyword)
    
    # Select Tweets Containing Specific Keyword
    tmp = filtered.where(filtered[keyword]==True)
#     print('# Tweets:',tmp.count())
    
    # Remove Those Which Have Already Been Sampled
    tmp = tmp.join(tweets_for_labeling.select('tweet_id'),on='tweet_id',how='left_anti')  
#     print('# Tweets:',tmp.count())
    
    # Take Random Sample of Size n_sample
    tmp = tmp.orderBy(rand(seed=0)).limit(n_sample).select('tweet_id','text')
#     print('# Tweets:',tmp.count())
    
    # Keep Track of Sampling Properties
    tmp = tmp.withColumn("keyword",lit(keyword.replace('_',' ')))
    tmp = tmp.withColumn("target",lit('random'))
    
    tweets_for_labeling = tweets_for_labeling.union(tmp)

fired
hired
job
laid_off
position
quit
unemployed
work


In [13]:
for keyword in keywords:

    print(keyword)
    
    for target in targets:
        
        print(target)
        
        # Select Tweets Containing Specific Keyword
        tmp = filtered.where(filtered[keyword]==True)
#         print('# Tweets:',tmp.count())
    
        # Merge With Similarity Scores
        tmp = tmp.join(scores.filter(scores['target']==target),on='tweet_id')
#         print('# Tweets:',tmp.count())
        
        # Remove Tweets Which Have Already Been Sampled
        tmp = tmp.join(tweets_for_labeling.select('tweet_id'),on='tweet_id',how='left_anti')  
#         print('# Tweets:',tmp.count())
        
        # Take n_sample Tweets Most Similar with Target Sentence
        tmp = tmp.sort(col('score').desc()).limit(n_sample).select('tweet_id','text')
#         print('# Tweets:',tmp.count())
        
        # Keep Track of Sampling Properties
        tmp = tmp.withColumn("keyword",lit(keyword.replace('_',' ')))
        tmp = tmp.withColumn("target",lit(target))
    
        tweets_for_labeling = tweets_for_labeling.union(tmp)
        
    print()

fired
I lost my job today

hired
I lost my job today

job
I lost my job today

laid_off
I lost my job today

position
I lost my job today

quit
I lost my job today

unemployed
I lost my job today

work
I lost my job today



In [14]:
for target in targets:
    
    print(target)
    
    # Merge Random Tweets With Similarity Scores With Target
    tmp=random.join(scores.filter(scores['target']==target),on='tweet_id')
#     print('# Tweets:',tmp.count())
    
    # Remove Tweets Which Have Already Been Sampled
    tmp = tmp.join(tweets_for_labeling.select('tweet_id'),on='tweet_id',how='left_anti')  
#     print('# Tweets:',tmp.count())
    
    # Take n_sample Tweets Most Similar with Target Sentence
    tmp = tmp.sort(col('score').desc()).limit(n_sample).select('tweet_id','text')
#     print('# Tweets:',tmp.count())
    
    # Keep Track of Sampling Properties
    tmp = tmp.withColumn("keyword",lit('random'))
    tmp = tmp.withColumn("target",lit(target))

    tweets_for_labeling = tweets_for_labeling.union(tmp)

I lost my job today


In [15]:
print('Save')
tweets_for_labeling.write.mode("overwrite").parquet(os.path.join(path_to_data,'labeling'))

Save


Py4JJavaError: An error occurred while calling o587.parquet.
: java.lang.OutOfMemoryError: Java heap space
	at java.util.Arrays.copyOf(Arrays.java:3332)
	at java.lang.AbstractStringBuilder.ensureCapacityInternal(AbstractStringBuilder.java:124)
	at java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:448)
	at java.lang.StringBuilder.append(StringBuilder.java:136)
	at scala.collection.mutable.StringBuilder.append(StringBuilder.scala:210)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:553)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$generateTreeString$3.apply(TreeNode.scala:566)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$generateTreeString$3.apply(TreeNode.scala:566)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:566)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$generateTreeString$3.apply(TreeNode.scala:566)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$generateTreeString$3.apply(TreeNode.scala:566)
