In [3]:
import time
from collections import defaultdict

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tweepy

from sklearn.feature_extraction.text import TfidfVectorizer
import re
from nltk.util import ngrams

import pyprind

In [2]:
!pip install PyPrind



In [5]:
sns.set()

## Setup tweepy api client

In [82]:
CONSUMER_KEY = ''
CONSUMER_SECRET = ''

ACCESS_TOKEN = ''
ACCESS_TOKEN_SECRET = ''

In [7]:
# Use tweepy to invoke Twitter API

# Set app and user access tokens
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

# Create API client
api = tweepy.API(auth,wait_on_rate_limit=True)

In [77]:
# Define helper function for parsing Twitter API tweet responses

def parse_tweet(tweet):
    """
    Selects relevant fields from tweet status object
    https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object
    """
    return {
            'user_id': tweet.user.id,
            'text': tweet.full_text,
            'created_at': tweet.created_at,
            'id_str': tweet.id_str,
            'in_reply_to_status_id_str': tweet.in_reply_to_status_id_str,
            'is_retweet': hasattr(tweet, 'retweeted_status'),
            'is_quote': hasattr(tweet, 'quoted_status'),
            'media': tweet.entities.media if hasattr(tweet.entities, 'media') else [],
        }

In [73]:
# sarcasm_query = '%23sarcasm%20lang%3Aen%20-filter%3Alinks'
sarcasm_query = '(%23sarcasm%2C%20OR%20%23sarcastic)' # %20lang%3Aen%20-filter%3Alinks'
not_sarcasm_hashtag_query = '(%23notsarcasm%2C%20OR%20%23notsarcastic)'
not_sarcasm_text_query = '("not%20sarcasm"%20OR%20"not%20sarcastic")'
# not_sarcasm_query = '(%23notsarcasm%2C%20OR%20%23notsarcastic)%20lang%3Aen%20-filter%3Alinks'
timeout = 999999

## Request tweets labeled "#sarcasm"
Data restricted to 7 days old and represents a *sample* of true tweets

In [80]:
pbar = pyprind.ProgBar(5000)
tweets = []

for tweet in tweepy.Cursor(
    api.search,
    q=sarcasm_query,
    result_type='recent',
    include_entities=True,
    tweet_mode='extended',
    lang='en',
    timeout=timeout
).items(5000):
    pbar.update()
    t = parse_tweet(tweet)
    tweets.append(t)

print(f"Fetched {len(tweets)} tweets.")

0% [############                  ] 100% | ETA: 00:01:28

TweepError: Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=60)

In [81]:
df_tweets = pd.DataFrame(tweets)
# Index replies by ids
df_tweets.set_index('id_str', drop=True, inplace=True)
df_tweets

Unnamed: 0_level_0,user_id,text,created_at,in_reply_to_status_id_str,is_retweet,is_quote,media
id_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1227846063844941825,228127192,@realDonaldTrump No one is as pure of heart as...,2020-02-13 06:44:48,1227561237782855680,False,False,[]
1227845689167634433,617538743,Here's a logical thought process: After s***ti...,2020-02-13 06:43:19,,False,False,[]
1227843998133473280,52338207,@JenJenWren777 @Paul_Goode78 @JuliaHB1 @darren...,2020-02-13 06:36:36,1227699349498552321,False,False,[]
1227843786522464256,14514563,@jttanenbaum Twitter's translate button doesn'...,2020-02-13 06:35:45,1227823668891439105,False,False,[]
1227843649557467139,51457642,25 Mostly Pointless but Snappy Comebacks When ...,2020-02-13 06:35:12,,False,False,[]
...,...,...,...,...,...,...,...
1225771358254256129,4874225072,RT @John95183905: YOU Will Fall From The Chair...,2020-02-07 13:20:40,,True,False,[]
1225770941692751878,10712162,When U write a test with tons of mocks and wir...,2020-02-07 13:19:00,,False,False,[]
1225770477295001600,1048016189123776512,@CarpeDonktum @DonaldJTrumpJr @BlognificentB H...,2020-02-07 13:17:10,1225648878617124864,False,False,[]
1225770132473044999,1222612937430511617,YOU Will Fall From The Chair When You See This...,2020-02-07 13:15:47,,False,False,[]


In [18]:
# save dataset locally
df_tweets.to_json('data/sarcasm-2-11-2.json')

In [19]:
df_tweets.in_reply_to_status_id_str.value_counts()

1225623348748546049    3
1225561188844670978    3
1225094231603253249    2
1226958297779048448    2
1227333809680588806    2
                      ..
1227060244980744194    1
1225679832152858626    1
1225390783622107137    1
1227285098405351424    1
1225510146870214656    1
Name: in_reply_to_status_id_str, Length: 960, dtype: int64

In [17]:
# todo: include context for replies

# df_replies_with_tweets = df_replies[['text', 'in_reply_to_status_id_str']].groupby('in_reply_to_status_id_str').agg(list)
# df_replies_tweets = pd.concat([df_replies_with_tweets, df_tweets[['text']]], axis=1, join='inner')
# df_replies_tweets.columns = ['replies', 'tweet']
# df_replies_tweets

In [None]:
# save dataset locally
df_replies_tweets.to_json('data/replies_tweets.json')

## Request tweets labeled "#notsarcasm"
Data restricted to 7 days old and represents a *sample* of true tweets

In [74]:
pbar1 = pyprind.ProgBar(5)
not_sarcastic_tweets = []

for tweet in tweepy.Cursor(
    api.search,
    q=not_sarcasm_hashtag_query,
    result_type='recent',
    include_entities=True,
    tweet_mode='extended',
    lang='en',
    timeout=timeout
).items(20):
    pbar1.update()
#     print(tweet)
    t = parse_tweet(tweet)
    not_sarcastic_tweets.append(t)

# for tweet in tweepy.Cursor(api.search,q=not_sarcasm_text_query,result_type='recent',include_entities=True,timeout=timeout).items(5000):
#     pbar1.update()
#     t = parse_tweet(tweet)
#     not_sarcastic_tweets.append(t)


print(f"Fetched {len(not_sarcastic_tweets)} tweets.")

0% [#####] 100% | ETA: 00:00:00
Total time elapsed: 00:00:00


Fetched 20 tweets.


In [72]:
df_not_sarcastic_tweets = pd.DataFrame(not_sarcastic_tweets)
df_not_sarcastic_tweets.set_index('id_str', drop=True, inplace=True)
df_not_sarcastic_tweets

Unnamed: 0_level_0,user_id,text,created_at,in_reply_to_status_id_str,is_retweet,is_quote,media,lang
id_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1227749497561853953,898183456546578433,@JayzTwoCents @Reylie14 @Phanteks @ASUS_ROG @T...,2020-02-13 00:21:05,1.227474066979926e+18,False,False,[],en
1227659302401462273,270956500,@JHayes007 @PepperellEddie Check out the laugh...,2020-02-12 18:22:41,1.227581200891568e+18,False,False,[],en
1227656422676160512,1588253204,The picture illustrates Latvian Air Force - th...,2020-02-12 18:11:14,1.2276544352932332e+18,False,False,[],en
1227353626684919808,352705909,I asked my grandma to be my valentine. She sai...,2020-02-11 22:08:02,,False,False,[],en
1227255897321345026,856629957774241792,"@fvalemus @AndrewYang Good luck today, #YangGa...",2020-02-11 15:39:41,1.2248570628252915e+18,False,False,[],en
1226515059767218177,1001827873727090689,@OverUnderClover @rob_sheridan @voxdotcom Nebu...,2020-02-09 14:35:52,1.226512565628539e+18,False,False,[],en
1226224636125425664,218701194,RT @realJOELP: @PettyWhiteJr @jerweber @TheRic...,2020-02-08 19:21:50,,True,False,[],en
1226211723272577025,38493367,@PettyWhiteJr @jerweber @TheRickWilson Always ...,2020-02-08 18:30:31,1.2262088264975647e+18,False,False,[],en
1225961445793378306,39090194,RT @NewWorldBank2: Dear #XRPCommunity your Que...,2020-02-08 01:56:00,,True,False,[],en
1225950857323257856,991383686250627072,Dear #XRPCommunity your Queen @NordicAnn has m...,2020-02-08 01:13:56,,False,False,[],en


In [37]:
# save dataset locally
df_not_sarcastic_tweets.to_json('data/not-sarcasm-2-12.json')

In [58]:
df_not_sarcastic_tweets.text.values

array(['@JayzTwoCents @Reylie14 @Phanteks @ASUS_ROG @Thermaltake @Razer This kind of authenticity makes me like you more. #notsarcasm',
       '@JHayes007 @PepperellEddie Check out the laugh as he said it... that is what we like to call S...A...R...C....A....S....M. Bet his caddie was wetting himself laugh on the far side of the camera.... again ... sarcasm. See... it’s nice n easy... #notsarcasm',
       'The picture illustrates Latvian Air Force - the most aviatic Baltic country #notsarcasm',
       'I asked my grandma to be my valentine. She said yes.. 2020 is starting off way better than 2019 #NotSarcastic',
       'Laki gue kok cerdas banget sih\n\n#notsarcasm',
       '@fvalemus @AndrewYang Good luck today, #YangGang!   Sincerely, #BernGerng         #NotSarcasm #Solidarity',
       '@OverUnderClover @rob_sheridan @voxdotcom Nebulous...wow I’ve never seen such an intelligent word used on Twitter. Gotta get that into my vocabulary 🤔🤓🤓🤔 #notsarcasm',
       'RT @realJOELP: @PettyWhi

In [40]:
df_not_sarcastic_tweets.text[0:1].values

array(['@JHayes007 @PepperellEddie Check out the laugh as he said it... that is what we like to call S...A...R...C....A....… https://t.co/A4CmfZpDMx'],
      dtype=object)

In [43]:
df_not_sarcastic_tweets[0:10]

Unnamed: 0_level_0,user_id,user_screenname,text,truncated,created_at,in_reply_to_status_id_str,retweet_count,favorite_count,media,lang
id_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1227659302401462273,270956500,GungaNagunga,@JHayes007 @PepperellEddie Check out the laugh...,True,2020-02-12 18:22:41,1.227581200891568e+18,0,1,[],en
1227656422676160512,1588253204,FactorOk,The picture illustrates Latvian Air Force - th...,False,2020-02-12 18:11:14,1.2276544352932332e+18,0,2,[],en
1227353626684919808,352705909,FaithhLawrence,I asked my grandma to be my valentine. She sai...,False,2020-02-11 22:08:02,,0,2,[],en
1227295610430054400,782964356023627776,AraKaito,Laki gue kok cerdas banget sih\n\n#notsarcasm,False,2020-02-11 18:17:30,,0,0,[],in
1227255897321345026,856629957774241792,lady__doobie,"@fvalemus @AndrewYang Good luck today, #YangGa...",False,2020-02-11 15:39:41,1.2248570628252915e+18,0,0,[],en
1226515059767218177,1001827873727090689,CarrieGOT7monsX,@OverUnderClover @rob_sheridan @voxdotcom Nebu...,True,2020-02-09 14:35:52,1.226512565628539e+18,0,3,[],en
1226224636125425664,218701194,northkats,@PettyWhiteJr @jerweber @TheRickWilson Always ...,False,2020-02-08 19:21:50,,1,0,[],en
1226211723272577025,38493367,realJOELP,@PettyWhiteJr @jerweber @TheRickWilson Always ...,False,2020-02-08 18:30:31,1.2262088264975647e+18,1,8,[],en
1225961445793378306,39090194,XRPNews_,Dear #XRPCommunity your Queen @NordicAnn has m...,False,2020-02-08 01:56:00,,1,0,[],en
1225950857323257856,991383686250627072,NewWorldBank2,Dear #XRPCommunity your Queen @NordicAnn has m...,True,2020-02-08 01:13:56,,1,14,[],en
