# get disaster time series data from twitter

## setup connection

#### imports

In [266]:
import sys, json
import twitter, re, datetime, pandas as pd
from datetime import datetime as dt
import pickle

#### some paths

In [272]:
# Directory prefixes (relative to the notebook's working directory); both end
# with '/' so file names can be appended directly.
# NOTE(review): rawdumps_path is not referenced anywhere in the visible part of
# this notebook — confirm whether it is still needed.
rawdumps_path = '../data/raw/jawiki/dumps_unzipped/'
# Output directory; the final cell writes 'jma_bousai.pickle' under it.
processed_path = '../data/processed/jawiki/'

#### define class

In [144]:
class TweetMiner(object):
    """Page backwards through one user's timeline using a python-twitter ``Api``.

    The miner repeatedly calls ``api.GetUserTimeline`` and flattens every
    returned status into a plain dict, so the result can be fed straight into
    ``pandas.DataFrame``.
    """

    # Class-level fallbacks; ``__init__`` overrides ``result_limit`` and ``api``
    # on every instance.
    result_limit    =   2
    api             =   False
    # NOTE(review): mutable and shared by all instances; nothing in this class
    # reads or writes it. Kept only so existing attribute access keeps working.
    data            =   []

    def __init__(self, keys_dict, api, result_limit = 2):
        """Store the API client and paging configuration.

        Args:
            keys_dict: Credential mapping; only stored as ``self.twitter_keys``.
            api: Object exposing ``GetUserTimeline(...)`` (a ``twitter.Api``).
            result_limit: Number of tweets requested per page (``count``).
        """
        self.api = api
        self.twitter_keys = keys_dict
        self.result_limit = result_limit

    def mine_user_tweets(self, user="jma_bousai", mine_retweets=True, max_pages=5, last_tweet_id=False):
        """Fetch up to ``max_pages`` pages of ``user``'s timeline.

        Args:
            user: Screen name whose timeline is read.
            mine_retweets: Forwarded to the API as ``include_rts``.
            max_pages: Upper bound on the number of API requests.
            last_tweet_id: If truthy, only tweets with an id below this one are
                requested; pass the last ``tweet_id`` of a previous batch to
                resume where that batch stopped.

        Returns:
            A list of dicts with the keys ``tweet_id``, ``handle``,
            ``retweet_count``, ``full_text``, ``mined_at`` and ``created_at``,
            in the order the API returned them.
        """
        data = []
        page = 1

        while page <= max_pages:
            request = {
                'screen_name': user,
                'count': self.result_limit,
                'include_rts': mine_retweets,
            }
            if last_tweet_id:
                # Step one below the last tweet already collected so it is not
                # fetched a second time.
                request['max_id'] = last_tweet_id - 1

            statuses = self.api.GetUserTimeline(**request)
            if not statuses:
                # Timeline exhausted: further requests would repeat the same
                # empty query and only burn rate limit.
                break

            for item in statuses:
                data.append({
                    'tweet_id':        item.id,
                    'handle':          item.user.name,
                    'retweet_count':   item.retweet_count,
                    'full_text':       item.text,
                    'mined_at':        datetime.datetime.now(),
                    'created_at':      item.created_at,
                })

            # The next page continues below the last status of this page.
            last_tweet_id = statuses[-1].id

            page += 1
            # Progress indicator: the number of the page that would come next.
            print(page)
        return data

#### get twitter_keys

In [17]:
import importlib.util

# Execute the credentials file as an ad-hoc module named "twitter_keys" and
# pull the `twitter_keys` object out of it.
spec = importlib.util.spec_from_file_location(
    name="twitter_keys",
    location="../../.twitter_keys.py",
)
tw_keys = importlib.util.module_from_spec(spec)
spec.loader.exec_module(tw_keys)
twitter_keys = getattr(tw_keys, "twitter_keys")

#### instantiate

In [145]:
# Build the python-twitter client from the four credential entries stored in
# `twitter_keys`; each entry is passed under the keyword of the same name.
api = twitter.Api(**{
    credential: twitter_keys[credential]
    for credential in (
        'consumer_key',
        'consumer_secret',
        'access_token_key',
        'access_token_secret',
    )
})

# Miner that requests 20 tweets per page.
miner = TweetMiner(keys_dict=twitter_keys, api=api, result_limit=20)

#### unused

In [None]:
mcc = miner.mine_user_tweets(user='McConnellPress', max_pages=100)
aoc = miner.mine_user_tweets(user='aoc', max_pages=100)

pon = miner.mine_user_tweets(user='Pontifex', max_pages=100)


# ### Convert the tweet outputs to a pandas DataFrame
mcc = pd.DataFrame(mcc)
aoc = pd.DataFrame(aoc)
pon = pd.DataFrame(pon)

# Label every row with the account it was mined from (the classification target).
mcc['user'] = 'mcc'
aoc['user'] = 'aoc'
pon['user'] = 'pon'


# ##  Create the training data
# 1. Mine each account's tweets
# - Create a tweet DataFrame per account
# - Label each DataFrame with its account
# - Concatenate the results into one DataFrame
#
# The miner stores the tweet body under 'full_text' (there is no 'text'
# column), so select that column and expose it as 'text' for the cells below.
df = pd.concat(
    [frame[['user', 'full_text']].rename(columns={'full_text': 'text'})
     for frame in (aoc, mcc, pon)]
).reset_index(drop=True)

X = df.text
y = df.user

from sklearn.model_selection import train_test_split
# NOTE(review): no random_state is passed, so the split differs between runs.
Xtr, Xte, ytr, yte = train_test_split(X,y, test_size=.2)

# ## Any interesting ngrams going on in the mined tweets?
# ---
# 
# Set up a vectorizer from sklearn and fit the text of the training tweets with an ngram range from 2 to 4 (the CountVectorizer cells below only look at bigrams). Figure out what the most common ngrams are.
# 
# > **Note:** It's up to you whether you want to remove stopwords or not. How does keeping or removing stopwords affect the results?

# In[95]:


from sklearn.feature_extraction.text import CountVectorizer


# ### Look at the bigrams in the training tweets (McConnellPress, aoc, Pontifex)

# In[96]:


# Bigram counter using scikit-learn's built-in English stop-word list.
ctvec0 = CountVectorizer(stop_words='english', ngram_range=(2, 2))


# In[97]:


# Learn the bigram vocabulary from the training tweets and build the
# document-term count matrix in one pass.
Xtr_vec0 = ctvec0.fit_transform(Xtr)
Xtr_words0 = ctvec0.get_feature_names_out()


# In[99]:


# One column per bigram, one row per training tweet (kept sparse).
Xtr_df0 = pd.DataFrame.sparse.from_spmatrix(Xtr_vec0, columns=Xtr_words0)


# In[100]:


# Ten bigrams with the highest total count across the training tweets.
(
    Xtr_df0
    .sum()
    .sort_values(ascending=False)
    .head(10)
)


# ## Processing the tweets and building a model
# 
# ---
# 
# To do classification we will need to convert the tweets into a set of features.
# 
# **You will need to:**
# - Vectorize input text data.
# - Initialize a model (try Logistic regression).
# - Train / Predict / cross-validate.
# - Evaluate the performance of the model.
# 
# > **Bonus:** you may have noticed that there are website links in the tweets. What additional preprocessing steps can you do before building the model?
# 

# In[11]:


# BONUS
# Using the textacy package to do some more comprehensive preprocessing
# http://textacy.readthedocs.io/en/latest/

# !conda install --yes --prefix {sys.prefix} textacy 


# In[12]:


# !conda install -c conda-forge textacy -y

# These statements had been fused onto a single line (a syntax error); they are
# separated again here, otherwise unchanged.
import textacy
textacy.__version__
from textacy.preprocessing.normalize import normalize_quotation_marks, normalize_whitespace, normalize_hyphenated_words
from textacy.preprocessing.remove import remove_accents, remove_punctuation
from textacy.preprocessing.replace import replace_user_handles,replace_currency_symbols,replace_emails,replace_urls,replace_phone_numbers,replace_emojis,replace_numbers
# In[101]:


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from collections import Counter

# We can use the TfidfVectorizer to find ngrams for us
#vect = TfidfVectorizer(ngram_range=(2,4),stop_words=text.ENGLISH_STOP_WORDS.union(['https','co']))
# scikit-learn documents stop_words as 'english', a list, or None, so the
# frozenset of stop words is handed over as a list.
vect = TfidfVectorizer(ngram_range=(2,4),stop_words=list(text.ENGLISH_STOP_WORDS))

# Analyze every tweet on its own. Joining all tweets with "" first would glue
# the last word of one tweet to the first word of the next and produce n-grams
# that span two unrelated tweets.
analyzer = vect.build_analyzer()
ngrams_summaries = [ngram for tweet in df['text'] for ngram in analyzer(tweet)]

# 20 most frequent 2- to 4-grams over all mined tweets
Counter(ngrams_summaries).most_common(20)


## get data from @jma_bousai

### raw results

#### result 2

In [153]:
# Show the size of batch `x` and (created_at, tweet_id) for its last four
# entries, last entry first.
display(
    len(jma_bousai[x]),
    [(tweet['created_at'], tweet['tweet_id'])
     for tweet in reversed(jma_bousai[x][-4:])],
)

217

[('Fri Oct 04 05:06:19 +0000 2019', 1179986079811719168),
 ('Sat Oct 05 00:28:18 +0000 2019', 1180278502618042368),
 ('Sat Oct 05 00:48:05 +0000 2019', 1180283482494259200),
 ('Sun Oct 06 00:26:56 +0000 2019', 1180640546353643521)]

##### actual result

In [152]:
# Batch 2: resume below tweet id 1278950916482195456, the last entry listed in
# the output recorded for batch 1.
x = 2
jma_bousai[x] = miner.mine_user_tweets(
    user='jma_bousai',
    max_pages=20,
    last_tweet_id=1278950916482195456,
)

2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21


#### result 1

In [151]:
# Show the size of batch `x` and (created_at, tweet_id) for its last four
# entries, last entry first.
display(
    len(jma_bousai[x]),
    [(tweet['created_at'], tweet['tweet_id'])
     for tweet in reversed(jma_bousai[x][-4:])],
)

400

[('Fri Jul 03 07:17:14 +0000 2020', 1278950916482195456),
 ('Fri Jul 03 07:17:43 +0000 2020', 1278951035315171329),
 ('Fri Jul 03 07:18:15 +0000 2020', 1278951169583312906),
 ('Fri Jul 03 07:18:44 +0000 2020', 1278951291410984960)]

##### actual result

In [150]:
# Batch 1: resume below tweet id 1426241703619403779, the last entry listed in
# the output recorded for batch 0.
x = 1
jma_bousai[x] = miner.mine_user_tweets(
    user='jma_bousai',
    max_pages=20,
    last_tweet_id=1426241703619403779,
)

2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21


#### result 0

In [149]:
# Show the size of batch `x` and (created_at, tweet_id) for its last four
# entries, last entry first.
display(
    len(jma_bousai[x]),
    [(tweet['created_at'], tweet['tweet_id'])
     for tweet in reversed(jma_bousai[x][-4:])],
)

400

[('Fri Aug 13 17:58:15 +0000 2021', 1426241703619403779),
 ('Fri Aug 13 18:43:39 +0000 2021', 1426253130807742464),
 ('Fri Aug 13 20:52:06 +0000 2021', 1426285454203121664),
 ('Fri Aug 13 21:13:30 +0000 2021', 1426290840901586946)]

##### actual result

In [148]:
# Batch 0: start from the top of the timeline (no last_tweet_id to resume from).
# NOTE(review): `jma_bousai` itself is not created anywhere in the visible part
# of this notebook — it must already exist before this cell runs.
x = 0
jma_bousai[x] = miner.mine_user_tweets(
    user='jma_bousai',
    max_pages=20,
    last_tweet_id=False,
)

2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21


### clean tweets to df

#### make dataframe

In [267]:
# Flatten all mined batches, in the iteration order of `jma_bousai`, into a
# single list of tweet records and turn it into one DataFrame.
bousai = pd.DataFrame.from_records(
    [tweet for batch_key in jma_bousai for tweet in jma_bousai[batch_key]]
)

#### extract headline categories from full_text as per-keyword count columns (occurrence counts, not 0/1 dummies)

In [268]:
def _keyword_count(pattern):
    """Return a callable for ``DataFrame.assign`` counting regex matches of *pattern* in ``full_text``."""
    return lambda frame: frame.full_text.str.count(pattern)


# One count column per keyword; the column prefix marks the hazard category.
# NOTE(review): Python NFKC-normalizes identifiers, so full-width digits in the
# keyword names below presumably end up as ASCII digits in the resulting column
# names (the regex patterns keep their full-width digits) — confirm before
# selecting these columns by name.
bousai = bousai.assign(
    # 火: volcano
    火_火山=_keyword_count(r'火山'),
    火_噴火警戒レベル１=_keyword_count(r'噴火警戒レベル１'),
    火_火口周辺規制=_keyword_count(r'火口周辺規制'),
    火_入山規制=_keyword_count(r'入山規制'),
    # 雨: rain
    雨_警戒レベル５=_keyword_count(r'警戒レベル５'),
    雨_大雨=_keyword_count(r'大雨'),
    雨_土砂災害=_keyword_count(r'土砂災害'),
    # 震: earthquake
    震_地震=_keyword_count(r'地震'),
    震_震度5=_keyword_count(r'震度5'),
    震_震度678=_keyword_count(r'震度6|震度7|震度8'),
    # 雪: snow
    雪_大雪=_keyword_count(r'大雪'),
    雪_路面凍結=_keyword_count(r'路面凍結'),
    # 津: tsunami
    津_津波=_keyword_count(r'津波'),
)


#### string to timestamp

In [269]:
def new_datetime(old_dt):
    """Reformat a Twitter ``created_at`` string as ``YYYY-MM-DD HH:MM:SS``.

    The input must look like ``'Fri Oct 04 05:06:19 +0000 2019'``; the offset
    is matched literally as ``+0000``, so any other offset raises ValueError.
    """
    twitter_layout = '%a %b %d %H:%M:%S +0000 %Y'
    parsed = dt.strptime(old_dt, twitter_layout)
    return parsed.strftime('%Y-%m-%d %H:%M:%S')
# https://stackoverflow.com/questions/7703865/going-from-twitter-date-to-python-datetime-date

In [270]:
# Parse the `created_at` strings into timestamps, store them as `dt_created`
# and make that column the index of the frame.
bousai = (
    bousai
    .assign(dt_created=lambda frame: pd.to_datetime(frame.created_at))
    .set_index('dt_created')
)

In [273]:
# Persist the cleaned frame as a pickle under `processed_path`.
pickle_target = f"{processed_path}jma_bousai.pickle"
with open(pickle_target, mode='wb') as sink:
    pickle.dump(bousai, sink)

## END