In [1]:
import json
import time

import pandas as pd

from TwitterAPI import TwitterAPI

In [2]:
# Parse the JSON settings file and build the API client from its "main" entry.
with open("../config/TwitterAPI.json") as config_file:
    config = json.load(config_file)

api = TwitterAPI(config["main"])

In [3]:
# Search window as YYYYMMDD strings; both are handed to get_tweet_geo below.
start_date, end_date = "20230203", "20230309"
# Search expression: any of the three terms, combined with the `has:geo` operator.
query = 'has:geo (#Derailment OR "Vinyl chloride" OR "East Palestine")'


def get_tweet_geo(start_date: str, end_date: str, query: str):
    """Search tweets for ``query`` and return a frame keyed by posting date.

    Uses the module-level ``api`` client. The search is issued with
    ``params={"geo": "place_id"}`` and the given start/end bounds.

    Args:
        start_date: Passed to ``api.search_tweets`` as ``start_time``.
        end_date: Passed to ``api.search_tweets`` as ``end_time``.
        query: Search expression forwarded unchanged.

    Returns:
        A frame indexed by ``Date`` (``created_at`` formatted as YYYYMMDD)
        with the columns ``id``, ``text``, ``geo`` and ``author_id``.
        ``geo`` holds the ``"place_id"`` entry of each original ``geo``
        value and ``text`` is the output of ``api.parse_tweet``.
    """
    search_result = api.search_tweets(
        query,
        params={"geo": "place_id"},
        start_time=start_date,
        end_time=end_date,
    )
    # Work on a deep copy so the object returned by the API wrapper is untouched.
    tweets = search_result.copy(deep=True)

    created = pd.to_datetime(tweets["created_at"])
    tweets["Date"] = created.dt.strftime("%Y%m%d")
    # Each geo entry is indexed by "place_id" — assumes every row has one; verify.
    tweets["geo"] = tweets["geo"].apply(lambda entry: entry["place_id"])
    tweets["text"] = api.parse_tweet(tweets["text"])

    wanted_columns = ["id", "text", "geo", "author_id"]
    return tweets.set_index(["Date"])[wanted_columns]

# Run the geo-tagged search for the window and query defined above.
df_tweets = get_tweet_geo(start_date=start_date, end_date=end_date, query=query)

In [21]:
# place_ids that occur more than once in df_tweets, as an Index.
geo = (
    df_tweets["geo"]
    .value_counts()
    .loc[lambda counts: counts > 1]
    .index
)

0be8f6b1807525ef    242
de599025180e2ee7    127
dd9c503d6c35364b    124
3df0e3eb1e91170b     86
3b77caf94bfc81fe     84
                   ... 
de7c3cc0109528d9      1
25d1ef4322ba8e3c      1
4e05bd43145ae323      1
2203fea4fe2288d3      1
018861ac17da0567      1
Name: geo, Length: 1358, dtype: int64

In [43]:
# Look up place metadata for every place_id that occurs exactly once in df_tweets.
#
# A response without a "full_name" key is treated as a failed lookup and retried
# after a pause (presumably the API's rate-limit window — TODO confirm against
# api.search_geo). Retries are bounded so that a place_id which never resolves
# cannot stall the notebook forever; such ids are collected in `geo_failed`.
GEO_RETRY_WAIT_SECONDS = 15 * 60  # pause between attempts for the same place_id
GEO_MAX_ATTEMPTS = 3  # total lookups per place_id before giving up

geo = df_tweets["geo"].value_counts()
geo = geo[geo == 1].index
geo_info = dict()
geo_failed = []  # place_ids that never returned a "full_name"
for place_id in geo:
    for attempt in range(1, GEO_MAX_ATTEMPTS + 1):
        info = api.search_geo(place_id)
        if "full_name" in info:
            geo_info[place_id] = info
            break
        # Only wait when another attempt will actually follow.
        if attempt < GEO_MAX_ATTEMPTS:
            time.sleep(GEO_RETRY_WAIT_SECONDS)
    else:
        # All attempts exhausted without a usable response: record and move on.
        geo_failed.append(place_id)

if geo_failed:
    print(
        f"No place info after {GEO_MAX_ATTEMPTS} attempts for "
        f"{len(geo_failed)} place_id(s): {geo_failed}"
    )


In [96]:
# Download the geo-tagged tweets once, keep an untouched gzip pickle of the raw
# result, then reshape it into a frame indexed by posting date.
start_date, end_date = "20230203", "20230309"
params = {"geo": "place_id"}
query = 'has:geo (#Derailment OR "Vinyl chloride" OR "East Palestine")'

df_geo = api.search_tweets(
    query,
    params=params,
    start_time=start_date,
    end_time=end_date,
)
# Persist the raw result before any column is rewritten.
df_geo.to_pickle("../data/raw_data/geo_tweet.pkl", compression="gzip")

df_geo = df_geo.assign(
    Date=pd.to_datetime(df_geo["created_at"]).dt.strftime("%Y%m%d"),
    # Each geo entry is indexed by "place_id" — assumes every row has one; verify.
    geo=df_geo["geo"].apply(lambda entry: entry["place_id"]),
    text=api.parse_tweet(df_geo["text"]),
)
df_geo = df_geo.set_index(["Date"])[["id", "text", "geo", "author_id"]]

  series = series.str.replace(r"(@[\w|\d]+|\#[\w|\d]+|https\S+)", " ")
  series = series.str.replace(s, "")
  return series.str.replace(r"\s+", " ")


In [None]:
# Download the matching tweets one day at a time and store each day's raw result
# as a gzip pickle named <YYYYMMDD>_data.pkl.
#
# The files are written to ../data/raw_data/ because that is where the loading
# cell reads them from (and where the geo pickle is stored); writing them to
# ../data/ would make a fresh top-to-bottom run fail with a missing file.
start_date = "20230203"
end_date = "20230309"
params = {"geo": "place_id"}
query = '(#Derailment OR "Vinyl chloride" OR "East Palestine")'
for date in pd.date_range(start_date, end_date):
    # Each request covers [date, date + 1 day), expressed as YYYYMMDD strings.
    start_time = date.strftime("%Y%m%d")
    end_time = (date + pd.Timedelta(1, "day")).strftime("%Y%m%d")
    result = api.search_tweets(query, params=params, start_time=start_time, end_time=end_time)
    result.to_pickle(f"../data/raw_data/{start_time}_data.pkl", compression="gzip")

In [11]:
# Combine the per-day raw pickles into one date-indexed table and export it
# as a tab-separated file.
start_date, end_date = "20230203", "20230309"
kept_columns = ["text", "id", "author_id", "created_at"]

daily_frames = []
for day in pd.date_range(start_date, end_date):
    day_tag = day.strftime("%Y%m%d")
    day_frame = pd.read_pickle(f"../data/raw_data/{day_tag}_data.pkl", compression="gzip")
    daily_frames.append(
        day_frame[kept_columns].assign(
            created_at=lambda frame: pd.to_datetime(frame["created_at"])
        )
    )

dfs = pd.concat(daily_frames)
# Index label is the posting day (YYYYMMDD); the timestamp column is then dropped.
date_labels = pd.DatetimeIndex(dfs["created_at"]).strftime("%Y%m%d")
dfs = dfs.drop(columns=["created_at"])
dfs.index = date_labels.rename("Date")
dfs["text"] = TwitterAPI.parse_tweet(dfs["text"])
dfs.to_csv("../data/RelatedTweets.csv", sep="\t")

In [56]:
# Reload the exported tweets and discard every row that has a missing value.
# NOTE(review): read_csv infers dtypes, so the all-digit "Date" index will
# likely come back as integers rather than strings — confirm downstream use.
df = pd.read_csv(
    "../data/RelatedTweets.csv",
    sep="\t",
    index_col="Date",
)
df = df.dropna()