In [28]:
from typing import Any, Dict

import pandas as pd

In [54]:
def extract_url(tweet: pd.Series, url_dict: Dict[str, Dict[str, Any]]) -> None:
    """Tally the expanded URLs of one tweet into ``url_dict`` (mutated in place).

    Args:
        tweet: One tweet record. Only ``tweet["entities"]`` and
            ``tweet["created_at"]`` are read, so any object supporting those
            two lookups works.
        url_dict: Accumulator keyed by expanded URL. Each value is a dict with
            ``"Count"`` (how many times the URL has been passed through this
            function) and ``"CreatedAt"`` (the ``created_at`` of the tweet in
            which this function first encountered the URL).

    Returns:
        None. The result is the mutation of ``url_dict``.
    """
    entities = tweet["entities"]
    if not isinstance(entities, dict):
        # Anything that is not a dict (presumably NaN/None for tweets that
        # carry no entities) contributes nothing.
        return

    # ``or []`` also covers an explicit ``"urls": None``, which a ``.get``
    # default would not, since the default only applies to a missing key.
    for url_entity in entities.get("urls") or []:
        url = url_entity.get("expanded_url")
        if url is None:
            # Skip entities without an expanded URL instead of aborting the
            # whole aggregation with a KeyError.
            continue
        if url in url_dict:
            url_dict[url]["Count"] += 1
        else:
            url_dict[url] = {"Count": 1, "CreatedAt": tweet["created_at"]}

In [55]:
# Walk the daily tweet dumps between start_date and end_date (inclusive),
# recording how many tweets each day holds and tallying every URL they contain.
start_date = "20230203"
end_date = "20230228"
daily_counts = dict()  # "YYYYMMDD" -> number of tweets in that day's dump
url_dict = dict()  # filled in place by extract_url

for day in pd.date_range(start_date, end_date):
    date = day.strftime("%Y%m%d")
    # NOTE(security): unpickling can execute arbitrary code; only load dumps
    # that were produced by a trusted pipeline.
    df = pd.read_pickle(f"../data/twitter/pkl/{date}_data.pkl", compression="gzip").sort_values(["created_at"])
    daily_counts[date] = len(df)
    # Explicit loop instead of df.apply: extract_url is called purely for its
    # side effect on url_dict, and pandas does not guarantee how apply invokes
    # functions that mutate state (nor is the resulting Series of None needed).
    for _row_label, tweet in df.iterrows():
        extract_url(tweet, url_dict)

# Turn the per-day tallies into a date-indexed Series and plot it.
tweet_count = pd.Series(daily_counts).sort_index()
tweet_count.index = pd.DatetimeIndex(tweet_count.index)
tweet_count.plot(figsize=(10, 5), grid=True)

In [63]:
# One row per distinct URL. Building from records (rather than one column per
# URL followed by a transpose) keeps Count as an integer column instead of
# object dtype, avoids creating a very wide intermediate frame, and still
# yields the expected columns when url_dict is empty.
df_urls = pd.DataFrame(
    list(url_dict.values()),
    index=list(url_dict.keys()),
    columns=["Count", "CreatedAt"],
)
df_urls = df_urls.sort_values(["Count"], ascending=False)
# Domain = scheme + host, i.e. everything before the third "/" of the URL.
df_urls["Domain"] = ["/".join(url.split("/")[:3]) for url in df_urls.index]
df_urls.index.name = "URL"
df_urls

Unnamed: 0,Count,CreatedAt,Domain
https://twitter.com/nicksortor/status/1626327835668320256/video/1,15082,2023-02-16T21:09:06.000Z,https://twitter.com
https://twitter.com/Ultrademic/status/1624958707330056192/video/1,10991,2023-02-13T02:28:40.000Z,https://twitter.com
https://twitter.com/realstewpeters/status/1625243042029400064/video/1,10478,2023-02-13T23:04:45.000Z,https://twitter.com
https://twitter.com/JackPosobiec/status/1628430000050274310/video/1,9517,2023-02-22T16:22:21.000Z,https://twitter.com
https://twitter.com/realstewpeters/status/1626377177829715968/photo/1,9280,2023-02-17T00:25:10.000Z,https://twitter.com
...,...,...,...
https://www.i24news.tv/en/news/international/americas/1676666440-east-palestine-trainwreck-ohio-locals-concerned-about-leaked-toxins?utm_medium=Social&utm_source=Twitter#Echobox=1676666840,1,2023-02-18T01:34:14.000Z,https://www.i24news.tv
https://www.knopnews2.com/2023/02/18/donald-trump-visit-east-palestine-ohio-after-toxic-train-derailment-son-says/?utm_campaign=snd-autopilot&utm_source=twitter&utm_medium=social&utm_campaign=snd&utm_content=knop,1,2023-02-18T01:34:02.000Z,https://www.knopnews2.com
https://eu.dispatch.com/story/news/local/2023/02/14/is-the-ohio-river-contaminated-east-palestine-train-derailment-sparks-concerns-over-water/69900528007/,1,2023-02-18T01:33:59.000Z,https://eu.dispatch.com
https://twitter.com/xformed/status/1626741217407950848,1,2023-02-18T01:33:50.000Z,https://twitter.com


In [None]:
# Persist the URL table to disk. NOTE: the file is tab-separated even though
# its name ends in ".csv", so readers must pass sep="\t" when loading it.
df_urls.to_csv(
    "../data/twitter/URL.csv",
    sep="\t",
    index_label="URL",
)

In [83]:
# count() tallies the non-null entries of each column per group, so here
# "Count" is the number of df_urls rows (URLs) under a domain, not the sum of
# the per-URL Count values.
urls_per_domain = df_urls.groupby(["Domain"]).count()
urls_per_domain.sort_values(["Count"], ascending=False).head(5)

Unnamed: 0_level_0,Count,CreatedAt
Domain,Unnamed: 1_level_1,Unnamed: 2_level_1
https://twitter.com,59348,59348
http://dlvr.it,2218,2218
https://youtu.be,1521,1521
https://www.msn.com,1409,1409
https://ift.tt,1366,1366


In [84]:
# Show the first five rows of df_urls whose domain is msn.com, keeping the
# frame's current row order.
is_msn = df_urls["Domain"].str.startswith("https://www.msn.com")
df_urls[is_msn].head(5)

Unnamed: 0_level_0,Count,CreatedAt,Domain
URL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
https://www.msn.com/en-us/news/other/trump-to-visit-east-palestine-after-toxic-train-derailment/ar-AA17D1U5,256,2023-02-17T23:41:42.000Z,https://www.msn.com
https://www.msn.com/en-us/news/politics/ohio-train-derailment-trump-to-donate-supplies-during-visit-to-east-palestine/ar-AA17O3Z1,55,2023-02-22T18:34:21.000Z,https://www.msn.com
https://www.msn.com/en-us/news/politics/trump-raises-eyebrows-with-comment-about-east-palestine-mayor-he-s-very-easy-to-find/ar-AA17OwIE?ocid=msedgntp&cvid=3b4bee7a689548049b89ee7c7be2bbcf,34,2023-02-23T12:52:55.000Z,https://www.msn.com
https://www.msn.com/en-us/news/us/fema-withholds-help-as-fed-assistance-sought-for-east-palestine/ar-AA17zKEv,29,2023-02-17T04:06:24.000Z,https://www.msn.com
https://www.msn.com/en-us/news/politics/trump-who-rolled-back-rail-regulations-slams-biden-during-visit-to-east-palestine-after-train-derailment/ar-AA17Ozfz?ocid=msedgntp&cvid=dce4f03107724a35b33c16e24ae38b51,24,2023-02-23T01:28:25.000Z,https://www.msn.com
