In [475]:
import os
import csv
import json
import numpy as np
import pandas as pd
import geopandas as gpd
import pycountry
from datetime import datetime
from pprint import pprint
from shapely.geometry import box

## Tempo tweet

In [2]:
file_path = "../../extract/output/twitter_tweet_tempodotco.json"

In [3]:
pd.read_json(file_path)

Unnamed: 0,data
text,#TempoThread\n\nDanantara adalah kapitalisme a...
id,1892923628037021960
edit_history_tweet_ids,[1892923628037021960]


### Word cleaning

In [4]:
with open(file_path, "r") as f:
  data = json.loads(f.read())

In [5]:
data

{'data': {'text': '#TempoThread\n\nDanantara adalah kapitalisme ala Prabowo, sebuah langkah ambisius untuk mewujudkan cita-cita ayahnya yang sudah dirancang sejak 1980-an.\n\nA Thread https://t.co/nUHckLhqrS',
  'id': '1892923628037021960',
  'edit_history_tweet_ids': ['1892923628037021960']}}

In [6]:
data["data"]

{'text': '#TempoThread\n\nDanantara adalah kapitalisme ala Prabowo, sebuah langkah ambisius untuk mewujudkan cita-cita ayahnya yang sudah dirancang sejak 1980-an.\n\nA Thread https://t.co/nUHckLhqrS',
 'id': '1892923628037021960',
 'edit_history_tweet_ids': ['1892923628037021960']}

In [7]:
text = data["data"]["text"]

text

'#TempoThread\n\nDanantara adalah kapitalisme ala Prabowo, sebuah langkah ambisius untuk mewujudkan cita-cita ayahnya yang sudah dirancang sejak 1980-an.\n\nA Thread https://t.co/nUHckLhqrS'

In [8]:
raw_words = text.split()

raw_words

['#TempoThread',
 'Danantara',
 'adalah',
 'kapitalisme',
 'ala',
 'Prabowo,',
 'sebuah',
 'langkah',
 'ambisius',
 'untuk',
 'mewujudkan',
 'cita-cita',
 'ayahnya',
 'yang',
 'sudah',
 'dirancang',
 'sejak',
 '1980-an.',
 'A',
 'Thread',
 'https://t.co/nUHckLhqrS']

In [9]:
raw_words[5]

'Prabowo,'

In [10]:
cleaned_words = [word.rstrip(',.') for word in raw_words]

cleaned_words

['#TempoThread',
 'Danantara',
 'adalah',
 'kapitalisme',
 'ala',
 'Prabowo',
 'sebuah',
 'langkah',
 'ambisius',
 'untuk',
 'mewujudkan',
 'cita-cita',
 'ayahnya',
 'yang',
 'sudah',
 'dirancang',
 'sejak',
 '1980-an',
 'A',
 'Thread',
 'https://t.co/nUHckLhqrS']

In [11]:
cleaned_words[5]

'Prabowo'

In [12]:
len(cleaned_words)

21

### Create datetime

In [13]:
created_time = os.path.getctime(file_path)

created_time

1740588991.929889

In [14]:
created_dt = datetime.fromtimestamp(created_time)

created_dt

datetime.datetime(2025, 2, 26, 23, 56, 31, 929889)

### Save clean data

In [15]:
new_data = {
  "tweet_id": data["data"]["id"],
  "word_count": len(cleaned_words),
  "words": cleaned_words,
  "created_at": str(created_dt),
}

In [16]:
with open("output/word_counts_tempodotco.csv", "w", newline="", encoding="utf-8") as f:
  field_names = ["tweet_id", "word_count", "words", "created_at"]
  writer = csv.DictWriter(f, fieldnames=field_names)

  if f.tell() == 0:
    writer.writeheader()

  writer.writerow(new_data)

In [17]:
df = pd.read_csv("output/word_counts_tempodotco.csv")

df

Unnamed: 0,tweet_id,word_count,words,created_at
0,1892923628037021960,21,"['#TempoThread', 'Danantara', 'adalah', 'kapit...",2025-02-26 23:56:31.929889


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweet_id    1 non-null      int64 
 1   word_count  1 non-null      int64 
 2   words       1 non-null      object
 3   created_at  1 non-null      object
dtypes: int64(2), object(2)
memory usage: 164.0+ bytes


In [19]:
with open("output/word_counts_tempodotco.json", "w", encoding="utf-8") as f:
  f.write(json.dumps(new_data, indent=2))

In [21]:
df2 = pd.read_json("output/word_counts_tempodotco.json")

df2.head()

Unnamed: 0,tweet_id,word_count,words,created_at
0,1892923628037021952,21,#TempoThread,2025-02-26 23:56:31.929889
1,1892923628037021952,21,Danantara,2025-02-26 23:56:31.929889
2,1892923628037021952,21,adalah,2025-02-26 23:56:31.929889
3,1892923628037021952,21,kapitalisme,2025-02-26 23:56:31.929889
4,1892923628037021952,21,ala,2025-02-26 23:56:31.929889


In [22]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   tweet_id    21 non-null     int64         
 1   word_count  21 non-null     int64         
 2   words       21 non-null     object        
 3   created_at  21 non-null     datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 804.0+ bytes


## Ferry Irwandy tweet

Data cleaning with only python

In [347]:
file_path = "../../extract/output/twitter_tweet_irwndfrry.json"

In [348]:
pd.read_json(file_path)

Unnamed: 0,data,includes
referenced_tweets,"[{'type': 'quoted', 'id': '1890992087614808342'}]",
entities,"{'urls': [{'start': 276, 'end': 299, 'url': 'h...",
text,"Banyak yang minta jelasin soal Danatara ini, o...",
id,1891503790353760292,
edit_history_tweet_ids,[1891503790353760292],
possibly_sensitive,False,
created_at,2025-02-17T15:03:40.000Z,
context_annotations,"[{'domain': {'id': '46', 'name': 'Business Tax...",
author_id,251088685,
public_metrics,"{'retweet_count': 36062, 'reply_count': 1299, ...",


In [349]:
with open(file_path, "r") as f:
  data = json.loads(f.read())

In [350]:
data["data"]

{'referenced_tweets': [{'type': 'quoted', 'id': '1890992087614808342'}],
 'entities': {'urls': [{'start': 276,
    'end': 299,
    'url': 'https://t.co/qqZBckrWoi',
    'expanded_url': 'https://twitter.com/tempodotco/status/1890992087614808342',
    'display_url': 'x.com/tempodotco/sta…'}]},
 'text': 'Banyak yang minta jelasin soal Danatara ini, oke tak jelasin dgn simpel, simak dgn santai , ya\n\ndulu BUMN kalau dapat dividen, sebagian dividennya dikasih ke APBN terus duitnya dipake buat belanja\n\nNah sekarang Danantara, duitnya gak dipake lgsg untuk belanja, tapi investasi https://t.co/qqZBckrWoi',
 'id': '1891503790353760292',
 'edit_history_tweet_ids': ['1891503790353760292'],
 'possibly_sensitive': False,
 'created_at': '2025-02-17T15:03:40.000Z',
 'context_annotations': [{'domain': {'id': '46',
    'name': 'Business Taxonomy',
    'description': 'Categories within Brand Verticals that narrow down the scope of Brands'},
   'entity': {'id': '1557696848252391426',
    'name': 'Fina

In [351]:
data["includes"]

{'tweets': [{'attachments': {'media_keys': ['3_1890991719925350400']},
   'entities': {'hashtags': [{'start': 190, 'end': 201, 'tag': 'Tempodotco'},
     {'start': 202, 'end': 212, 'tag': 'TempoPlus'}],
    'urls': [{'start': 213,
      'end': 236,
      'url': 'https://t.co/O9f1JbLDBa',
      'expanded_url': 'https://x.com/tempodotco/status/1890992087614808342/photo/1',
      'display_url': 'pic.x.com/O9f1JbLDBa',
      'media_key': '3_1890991719925350400'}]},
   'text': 'Bagi Prabowo, Danantara adalah proyek historis.\n\nAyahnya, Sumitro Djojohadikusumo, punya ide membangun sebuah lembaga pengelola 1-5 persen laba Badan Usaha Milik Negara pada akhir 1980-an.\n\n#Tempodotco #TempoPlus https://t.co/O9f1JbLDBa',
   'id': '1890992087614808342',
   'edit_history_tweet_ids': ['1890992087614808342'],
   'possibly_sensitive': False,
   'created_at': '2025-02-16T05:10:21.000Z',
   'context_annotations': [{'domain': {'id': '47',
      'name': 'Brand',
      'description': 'Brands and Companies

### Word cleaning

In [352]:
text = data["data"]["text"]
raw_words = text.split()

raw_words

['Banyak',
 'yang',
 'minta',
 'jelasin',
 'soal',
 'Danatara',
 'ini,',
 'oke',
 'tak',
 'jelasin',
 'dgn',
 'simpel,',
 'simak',
 'dgn',
 'santai',
 ',',
 'ya',
 'dulu',
 'BUMN',
 'kalau',
 'dapat',
 'dividen,',
 'sebagian',
 'dividennya',
 'dikasih',
 'ke',
 'APBN',
 'terus',
 'duitnya',
 'dipake',
 'buat',
 'belanja',
 'Nah',
 'sekarang',
 'Danantara,',
 'duitnya',
 'gak',
 'dipake',
 'lgsg',
 'untuk',
 'belanja,',
 'tapi',
 'investasi',
 'https://t.co/qqZBckrWoi']

In [353]:
uncleaned_words = [word.rstrip(',.') for word in raw_words]

uncleaned_words

['Banyak',
 'yang',
 'minta',
 'jelasin',
 'soal',
 'Danatara',
 'ini',
 'oke',
 'tak',
 'jelasin',
 'dgn',
 'simpel',
 'simak',
 'dgn',
 'santai',
 '',
 'ya',
 'dulu',
 'BUMN',
 'kalau',
 'dapat',
 'dividen',
 'sebagian',
 'dividennya',
 'dikasih',
 'ke',
 'APBN',
 'terus',
 'duitnya',
 'dipake',
 'buat',
 'belanja',
 'Nah',
 'sekarang',
 'Danantara',
 'duitnya',
 'gak',
 'dipake',
 'lgsg',
 'untuk',
 'belanja',
 'tapi',
 'investasi',
 'https://t.co/qqZBckrWoi']

In [354]:
uncleaned_words[15]

''

In [355]:
cleaned_words = list(filter(None, uncleaned_words))

cleaned_words

['Banyak',
 'yang',
 'minta',
 'jelasin',
 'soal',
 'Danatara',
 'ini',
 'oke',
 'tak',
 'jelasin',
 'dgn',
 'simpel',
 'simak',
 'dgn',
 'santai',
 'ya',
 'dulu',
 'BUMN',
 'kalau',
 'dapat',
 'dividen',
 'sebagian',
 'dividennya',
 'dikasih',
 'ke',
 'APBN',
 'terus',
 'duitnya',
 'dipake',
 'buat',
 'belanja',
 'Nah',
 'sekarang',
 'Danantara',
 'duitnya',
 'gak',
 'dipake',
 'lgsg',
 'untuk',
 'belanja',
 'tapi',
 'investasi',
 'https://t.co/qqZBckrWoi']

### Context Annotation cleaning

In [356]:
data["data"]["context_annotations"][0]

{'domain': {'id': '46',
  'name': 'Business Taxonomy',
  'description': 'Categories within Brand Verticals that narrow down the scope of Brands'},
 'entity': {'id': '1557696848252391426',
  'name': 'Financial Services Business',
  'description': 'Brands, companies, advertisers and every non-person handle with the profit intent related to Banks, Credit cards, Insurance, Investments, Stocks '}}

In [357]:
data["data"]["context_annotations"][0]["domain"]["id"]

'46'

In [358]:
data["data"]["context_annotations"][0]["domain"]["name"]

'Business Taxonomy'

In [359]:
data["data"]["context_annotations"][0]["domain"]["description"]

'Categories within Brand Verticals that narrow down the scope of Brands'

In [360]:
data["data"]["context_annotations"][0]["entity"]["id"]

'1557696848252391426'

In [361]:
data["data"]["context_annotations"][0]["entity"]["name"]

'Financial Services Business'

In [362]:
data["data"]["context_annotations"][0]["entity"]["description"]

'Brands, companies, advertisers and every non-person handle with the profit intent related to Banks, Credit cards, Insurance, Investments, Stocks '

In [363]:
context_annotations_database = []

for annotation in data["data"]["context_annotations"]:
  domain_id = annotation["domain"]["id"]
  entity_id = annotation["entity"]["id"]
  annotation_data = {
    "id": f"{domain_id}-{entity_id}",
    "domain_id": domain_id,
    "domain_name": annotation["domain"]["name"],
    "entity_id": entity_id,
    "entity_name": annotation["entity"]["name"],
  }

  try:
    annotation_data["domain_description"] = annotation["domain"]["description"]
  except:
    print(f"{annotation["domain"]["id"]} description is not available")

  try:
    annotation_data["entity_description"] = annotation["entity"]["description"]
  except:
    print(f"{annotation["entity"]["id"]} description is not available")

  # For main data
  context_annotations_database.append(annotation_data)

1536331416954253312 description is not available
1536331416954253312 description is not available


In [364]:
len(context_annotations_database)

5

In [365]:
context_annotations_database[0]

{'id': '46-1557696848252391426',
 'domain_id': '46',
 'domain_name': 'Business Taxonomy',
 'entity_id': '1557696848252391426',
 'entity_name': 'Financial Services Business',
 'domain_description': 'Categories within Brand Verticals that narrow down the scope of Brands',
 'entity_description': 'Brands, companies, advertisers and every non-person handle with the profit intent related to Banks, Credit cards, Insurance, Investments, Stocks '}

In [366]:
with open("output/context_annotations.csv", "w", newline="") as f:
  fieldnames = context_annotations_database[0].keys()
  writer = csv.DictWriter(f, fieldnames=fieldnames)

  if f.tell() == 0:
    writer.writeheader()

  writer.writerows(context_annotations_database)

In [367]:
context_annotations = [context["id"] for context in context_annotations_database]

context_annotations

['46-1557696848252391426',
 '46-1557697218248773632',
 '88-1536331416954253312',
 '131-1095391406816784384',
 '131-1536331416954253312']

### Language cleaning

In [368]:
data["data"]["lang"]

'in'

language iso-code reference: https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes

In [369]:
print(
  pycountry.languages.get(alpha_2="in")
)

None


In [370]:
print(
  pycountry.languages.get(alpha_2="id")
)

Language(alpha_2='id', alpha_3='ind', name='Indonesian', scope='I', type='L')


In [371]:
custom_mapping = {
  "in": "id"
}

def get_language_name(lang_code):
  lang_code = custom_mapping.get(lang_code, lang_code)
  
  try:
    return pycountry.languages.get(alpha_2=lang_code)
  except AttributeError:
    return None

In [372]:
get_language_name("id")

Language(alpha_2='id', alpha_3='ind', name='Indonesian', scope='I', type='L')

In [373]:
get_language_name("in")

Language(alpha_2='id', alpha_3='ind', name='Indonesian', scope='I', type='L')

In [374]:
language = get_language_name(data["data"]["lang"])
language_name = language.name
language_code = language.alpha_2

In [375]:
print(language_name, language_code)

Indonesian id


### Save clean data

In [376]:
with open("output/word_counts_irwndfrry.json", "w") as f:
  new_data = {
    "tweet_id": data["data"]["id"],
    "author_id": data["data"]["author_id"],
    "sensitive": data["data"]["possibly_sensitive"],
    "word_count": len(cleaned_words),
    "words": cleaned_words,
    "language_name": language_name,
    "language_code": language_code,
    "created_at": data["data"]["created_at"],
    "retweet_count": data["data"]["public_metrics"]["retweet_count"],
    "reply_count": data["data"]["public_metrics"]["reply_count"],
    "like_count": data["data"]["public_metrics"]["like_count"],
    "quote_count": data["data"]["public_metrics"]["quote_count"],
    "bookmark_count": data["data"]["public_metrics"]["bookmark_count"],
    "impression_count": data["data"]["public_metrics"]["impression_count"],
  }

  for index, context in enumerate(context_annotations):
    new_data[f"annotation_{index}"] = context

  try:
    for ref_tweet in data["data"]["referenced_tweets"]:
      if ref_tweet['type'] == "retweeted":
        new_data[f"reference_tweet_id_retweeted"] = ref_tweet['id']
      elif ref_tweet['type'] == "quoted":
        new_data[f"reference_tweet_id_quoted"] = ref_tweet['id']
      elif ref_tweet['type'] == "replied_to":
        new_data[f"reference_tweet_id_replied_to"] = ref_tweet['id']
  except:
    print("no reference tweet")

  f.write(json.dumps(new_data, indent=2))

In [377]:
pd.read_json("output/word_counts_irwndfrry.json").head()

Unnamed: 0,tweet_id,author_id,sensitive,word_count,words,language_name,language_code,created_at,retweet_count,reply_count,like_count,quote_count,bookmark_count,impression_count,annotation_0,annotation_1,annotation_2,annotation_3,annotation_4,reference_tweet_id_quoted
0,1891503790353760256,251088685,False,43,Banyak,Indonesian,id,2025-02-17 15:03:40+00:00,36062,1299,99718,1619,35057,5761847,46-1557696848252391426,46-1557697218248773632,88-1536331416954253312,131-1095391406816784384,131-1536331416954253312,1890992087614808320
1,1891503790353760256,251088685,False,43,yang,Indonesian,id,2025-02-17 15:03:40+00:00,36062,1299,99718,1619,35057,5761847,46-1557696848252391426,46-1557697218248773632,88-1536331416954253312,131-1095391406816784384,131-1536331416954253312,1890992087614808320
2,1891503790353760256,251088685,False,43,minta,Indonesian,id,2025-02-17 15:03:40+00:00,36062,1299,99718,1619,35057,5761847,46-1557696848252391426,46-1557697218248773632,88-1536331416954253312,131-1095391406816784384,131-1536331416954253312,1890992087614808320
3,1891503790353760256,251088685,False,43,jelasin,Indonesian,id,2025-02-17 15:03:40+00:00,36062,1299,99718,1619,35057,5761847,46-1557696848252391426,46-1557697218248773632,88-1536331416954253312,131-1095391406816784384,131-1536331416954253312,1890992087614808320
4,1891503790353760256,251088685,False,43,soal,Indonesian,id,2025-02-17 15:03:40+00:00,36062,1299,99718,1619,35057,5761847,46-1557696848252391426,46-1557697218248773632,88-1536331416954253312,131-1095391406816784384,131-1536331416954253312,1890992087614808320


### Re-cleaning data for the referenced tweet

In [378]:
data["includes"]["tweets"][0]

{'attachments': {'media_keys': ['3_1890991719925350400']},
 'entities': {'hashtags': [{'start': 190, 'end': 201, 'tag': 'Tempodotco'},
   {'start': 202, 'end': 212, 'tag': 'TempoPlus'}],
  'urls': [{'start': 213,
    'end': 236,
    'url': 'https://t.co/O9f1JbLDBa',
    'expanded_url': 'https://x.com/tempodotco/status/1890992087614808342/photo/1',
    'display_url': 'pic.x.com/O9f1JbLDBa',
    'media_key': '3_1890991719925350400'}]},
 'text': 'Bagi Prabowo, Danantara adalah proyek historis.\n\nAyahnya, Sumitro Djojohadikusumo, punya ide membangun sebuah lembaga pengelola 1-5 persen laba Badan Usaha Milik Negara pada akhir 1980-an.\n\n#Tempodotco #TempoPlus https://t.co/O9f1JbLDBa',
 'id': '1890992087614808342',
 'edit_history_tweet_ids': ['1890992087614808342'],
 'possibly_sensitive': False,
 'created_at': '2025-02-16T05:10:21.000Z',
 'context_annotations': [{'domain': {'id': '47',
    'name': 'Brand',
    'description': 'Brands and Companies'},
   'entity': {'id': '1194281356999573504

In [379]:
def clean_words(data):
  # Word cleaning
  text = data["text"]
  raw_words = text.split()
  uncleaned_words = [word.rstrip(',.') for word in raw_words]
  cleaned_words = list(filter(None, uncleaned_words))

  return cleaned_words
  

In [380]:
def clean_context_annotations(data):
  context_annotations_database = []

  for annotation in data["context_annotations"]:
    domain_id = annotation["domain"]["id"]
    entity_id = annotation["entity"]["id"]
    annotation_data = {
      "id": f"{domain_id}-{entity_id}",
      "domain_id": domain_id,
      "domain_name": annotation["domain"]["name"],
      "entity_id": entity_id,
      "entity_name": annotation["entity"]["name"],
    }

    try:
      annotation_data["domain_description"] = annotation["domain"]["description"]
    except:
      print(f"{annotation["domain"]["id"]} description is not available")

    try:
      annotation_data["entity_description"] = annotation["entity"]["description"]
    except:
      print(f"{annotation["entity"]["id"]} description is not available")

    # For main data
    context_annotations_database.append(annotation_data)

  # Save with append into csv
  with open("output/context_annotations.csv", "a", newline="") as f:
    fieldnames = context_annotations_database[0].keys()
    writer = csv.DictWriter(f, fieldnames=fieldnames)

    if f.tell() == 0:
      writer.writeheader()

    writer.writerows(context_annotations_database)

  return [context["id"] for context in context_annotations_database]
  

In [381]:
def clean_language(data):
  custom_mapping = {
    "in": "id"
  }

  def get_language_name(lang_code):
    lang_code = custom_mapping.get(lang_code, lang_code)
    
    try:
      return pycountry.languages.get(alpha_2=lang_code)
    except AttributeError:
      return None
  
  language = get_language_name(data["lang"])

  return (language.name, language.alpha_2)

In [382]:
def transform_tweet(data):
  cleaned_words = clean_words(data)
  language_name, language_code = clean_language(data)
  context_annotations = clean_context_annotations(data)

  with open(f"output/word_counts_{data["id"]}.json", "w") as f:
    new_data = {
      "tweet_id": data["id"],
      "author_id": data["author_id"],
      "sensitive": data["possibly_sensitive"],
      "word_count": len(cleaned_words),
      "words": cleaned_words,
      "language_name": language_name,
      "language_code": language_code,
      "created_at": data["created_at"],
      "retweet_count": data["public_metrics"]["retweet_count"],
      "reply_count": data["public_metrics"]["reply_count"],
      "like_count": data["public_metrics"]["like_count"],
      "quote_count": data["public_metrics"]["quote_count"],
      "bookmark_count": data["public_metrics"]["bookmark_count"],
      "impression_count": data["public_metrics"]["impression_count"],
    }

    for index, context in enumerate(context_annotations):
      new_data[f"annotation_{index}"] = context

    try:
      for ref_tweet in data["data"]["referenced_tweets"]:
        if ref_tweet['type'] == "retweeted":
          new_data[f"reference_tweet_id_retweeted"] = ref_tweet['id']
        elif ref_tweet['type'] == "quoted":
          new_data[f"reference_tweet_id_quoted"] = ref_tweet['id']
        elif ref_tweet['type'] == "replied_to":
          new_data[f"reference_tweet_id_replied_to"] = ref_tweet['id']
    except:
      print("no reference tweet")

    f.write(json.dumps(new_data, indent=2))

In [383]:
for item in data["includes"]["tweets"]:
  transform_tweet(item)

1536331416954253312 description is not available
1536331416954253312 description is not available
1328727386934820865 description is not available
1328727386934820865 description is not available
no reference tweet


## Ferry Irwandi User

Flatten data with pandas json normalize

In [131]:
file_path = "../../extract/output/twitter_user_irwndfrry.json"

In [132]:
pd.read_json(file_path)

Unnamed: 0,data
verified,False
profile_image_url,https://pbs.twimg.com/profile_images/188894133...
protected,False
username,irwndfrry
pinned_tweet_id,1874839670984753388
name,Ferry Irwandi
id,251088685
created_at,2011-02-12T12:18:46.000Z
public_metrics,"{'followers_count': 208273, 'following_count':..."
description,


In [138]:
with open(file_path, "r") as f:
  data = json.loads(f.read())

In [139]:
data

{'data': {'verified': False,
  'profile_image_url': 'https://pbs.twimg.com/profile_images/1888941332389068800/zDL30fow_normal.jpg',
  'protected': False,
  'username': 'irwndfrry',
  'pinned_tweet_id': '1874839670984753388',
  'name': 'Ferry Irwandi',
  'id': '251088685',
  'created_at': '2011-02-12T12:18:46.000Z',
  'public_metrics': {'followers_count': 208273,
   'following_count': 1053,
   'tweet_count': 33933,
   'listed_count': 78,
   'like_count': 2027,
   'media_count': 1463},
  'description': ''}}

In [140]:
df = pd.json_normalize(data["data"], sep='_')

df

Unnamed: 0,verified,profile_image_url,protected,username,pinned_tweet_id,name,id,created_at,description,public_metrics_followers_count,public_metrics_following_count,public_metrics_tweet_count,public_metrics_listed_count,public_metrics_like_count,public_metrics_media_count
0,False,https://pbs.twimg.com/profile_images/188894133...,False,irwndfrry,1874839670984753388,Ferry Irwandi,251088685,2011-02-12T12:18:46.000Z,,208273,1053,33933,78,2027,1463


In [143]:
df.to_csv("output/users.csv", index=False)

## Tweets with geotag

Data cleaning with numpy, pandas, geopandas

In [384]:
file_path = "../../extract/output/twitter_geotags.json"

In [None]:
pd.read_json(file_path)

In [386]:
with open(file_path, "r") as f:
  data = json.loads(f.read())

In [387]:
df = pd.json_normalize(data, sep="_")

df

Unnamed: 0,data,includes_places,includes_tweets,meta_newest_id,meta_oldest_id,meta_result_count
0,[{'text': 'Kota Makassar (ᨀᨚᨈ ᨆᨀᨔᨑ) sebelumnya...,"[{'place_type': 'city', 'id': 'ec6afdae43138dc...",[{'text': '#30DayMapChallenge 2024 Day 20: @op...,1896153702140088593,1894706768649023792,3


In [164]:
df["data"].apply(lambda row: pprint(row))

[{'author_id': '475862742',
  'created_at': '2025-03-02T11:00:46.000Z',
  'edit_history_tweet_ids': ['1896153702140088593'],
  'entities': {'hashtags': [{'end': 125, 'start': 114, 'tag': 'FaizGeotag'}],
               'urls': [{'display_url': 'x.com/zaenun_faiz/st…',
                         'end': 150,
                         'expanded_url': 'https://x.com/zaenun_faiz/status/1859072685105705311',
                         'start': 127,
                         'url': 'https://t.co/i9D72lPthV'}]},
  'geo': {'place_id': 'ec6afdae43138dcd'},
  'id': '1896153702140088593',
  'in_reply_to_user_id': '475862742',
  'lang': 'in',
  'possibly_sensitive': False,
  'public_metrics': {'bookmark_count': 0,
                     'impression_count': 6,
                     'like_count': 0,
                     'quote_count': 0,
                     'reply_count': 0,
                     'retweet_count': 0},
  'referenced_tweets': [{'id': '1859072685105705311', 'type': 'quoted'},
                     

0    None
Name: data, dtype: object

### Make list of data into dataframe row

In [388]:
df_data = df["data"].explode("data")

df_data

0    {'text': 'Kota Makassar (ᨀᨚᨈ ᨆᨀᨔᨑ) sebelumnya ...
1    {'text': 'Prambanan is a 9th-century Hindu tem...
2    {'text': 'Borobudur is a 9th-century Mahayana ...
Name: data, dtype: object

### Convert single column with dict into series column

In [389]:
df_data_series = df_data.apply(pd.Series)

df_data_series

Unnamed: 0,text,edit_history_tweet_ids,referenced_tweets,author_id,geo,lang,id,entities,in_reply_to_user_id,created_at,public_metrics,possibly_sensitive,attachments
0,Kota Makassar (ᨀᨚᨈ ᨆᨀᨔᨑ) sebelumnya bernama Uj...,[1896153702140088593],"[{'type': 'quoted', 'id': '1859072685105705311...",475862742,{'place_id': 'ec6afdae43138dcd'},in,1896153702140088593,"{'urls': [{'start': 127, 'end': 150, 'url': 'h...",475862742.0,2025-03-02T11:00:46.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",False,
1,Prambanan is a 9th-century Hindu temple compou...,[1894707636278628411],"[{'type': 'quoted', 'id': '1587346563805822983'}]",475862742,{'place_id': '1d4c6ad315aabd84'},en,1894707636278628411,"{'annotations': [{'start': 74, 'end': 83, 'pro...",,2025-02-26T11:14:37.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",False,
2,Borobudur is a 9th-century Mahayana Buddhist t...,[1894706768649023792],,475862742,{'place_id': '36cb70a6b6dc142a'},en,1894706768649023792,"{'annotations': [{'start': 0, 'end': 8, 'proba...",,2025-02-26T11:11:10.000Z,"{'retweet_count': 0, 'reply_count': 1, 'like_c...",False,{'media_keys': ['3_1894706765788479488']}


### Remove edit_history, in_reply_user_id, attachments and entities

In [390]:
df_data_series = df_data_series.drop(columns=["edit_history_tweet_ids", "in_reply_to_user_id", "attachments", "entities"])

### Parse geo place id

In [391]:
df_data_series["geo"].apply(lambda row: row["place_id"])

0    ec6afdae43138dcd
1    1d4c6ad315aabd84
2    36cb70a6b6dc142a
Name: geo, dtype: object

In [392]:
df_data_series["place_id"] = df_data_series["geo"].apply(lambda row: row["place_id"])

In [393]:
df_data_series

Unnamed: 0,text,referenced_tweets,author_id,geo,lang,id,created_at,public_metrics,possibly_sensitive,place_id
0,Kota Makassar (ᨀᨚᨈ ᨆᨀᨔᨑ) sebelumnya bernama Uj...,"[{'type': 'quoted', 'id': '1859072685105705311...",475862742,{'place_id': 'ec6afdae43138dcd'},in,1896153702140088593,2025-03-02T11:00:46.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",False,ec6afdae43138dcd
1,Prambanan is a 9th-century Hindu temple compou...,"[{'type': 'quoted', 'id': '1587346563805822983'}]",475862742,{'place_id': '1d4c6ad315aabd84'},en,1894707636278628411,2025-02-26T11:14:37.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",False,1d4c6ad315aabd84
2,Borobudur is a 9th-century Mahayana Buddhist t...,,475862742,{'place_id': '36cb70a6b6dc142a'},en,1894706768649023792,2025-02-26T11:11:10.000Z,"{'retweet_count': 0, 'reply_count': 1, 'like_c...",False,36cb70a6b6dc142a


In [394]:
df_data_series = df_data_series.drop(columns=["geo"])

### Clean language

In [395]:
custom_mapping = {
  "in": "id"
}

def get_language_name(lang_code):
  lang_code = custom_mapping.get(lang_code, lang_code)
  
  try:
    return pycountry.languages.get(alpha_2=lang_code)
  except AttributeError:
    return None

In [396]:
df_data_series["language_name"] = df_data_series["lang"].apply(lambda row: get_language_name(row).name)
df_data_series["language_code"] = df_data_series["lang"].apply(lambda row: get_language_name(row).alpha_2)

In [397]:
df_data_series = df_data_series.drop(columns=["lang"])

### Rename id to tweet_id

In [400]:
df_data_series = df_data_series.rename(
  columns={"id": "tweet_id", "possibly_sensitive": "sensitive"}
)

### Spread public metrics

In [402]:
df_metrics_series = df_data_series["public_metrics"].apply(pd.Series)

df_metrics_series

Unnamed: 0,retweet_count,reply_count,like_count,quote_count,bookmark_count,impression_count
0,0,0,0,0,0,6
1,0,0,0,0,0,84
2,0,1,0,0,0,52


In [403]:
df_data_series = df_data_series.join(df_metrics_series)

In [405]:
df_data_series = df_data_series.drop(columns=["public_metrics"])

### Spread reference tweets

In [407]:
df_data_series["referenced_tweets"]

0    [{'type': 'quoted', 'id': '1859072685105705311...
1    [{'type': 'quoted', 'id': '1587346563805822983'}]
2                                                  NaN
Name: referenced_tweets, dtype: object

In [435]:
def set_referenced_tweet_type(row, type):
  if row is not np.nan:
    for item in row:
      if type == item["type"]:
        return item["id"]

df_data_series["referenced_tweets"].apply(
  lambda row: set_referenced_tweet_type(row, type="quoted")
)

0    1859072685105705311
1    1587346563805822983
2                   None
Name: referenced_tweets, dtype: object

In [436]:
df_data_series["reference_tweet_id_retweeted"] = df_data_series["referenced_tweets"].apply(
  lambda row: set_referenced_tweet_type(row, type="retweeted")
)

df_data_series["reference_tweet_id_quoted"] = df_data_series["referenced_tweets"].apply(
  lambda row: set_referenced_tweet_type(row, type="quoted")
)

df_data_series["reference_tweet_id_replied_to"] = df_data_series["referenced_tweets"].apply(
  lambda row: set_referenced_tweet_type(row, type="replied_to")
)

In [439]:
df_data_series = df_data_series.drop(columns=["referenced_tweets"])

### Add Geoseries data

In [448]:
df_data_series

Unnamed: 0,text,author_id,tweet_id,created_at,sensitive,place_id,language_name,language_code,retweet_count,reply_count,like_count,quote_count,bookmark_count,impression_count,reference_tweet_id_retweeted,reference_tweet_id_quoted,reference_tweet_id_replied_to
0,Kota Makassar (ᨀᨚᨈ ᨆᨀᨔᨑ) sebelumnya bernama Uj...,475862742,1896153702140088593,2025-03-02T11:00:46.000Z,False,ec6afdae43138dcd,Indonesian,id,0,0,0,0,0,6,,1.8590726851057052e+18,1.894706768649024e+18
1,Prambanan is a 9th-century Hindu temple compou...,475862742,1894707636278628411,2025-02-26T11:14:37.000Z,False,1d4c6ad315aabd84,English,en,0,0,0,0,0,84,,1.5873465638058227e+18,
2,Borobudur is a 9th-century Mahayana Buddhist t...,475862742,1894706768649023792,2025-02-26T11:11:10.000Z,False,36cb70a6b6dc142a,English,en,0,1,0,0,0,52,,,


In [445]:
df_places = df["includes_places"].explode("includes_places")

In [451]:
df_places_series = df_places.apply(pd.Series)

In [452]:
df_places_series

Unnamed: 0,place_type,id,name,full_name,country,geo,country_code
0,city,ec6afdae43138dcd,Makasar,"Makasar, Indonesia",Indonesia,"{'type': 'Feature', 'bbox': [106.882369, -6.30...",ID
1,city,1d4c6ad315aabd84,Prambanan,"Prambanan, Indonesia",Indonesia,"{'type': 'Feature', 'bbox': [110.464824, -7.83...",ID
2,city,36cb70a6b6dc142a,Borobudur,"Borobudur, Indonesia",Indonesia,"{'type': 'Feature', 'bbox': [110.150183, -7.65...",ID


### convert bbox into polygon

In [459]:
def convert_bbox_to_polygon(row):
  # return row["bbox"]
  return box(*row["bbox"])

df_places_series["geo"].apply(convert_bbox_to_polygon)

0    POLYGON ((106.935071 -6.303373, 106.935071 -6....
1    POLYGON ((110.544218 -7.831231, 110.544218 -7....
2    POLYGON ((110.250073 -7.653435, 110.250073 -7....
Name: geo, dtype: object

In [460]:
df_places_series["polygon"] = df_places_series["geo"].apply(convert_bbox_to_polygon)

### Remove place_id and geo

In [463]:
df_places_series = df_places_series.drop(columns=["id", "geo"])

### Add place prefix for geoseries

In [464]:
df_places_series

Unnamed: 0,place_type,name,full_name,country,country_code,polygon
0,city,Makasar,"Makasar, Indonesia",Indonesia,ID,"POLYGON ((106.935071 -6.303373, 106.935071 -6...."
1,city,Prambanan,"Prambanan, Indonesia",Indonesia,ID,"POLYGON ((110.544218 -7.831231, 110.544218 -7...."
2,city,Borobudur,"Borobudur, Indonesia",Indonesia,ID,"POLYGON ((110.250073 -7.653435, 110.250073 -7...."


In [467]:
df_places_series = df_places_series.rename(columns={"place_type": "type"})

In [470]:
df_places_series = df_places_series.add_prefix("place_")

### Merge Data and Place

In [471]:
df_data_series

Unnamed: 0,text,author_id,tweet_id,created_at,sensitive,place_id,language_name,language_code,retweet_count,reply_count,like_count,quote_count,bookmark_count,impression_count,reference_tweet_id_retweeted,reference_tweet_id_quoted,reference_tweet_id_replied_to
0,Kota Makassar (ᨀᨚᨈ ᨆᨀᨔᨑ) sebelumnya bernama Uj...,475862742,1896153702140088593,2025-03-02T11:00:46.000Z,False,ec6afdae43138dcd,Indonesian,id,0,0,0,0,0,6,,1.8590726851057052e+18,1.894706768649024e+18
1,Prambanan is a 9th-century Hindu temple compou...,475862742,1894707636278628411,2025-02-26T11:14:37.000Z,False,1d4c6ad315aabd84,English,en,0,0,0,0,0,84,,1.5873465638058227e+18,
2,Borobudur is a 9th-century Mahayana Buddhist t...,475862742,1894706768649023792,2025-02-26T11:11:10.000Z,False,36cb70a6b6dc142a,English,en,0,1,0,0,0,52,,,


In [472]:
df_places_series

Unnamed: 0,place_type,place_name,place_full_name,place_country,place_country_code,place_polygon
0,city,Makasar,"Makasar, Indonesia",Indonesia,ID,"POLYGON ((106.935071 -6.303373, 106.935071 -6...."
1,city,Prambanan,"Prambanan, Indonesia",Indonesia,ID,"POLYGON ((110.544218 -7.831231, 110.544218 -7...."
2,city,Borobudur,"Borobudur, Indonesia",Indonesia,ID,"POLYGON ((110.250073 -7.653435, 110.250073 -7...."


In [474]:
df_merge = df_data_series.join(df_places_series)

### Make the dataframe into geodataframe

In [479]:
df_merge["place_polygon"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 3 entries, 0 to 2
Series name: place_polygon
Non-Null Count  Dtype 
--------------  ----- 
3 non-null      object
dtypes: object(1)
memory usage: 156.0+ bytes


In [541]:
gdf = gpd.GeoDataFrame(df_merge, geometry="place_polygon", crs=4326)

In [542]:
gdf["place_polygon"].info()

<class 'geopandas.geoseries.GeoSeries'>
RangeIndex: 3 entries, 0 to 2
Series name: place_polygon
Non-Null Count  Dtype   
--------------  -----   
3 non-null      geometry
dtypes: geometry(1)
memory usage: 156.0 bytes


In [543]:
gdf.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

### Word cleaning

In [544]:
raw_words = gdf["text"].apply(lambda row: row.split())

In [545]:
raw_words.iloc[0]

['Kota',
 'Makassar',
 '(ᨀᨚᨈ',
 'ᨆᨀᨔᨑ)',
 'sebelumnya',
 'bernama',
 'Ujung',
 'Pandang,',
 'adalah',
 'ibu',
 'kota',
 'provinsi',
 'Sulawesi',
 'Selatan,',
 'Indonesia.',
 '#FaizGeotag',
 'https://t.co/i9D72lPthV']

In [546]:
def cleaning_words(row):
  trailing_last = [word.rstrip(',.()') for word in row]
  trailing_first = [word.lstrip(',.()') for word in trailing_last]
  return list(filter(None, trailing_first))

raw_words.apply(cleaning_words).iloc[0]

['Kota',
 'Makassar',
 'ᨀᨚᨈ',
 'ᨆᨀᨔᨑ',
 'sebelumnya',
 'bernama',
 'Ujung',
 'Pandang',
 'adalah',
 'ibu',
 'kota',
 'provinsi',
 'Sulawesi',
 'Selatan',
 'Indonesia',
 '#FaizGeotag',
 'https://t.co/i9D72lPthV']

In [547]:
cleaned_words = raw_words.apply(cleaning_words)

In [549]:
gdf["words"] = cleaned_words

In [555]:
gdf["word_count"] = cleaned_words.str.len()

In [561]:
gdf = gdf.drop(columns=["text"])

In [562]:
gdf

Unnamed: 0,author_id,tweet_id,created_at,sensitive,place_id,language_name,language_code,retweet_count,reply_count,like_count,...,reference_tweet_id_quoted,reference_tweet_id_replied_to,place_type,place_name,place_full_name,place_country,place_country_code,place_polygon,words,word_count
0,475862742,1896153702140088593,2025-03-02T11:00:46.000Z,False,ec6afdae43138dcd,Indonesian,id,0,0,0,...,1.8590726851057052e+18,1.894706768649024e+18,city,Makasar,"Makasar, Indonesia",Indonesia,ID,"POLYGON ((106.93507 -6.30337, 106.93507 -6.241...","[Kota, Makassar, ᨀᨚᨈ, ᨆᨀᨔᨑ, sebelumnya, bernam...",17
1,475862742,1894707636278628411,2025-02-26T11:14:37.000Z,False,1d4c6ad315aabd84,English,en,0,0,0,...,1.5873465638058227e+18,,city,Prambanan,"Prambanan, Indonesia",Indonesia,ID,"POLYGON ((110.54422 -7.83123, 110.54422 -7.711...","[Prambanan, is, a, 9th-century, Hindu, temple,...",38
2,475862742,1894706768649023792,2025-02-26T11:11:10.000Z,False,36cb70a6b6dc142a,English,en,0,1,0,...,,,city,Borobudur,"Borobudur, Indonesia",Indonesia,ID,"POLYGON ((110.25007 -7.65344, 110.25007 -7.572...","[Borobudur, is, a, 9th-century, Mahayana, Budd...",26


In [563]:
gdf = gdf.explode("words")

In [564]:
gdf.to_file("output/word_counts_geotags.geojson")

## Merge all word counts dataset

In [565]:
df1 = pd.read_json("output/word_counts_1890992087614808342.json")

In [566]:
df2 = pd.read_json("output/word_counts_irwndfrry.json")

In [567]:
df3 = pd.read_json("output/word_counts_tempodotco.json")

In [568]:
gdf1 = gpd.read_file("output/word_counts_geotags.geojson")

### Concatenate dataframe

In [569]:
merge_df = pd.concat([df1, df2, df3, gdf1], ignore_index=True)

### Convert it into Geodataframe

In [570]:
merge_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173 entries, 0 to 172
Data columns (total 29 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   tweet_id                       173 non-null    object  
 1   author_id                      152 non-null    object  
 2   sensitive                      152 non-null    object  
 3   word_count                     173 non-null    int64   
 4   words                          173 non-null    object  
 5   language_name                  152 non-null    object  
 6   language_code                  152 non-null    object  
 7   created_at                     173 non-null    object  
 8   retweet_count                  152 non-null    float64 
 9   reply_count                    152 non-null    float64 
 10  like_count                     152 non-null    float64 
 11  quote_count                    152 non-null    float64 
 12  bookmark_count                 152 n