In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
SNA_PROJECT_PATH = "drive/MyDrive/SNA_Project"

#### Installations:

In [None]:
!pip install ijson -q

In [None]:
!pip install fastparquet -q

#### Imports:

In [None]:
import pandas as pd
import polars as pl
import numpy as np

In [None]:
import os
import ijson
from typing import List, Any, Dict
from tqdm.notebook import tqdm
import fastparquet

# 🔎 Exploring the Dataset

In [None]:
!ls $SNA_PROJECT_PATH/TwiBot-22

In [None]:
def explore(filepath, type, max=None):
  """Load a CSV or JSON dataset into a DataFrame, print its shape and return it.

  Args:
    filepath: path of the file to load.
    type: file format, either 'csv' or 'json'.
    max: optional maximum number of rows to keep; None keeps every row.

  Returns:
    The loaded pandas DataFrame (limited to the first `max` rows when given).

  Raises:
    ValueError: if `type` is neither 'csv' nor 'json'.
  """
  # NOTE: `type` and `max` shadow builtins; the names are kept so existing
  # positional/keyword calls keep working.
  if type == 'csv':
    dataset = pd.read_csv(filepath, nrows=max)
  elif type == 'json':
    # read_json only accepts `nrows` together with lines=True, and this
    # function reads regular (non line-delimited) JSON, so the row limit is
    # applied after loading instead.
    dataset = pd.read_json(filepath)
    if max is not None:
      dataset = dataset.head(max)
  else:
    raise ValueError(f"Unsupported dataset type {type!r}: expected 'csv' or 'json'")
  print(f"Dataset shape is {dataset.shape}")
  return dataset

## Users dataset

In [None]:
explore(f"{SNA_PROJECT_PATH}/TwiBot-22/user.json", 'json')

## Labels dataset

In [None]:
explore(f"{SNA_PROJECT_PATH}/TwiBot-22/label.csv", 'csv')

## List dataset

In [None]:
# Load the list dataset and rank it by follower count, breaking ties by
# member count, both in descending order.
lists = explore(f"{SNA_PROJECT_PATH}/TwiBot-22/list.json", 'json')
lists = lists.sort_values(by=['follower_count', 'member_count'], ascending=False, axis=0)
# Display the 100 top-ranked lists.
lists.head(100)

## Hashtag dataset

In [None]:
explore(f"{SNA_PROJECT_PATH}/TwiBot-22/hashtag.json", 'json')

## Split dataset

In [None]:
explore(f"{SNA_PROJECT_PATH}/TwiBot-22/split.csv", 'csv')

## Edge Dataset


In [None]:
# Read only a window of edge.csv: skip the first 94,328,880 lines, read at most
# 66,000,633 rows and rename the columns to source/relation/target.
# NOTE(review): the window presumably corresponds to the 'discuss' range tallied
# in the "'discuss' rows count" notes below (66.000.633 entries) — confirm.
# NOTE(review): check how polars treats the first line after skip_rows
# (consumed as header vs. data row) so the window is not off by one.
edges = pl.read_csv(f"{SNA_PROJECT_PATH}/TwiBot-22/edge.csv", new_columns=['source', 'relation', 'target'], n_rows=66000633, skip_rows=94328880)
# Keep only the rows whose relation is "discuss".
edges_hash = edges.filter(pl.col("relation")=="discuss")
edges_hash.head(10000000)

In [None]:
# Group the "discuss" edges by target and concatenate the source ids of each
# group into one comma-separated string.
# NOTE(review): presumably `target` is a hashtag id and `source` a tweet id — confirm.
grouped_eh = edges_hash.group_by("target").agg(pl.col("source").str.join(","))
grouped_eh.head(10)

### 'discuss' rows count:


*   0 to 10000000:
*   10000000 to 20000000:
*   20000000 to 30000000:
*   30000000 to 40000000:
*   40000000 to 50000000:
*   50000000 to 60000000:
*   60000000 to 70000000:
*   70000000 to 80000000:
*   80000000 to 90000000:
*   90000000 to 100000000:  5.671.120
*   100000000 to 110000000: 10.000.000
*   110000000 to 120000000: 10.000.000
*   120000000 to 130000000: 10.000.000
*   130000000 to 140000000: 10.000.000
*   140000000 to 150000000: 10.000.000
*   150000000 to 160000000: 10.000.000
*   160000000 to 170000000: 329.513
*   170000000 to end:

There are a total of 66.000.633 "discuss" entries in the edge dataset, located between indices 90.000.000 and 170.000.000.

## Twitter dataset (split 0)

The tweet_i datasets seem to consist of one giant line, without '\n' characters. The structure is the following: <br>
[{json_Object_1} , ..., {json_Object_n}]

In [None]:
def read_n_instances(filename, n):
  """Return up to the first `n` raw JSON objects of a one-line JSON array file.

  The file is scanned character by character, so only the prefix needed for
  `n` objects is read. Objects are split on the literal separator
  ', {"attachments":', i.e. the function relies on every object starting with
  the "attachments" key and on objects being separated by a comma and a space.

  Args:
    filename: path of the JSON file shaped like [{...}, {...}, ...].
    n: maximum number of objects to return.

  Returns:
    A list of at most `n` strings, each holding the text of one JSON object.
    Fewer than `n` are returned when the file contains fewer objects.
  """
  separator = ', {"attachments":'
  start = '{"attachments":'
  instances = []
  with open(filename, "r", encoding="utf-8") as file:
    file.read(1)  # skip the opening '[' of the array
    instance = ''
    while len(instances) < n:
      char = file.read(1)
      if not char:
        # End of file: what is buffered is the last object followed by the
        # closing ']'. Without this branch the loop would never terminate.
        last = instance.rstrip()
        if last.endswith(']'):
          last = last[:-1].rstrip()
        if last and last != start:
          instances.append(last)
        break
      instance += char
      # The start offset (18) is kept from the original implementation: the
      # separator is only recognised from index 18 of the buffer onwards.
      if instance.endswith(separator, 18):
        instances.append(instance[:-len(separator)])
        # The separator already consumed the opening of the next object.
        instance = start
  return instances


In [None]:
# Sample the first 100 raw tweet objects of split 0 and print each one as text.
res = read_n_instances(f"{SNA_PROJECT_PATH}/TwiBot-22/tweet_0.json", 100)
for i in res:
  print(i)

In [None]:
import json

# Wrap the sampled raw JSON strings in a single-column frame.
df_inter = pd.DataFrame(res)
df_inter.columns = ['json_element']

# Parse every string once and flatten nested objects into dotted columns
# (e.g. 'entities.hashtags').
df_final = pd.json_normalize(df_inter['json_element'].apply(json.loads))
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Rank tweets by their number of hashtags, most first. Rows whose hashtag cell
# is not a list (e.g. NaN when the tweet carries no hashtag list) count as zero
# instead of making len() fail.
df_final = df_final.sort_values(
    by='entities.hashtags',
    ascending=False,
    axis=0,
    key=lambda col: col.map(lambda tags: len(tags) if isinstance(tags, (list, tuple)) else 0),
)
df_final.head(10)

# ✂︎ Splitting into chunks
https://github.com/LuoUndergradXJTU/TwiBot-22/issues/17

## Parsing Tweets

In [None]:
class TweetsParser:
  """
  This class parses large tweet JSON files, extracts relevant information,
  and saves them into smaller Parquet files for efficient processing.

  Each saved mini chunk keeps the tweet columns that are not explicitly
  dropped plus a 'hashtags' column extracted from the tweet's 'entities'.
  Mini chunks are numbered starting at chunks_per_batch * batch, so that
  different input files (batches) write to disjoint index ranges.
  """
  def __init__(self, tweets_path: str, batch: int=0, chunk_size: int=1000000, chunks_per_batch: int=10) -> None:
    # tweets_path: path of the large tweet chunk
    # chunk_size: size of the mini chunks
    # batch: offset of the chunk indices
    # chunks_per_batch: how many chunk indices are reserved for each batch
    self.tweets_path = tweets_path
    self.chunk_size = chunk_size
    self.batch = batch
    self.chunks_per_batch = chunks_per_batch

  def change_tweets_path(self, new_tweets_path):
    """Point the parser at a different input file."""
    self.tweets_path = new_tweets_path

  def _extract_hashtags(self, entity: Dict) -> List:
    """Return the 'hashtags' list of an entities value, or [] when unavailable.

    Anything that is not a dict (None, NaN for records without 'entities', ...)
    and dicts whose 'hashtags' is missing or null all yield an empty list.
    """
    if not isinstance(entity, dict):
      return []
    return entity.get('hashtags') or []

  def _save_mini_chunk(self, records: List[Any], chunk_number: int, output_dir: str=None):
    """Write one batch of tweet records to tweet_chunk_<chunk_number>.parquet.

    Args:
      records: parsed tweet objects (dicts) to store.
      chunk_number: index used in the output file name.
      output_dir: destination directory; defaults to
        f"{SNA_PROJECT_PATH}/tweet_chunks", resolved when the method is called.
    """
    if output_dir is None:
      output_dir = f"{SNA_PROJECT_PATH}/tweet_chunks"
    os.makedirs(output_dir, exist_ok=True)

    df = pd.DataFrame(records)

    if 'entities' in df.columns:
      df['hashtags'] = df['entities'].apply(self._extract_hashtags)
    else:
      # No record carried an 'entities' field: every tweet has no hashtags.
      df['hashtags'] = [[] for _ in range(len(df))]
    cols_to_drop = ['attachments', 'context_annotations', 'conversation_id', 'created_at', 'geo', 'id', 'lang', 'possibly_sensitive', 'referenced_tweets', 'reply_settings', 'source', 'text', 'withheld', 'entities', 'public_metrics']
    df = df.drop(columns=[col for col in cols_to_drop if col in df.columns])

    output_path = os.path.join(output_dir, f"tweet_chunk_{chunk_number}.parquet")
    df.to_parquet(output_path, compression='snappy', index=False)

    print(f"Saved chunk {chunk_number} with {len(records)} records to {output_path}")

  def parse(self):
    """Stream the tweet file and save it as mini chunks of `chunk_size` tweets."""
    import warnings

    first_chunk = self.chunks_per_batch * self.batch
    chunk_count = first_chunk
    records = []
    # ijson parses bytes, so the file is opened in binary mode.
    with open(self.tweets_path, 'rb') as f:
      data = ijson.items(f, 'item')
      for item in tqdm(data, desc="Parsing tweets", unit=" tweets"):
        records.append(item)
        if len(records) >= self.chunk_size:
          self._save_mini_chunk(records, chunk_count)
          chunk_count += 1
          records = []
      # check for remaining tweets
      if records:
        self._save_mini_chunk(records, chunk_count)
        chunk_count += 1

    written = chunk_count - first_chunk
    if written > self.chunks_per_batch:
      # Indices beyond the reserved range belong to the next batch, whose
      # files would be (or have been) overwritten.
      warnings.warn(
        f"{written} chunks were written for batch {self.batch}, more than the "
        f"{self.chunks_per_batch} indices reserved per batch: chunk files overlap "
        f"with the next batch"
      )

In [None]:
# Split tweet files 4..8 into parquet mini chunks. The file index is passed as
# `batch`, so TweetsParser.parse starts numbering the chunks of file i at 10*i.
# NOTE(review): files 0..3 are not processed by this loop — presumably they were
# handled in an earlier run; confirm before relying on chunks 0..39.
for i in range(4,9):
  parser = TweetsParser(f"{SNA_PROJECT_PATH}/TwiBot-22/tweet_{i}.json", i)
  parser.parse()

### Tweet chunks' dimension check

In [None]:
# Sanity check: load tweet chunks 0..88, print each chunk's shape and sum the
# row counts into the total number of tweets.
# NOTE(review): the chunk count (89) is hard-coded; every index in the range is
# expected to exist on disk — confirm it matches the files actually written.
tweets_count = 0
for i in range(89):
  tweet_chunk_df = pd.read_parquet(f"{SNA_PROJECT_PATH}/tweet_chunks/tweet_chunk_{i}.parquet")
  tweet_chunk_shape = tweet_chunk_df.shape
  print(f"Shape of file tweet_chunk_{i}: {tweet_chunk_shape}")
  tweets_count += tweet_chunk_shape[0]
print(f"# Tweet: {tweets_count}")

### Tweet first chunk visualization

In [None]:
tweet_chunk_0_df = pd.read_parquet(f"{SNA_PROJECT_PATH}/tweet_chunks/tweet_chunk_0.parquet")

In [None]:
pd.set_option('display.max_colwidth', None)
tweet_chunk_0_df

## Parsing Edges

In [None]:
class EdgeParser():
    """
    This class parses a large edge CSV file, filters edges based on specified relations,
    and saves the filtered edges into smaller Parquet files for efficient processing.

    The CSV is expected to expose the columns 'source_id', 'relation' and
    'target_id'. Output files are named edge_chunk_<n>.parquet, numbered from 0.
    """
    def __init__(self, edges_path: str, relations, output_dir, chunk_size: int=500000):
      # edges_path: path of the edge CSV file
      # relations: collection of relation names to keep
      # output_dir: directory receiving the parquet chunks
      # chunk_size: rows read per CSV chunk, and minimum rows per saved chunk
      self.edges_path = edges_path
      self.relations = relations
      self.output_dir = output_dir
      self.chunk_size = chunk_size

    def __save_edges__(self, edges, chunk_count):
      """Concatenate the given frames and write them as edge_chunk_<chunk_count>.parquet."""
      os.makedirs(self.output_dir, exist_ok=True)
      df = pd.concat(edges, ignore_index=True)
      output_path = os.path.join(self.output_dir, f"edge_chunk_{chunk_count}.parquet")
      df.to_parquet(output_path, compression='snappy', index=False)
      print(f"Saved chunk {chunk_count} with {len(df)} records to {output_path}")

    def parse(self):
      """Stream the CSV, keep the rows of the wanted relations and save them in chunks.

      Filtered rows are buffered until at least `chunk_size` of them are
      available, then flushed to one parquet file; a last, smaller file is
      written for whatever remains at the end of the CSV.
      """
      pending = []       # filtered frames not yet written
      pending_rows = 0   # running row count of `pending`
      chunk_count = 0
      for chunk in pd.read_csv(self.edges_path, usecols=['source_id', 'relation', 'target_id'], chunksize=self.chunk_size):
        filtered_chunk = chunk[chunk["relation"].isin(self.relations)]
        if filtered_chunk.empty:
          # Nothing to keep: buffering empty frames would only grow the list
          # and could lead to an empty final parquet file.
          continue
        pending.append(filtered_chunk)
        pending_rows += len(filtered_chunk)
        if pending_rows >= self.chunk_size:
          self.__save_edges__(pending, chunk_count)
          chunk_count += 1
          pending = []
          pending_rows = 0
      if pending_rows > 0:
        self.__save_edges__(pending, chunk_count)

In [None]:
# Extract only the "followers" and "following" relations from edge.csv and
# store them as parquet chunks under edge_chunks/.
edge_parser = EdgeParser(f"{SNA_PROJECT_PATH}/TwiBot-22/edge.csv", set(["followers", "following"]), f"{SNA_PROJECT_PATH}/edge_chunks")
edge_parser.parse()

### Edge chunks' dimension check

In [None]:
# Sanity check: load edge chunks 0..7, print each chunk's shape and sum the
# row counts into the total number of kept edges.
# NOTE(review): the chunk count (8) is hard-coded — confirm it matches the
# number of files written by EdgeParser.parse.
edges_count = 0
for i in range(8):
  edge_chunk_df = pd.read_parquet(f"{SNA_PROJECT_PATH}/edge_chunks/edge_chunk_{i}.parquet")
  edge_chunk_shape = edge_chunk_df.shape
  print(f"Shape of file edge_chunk_{i}: {edge_chunk_shape}")
  edges_count += edge_chunk_shape[0]
print(f"# Edge (followers & following): {edges_count}")

### Edge last chunk visualization

In [None]:
edge_chunk_7_df = pd.read_parquet(f"{SNA_PROJECT_PATH}/edge_chunks/edge_chunk_7.parquet")

In [None]:
edge_chunk_7_df