# Cross-reference Tweet IDs across Datasets



### Imports

In [2]:
import numpy as np
import pandas as pd
import os.path as osp
import os
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Any


### Helper Functions

In [7]:
def find_root_dir() -> Path:
    """
    Locate the project root: the nearest directory containing pyproject.toml

    The search begins at the current working directory and then moves upward
    through each of its ancestors, ending once the filesystem root has been
    checked.

    Returns:
        Path: The closest directory to the working directory (inclusive)
            that contains a pyproject.toml

    Raises:
        FileNotFoundError: If neither the working directory nor any of its
            ancestors contains a pyproject.toml
    """
    start: Path = Path.cwd()
    # Candidates are ordered nearest-first: the working directory itself,
    # followed by every ancestor up to the filesystem root.
    for candidate in (start, *start.parents):
        if (candidate / "pyproject.toml").exists():
            return candidate
    raise FileNotFoundError("Could not find pyproject.toml in any parent directory")



### Constants

In [None]:
# Project directories
ROOT_DIR = find_root_dir()                   # Nearest ancestor holding pyproject.toml
WORK_DIR = os.getcwd()                       # Working directory (a plain str, not a Path)
NOTE_DIR = ROOT_DIR.joinpath("notebooks")    # Notebooks live under the project root
DATA_DIR = ROOT_DIR.joinpath("data")         # Data lives under the project root

# Dataset directories, one per source, all located under DATA_DIR
KAGGLE = DATA_DIR.joinpath("COVID19_kaggle")
MENDLEY = DATA_DIR.joinpath("COVID19_mendley")
OPENICPSR = DATA_DIR.joinpath("COVID19_openicpsr")

# Debugging aid: uncomment to print the resolved locations
# print(f"ROOT_DIR: {ROOT_DIR}")
# print(f"WORK_DIR: {WORK_DIR}")
# print(f"NOTE_DIR: {NOTE_DIR}")
# print(f"DATA_DIR: {DATA_DIR}")
# print()
# print(f"KAGGLE: {KAGGLE}")
# print(f"MENDLEY: {MENDLEY}")
# print(f"OPENICPSR: {OPENICPSR}")


### Dataset Files


In [None]:
# Dataset files: for each source, the CSV file names and their full paths
# (each path is the source's directory joined with the file name).

## Kaggle
kaggle_filenames: List[str] = [
    "Covid-19 Twitter Dataset (Apr-Jun 2020).csv",
    "Covid-19 Twitter Dataset (Apr-Jun 2021).csv",
    "Covid-19 Twitter Dataset (Aug-Sep 2020).csv",
]
kaggle_paths: List[Path] = [KAGGLE.joinpath(name) for name in kaggle_filenames]

## Mendley
mendley_filenames: List[str] = [
    "process_data_neg_0_14.csv",
    "process_data_pos_0_14.csv",
    "process_data_neg_1_13.csv",
    "process_data_pos_1_13.csv",
    "process_data_neg_2_12.csv",
    "process_data_pos_2_12.csv",
]
mendley_paths: List[Path] = [MENDLEY.joinpath(name) for name in mendley_filenames]

## OpenICPSR
openicpsr_filenames: List[str] = [
    "COVID19_twitter_full_dataset.csv",
    "tweetid_userid_keyword_sentiments_emotions.csv",
    "tweetid_userid_keyword_sentiments_emotions_United_States.csv",
    "vaccine_tweetid_userid_keyword_sentiments_emotions.csv",
]
openicpsr_paths: List[Path] = [OPENICPSR.joinpath(name) for name in openicpsr_filenames]

## Every source combined, in the order Kaggle -> Mendley -> OpenICPSR
filenames: List[str] = [*kaggle_filenames, *mendley_filenames, *openicpsr_filenames]
paths: List[Path] = [*kaggle_paths, *mendley_paths, *openicpsr_paths]




# Join the three per-source dataset collections with `+` and display the result.
# The joined value is wrapped in an extra pair of brackets, so `dsets` is a
# one-element list whose single item is the combined collection.
# NOTE(review): kaggle_dsets, mendley_dsets and openicpsr_dsets are not defined
# in the cells visible here (those define kaggle_paths, mendley_paths and
# openicpsr_paths) -- confirm where they come from, and whether the extra
# nesting is intended or the flat `paths` list was meant instead.
dsets = [kaggle_dsets + mendley_dsets + openicpsr_dsets]
display(dsets)  # presumably the IPython/Jupyter rich-display helper -- confirm


[['data/COVID19_kaggle/Covid-19 Twitter Dataset (Apr-Jun 2020).csv',
  'data/COVID19_kaggle/Covid-19 Twitter Dataset (Apr-Jun 2021).csv',
  'data/COVID19_kaggle/Covid-19 Twitter Dataset (Aug-Sep 2020).csv',
  'data/mendley/process_data_0/process_data_neg_0_14.csv',
  'data/mendley/process_data_0/process_data_pos_0_14.csv',
  'data/mendley/process_data_1/process_data_neg_1_13.csv',
  'data/mendley/process_data_1/process_data_pos_1_13.csv',
  'data/mendley/process_data_2/process_data_neg_2_12.csv',
  'data/mendley/process_data_2/process_data_pos_2_12.csv',
  'data/openicpsr/full_dataset/COVID19_twitter_full_dataset.csv',
  'data/openicpsr/COVID19_openicpsr/ids_keywords_sentiments_emotions_covid/tweetid_userid_keyword_sentiments_emotions.csv',
  'data/openicpsr/COVID19_openicpsr/ids_keywords_sentiments_emotions_covid_united_states/tweetid_userid_keyword_sentiments_emotions_United_States.csv',
  'data/openicpsr/COVID19_openicpsr/ids_keywords_sentiments_emotions_covid_vaccine/vaccine_tweeti