# CS345 Final Project

### Project Imports

In [8]:
import os
from pathlib import Path
import pandas as pd
import json

### Load data

In [9]:
def load_news_dataset(dataset_name: str):
    base_path = os.getcwd() + '/data'
    dataset_path = Path(base_path) / dataset_name
    if not dataset_path.exists():
        raise FileNotFoundError(f"Dataset path not found: {dataset_path}")

    # load corpus.tsv
    corpus_path = dataset_path / "corpus.tsv"
    df = pd.read_csv(corpus_path, sep="\t")
    
    # load splits
    splits_path = dataset_path / "splits.json"
    splits = {}
    if splits_path.exists():
        with open(splits_path, "r") as f:
            splits = json.load(f)

    # load feature JSONs from features directory
    features_dir = dataset_path / "features"
    feature_data = {}
    if features_dir.exists():
        for json_file in features_dir.glob("*.json"):
            feature_name = json_file.stem
            with open(json_file, "r") as f:
                feature_data[feature_name] = json.load(f)
    
    return df, splits, feature_data

### ACL Dataset

In [12]:
acl_df, acl_splits, acl_features = load_news_dataset("acl2020")

acl_split_data = acl_splits.get('0', {})
acl_train_split_size = len(acl_split_data.get('train', []))
acl_dev_split_size = len(acl_split_data.get('dev', []))
acl_test_split_size = len(acl_split_data.get('test', []))

print("ACL2020 Dataset:")
print(f"Articles: {len(acl_df)}")
print(f"Columns: {list(acl_df.columns)}")
print(f"Available features: {list(acl_features.keys())}")
print(f"Split sizes: train={acl_train_split_size} dev={acl_dev_split_size} test={acl_test_split_size}")
print(acl_df.shape)

ACL2020 Dataset:
Articles: 859
Columns: ['source_url', 'source_url_normalized', 'ref', 'fact', 'bias']
Available features: ['youtube_subs', 'has_twitter', 'has_wikipedia', 'twitter_profile', 'youtube_numerical', 'youtube_fulltext', 'has_youtube', 'twitter_followers', 'has_facebook', 'articles_body_bert', 'youtube_nela', 'youtube_opensmile', 'articles_title_bert', 'wikipedia_content']
Split sizes: train=687 dev=0 test=172
(859, 5)


### EMNLP Dataset

In [11]:
emnlp_df, emnlp_splits, emnlp_features = load_news_dataset("emnlp18")

emnlp_split_data = emnlp_splits.get('0', {})
emnlp_train_split_size = len(emnlp_split_data.get('train', []))
emnlp_dev_split_size = len(emnlp_split_data.get('dev', []))
emnlp_test_split_size = len(emnlp_split_data.get('test', []))

print("EMNLP Dataset:")
print(f"Articles: {len(emnlp_df)}")
print(f"Columns: {list(emnlp_df.columns)}")
print(f"Available features: {list(emnlp_features.keys())}")
print(f"Split sizes: train={emnlp_train_split_size} dev={emnlp_dev_split_size} test={emnlp_test_split_size}")
print(emnlp_df.shape)

EMNLP Dataset:
Articles: 1066
Columns: ['source_url', 'source_url_normalized', 'ref', 'fact', 'bias']
Available features: ['twitter_urlmatch', 'has_twitter', 'wikipedia_toc', 'has_wikipedia', 'twitter_description', 'twitter_created_at', 'twitter_haslocation', 'articles_body_glove', 'twitter_engagement', 'twitter_verified', 'url_structure', 'articles_title_glove', 'alexa', 'wikipedia_categories', 'wikipedia_summary', 'wikipedia_content']
Split sizes: train=851 dev=0 test=215
(1066, 5)
