In [65]:
import collections
import spacy
import numpy as np
import pandas as pd
import re

from argparse import Namespace

In [66]:
# Load the English spaCy pipeline by its full package name. The bare 'en'
# shortcut link is obsolete as of spaCy v3 and raises E941 there; the full
# name works whenever the `en_core_web_sm` package is installed.
# NOTE(review): assumes 'en' was linked to en_core_web_sm (the default target
# of `spacy download en`) -- confirm if a larger model was linked instead.
nlp = spacy.load('en_core_web_sm')

# Notebook-wide settings, gathered in one place so later cells read them
# via attribute access (args.seed, args.train_proportion, ...).
_settings = {
    # Input / output locations
    "raw_dataset_csv": "data/articles_new.csv",
    # Fractions used to size the train and validation splits
    "train_proportion": 0.7,
    "val_proportion": 0.15,
    "test_proportion": 0.15,
    "output_munged_csv": "data/articles_with_splits.csv",
    # Seed for numpy's global RNG (makes the shuffle reproducible)
    "seed": 1337,
}
args = Namespace(**_settings)

In [72]:
# Load the raw articles table; the first row of the file holds the column names.
# Per the original note: 'fem' flags articles with a female index of > 120.
articles = pd.read_csv(
    filepath_or_buffer=args.raw_dataset_csv,
    header=0,
)

In [73]:
# Strip leftover XHTML markup from the raw text and store the result in a new
# 'article_content' column (the 'content' column itself is left untouched).
#
# Each replacement is applied on top of the previous result. Previously every
# statement restarted from articles['content'], so only the final "." -> " "
# substitution actually reached 'article_content'.
#
# regex=False makes every pattern a literal string, so "." means a literal
# period rather than the regex any-character wildcard, regardless of the
# pandas version's default for `regex`.
_literal_replacements = (
    ('<div xmlns="http://www.w3.org/1999/xhtml">', ''),
    ("<p>", ""),
    ("</div>", ""),
    # NOTE(review): "&amp" has no trailing ';', so a full "&amp;" entity leaves
    # a stray ';' behind -- kept as written; confirm the intended pattern.
    ("&amp", ""),
    (".", " "),
)

_cleaned_content = articles['content']
for _pattern, _replacement in _literal_replacements:
    _cleaned_content = _cleaned_content.str.replace(
        _pattern, _replacement, regex=False)

articles['article_content'] = _cleaned_content

In [77]:
# Count the distinct values in the cleaned-text column
# (shown as the cell's output).
unique_contents = set(articles['article_content'])
len(unique_contents)


2730

In [78]:
# Group the article rows by their 'fem' value so that each group can be
# split into train/val/test separately.
# Maps: fem value -> list of row dicts (one dict per article).
by_gender = collections.defaultdict(list)
for _row_label, article_row in articles.iterrows():
    record = article_row.to_dict()
    by_gender[article_row['fem']].append(record)

In [79]:
# Assign every article to a train/val/test split, separately within each
# 'fem' group so that each group is divided in the same proportions.
final_list = []
np.random.seed(args.seed)  # seed the global RNG so the shuffles are reproducible

# Iterate groups in sorted key order so the RNG is consumed in a fixed order.
for _, item_list in sorted(by_gender.items()):
    np.random.shuffle(item_list)  # in-place shuffle of this group's row dicts

    n = len(item_list)
    n_train = int(args.train_proportion * n)
    n_val = int(args.val_proportion * n)
    # The test split is whatever remains after train and val. Because int()
    # truncates, taking the remainder (rather than a separately computed
    # test count) guarantees no item is left without a split.

    # Give each data point a split attribute
    for item in item_list[:n_train]:
        item['split'] = 'train'
    for item in item_list[n_train:n_train + n_val]:
        item['split'] = 'val'
    for item in item_list[n_train + n_val:]:
        item['split'] = 'test'

    # Add to final list
    final_list.extend(item_list)

In [80]:
# Turn the list of labelled row dicts back into a table (one row per article).
final_articles = pd.DataFrame(data=final_list)

In [81]:
# Persist the table, including the new 'split' column, without the row index.
final_articles.to_csv(
    path_or_buf=args.output_munged_csv,
    index=False,
)