# Mounting the Google Drive

First, mount Google Drive.

In [None]:
# Colab-only helper used to attach Google Drive to this runtime.
from google.colab import drive

# Mount Drive at /content/drive; every input and output path used in the cells below lives under this mount point.
drive.mount('/content/drive')

# Build dataset



We extract per-outlet datasets for several mainstream media outlets, such as The New York Times and Fox News, from the full dataset provided, and split each of them into categories. The following code generates these datasets.

In [None]:
# We split the dataset of one media outlet into seven topics.
# `categories` maps each topic to the URL path keywords that identify it.
# We reuse the categorization done by the original websites by inspecting the URL patterns of the different outlets:
# if the URL of a quotation contains one of a topic's keywords, we assume the quotation belongs to that topic.
# Order matters: topics are tried from top to bottom and the first match wins, so the
# catch-all 'uncategorized' entry ('/' occurs in every non-empty path) must stay last.

categories = {
  'politics': ['/politics/', '/us/', '/world/', '/business/', '/opinion/', '/economy/', '/finance/', '/market/'],
  'technology': ['/tech/', '/science/'],
  'health': ['/health/'],
  'sports': ['/sports/'],
  # Note the comma between '/books/' and '/style/': without it Python silently
  # concatenates the two literals into the never-matching keyword '/books//style/'.
  'arts': ['/art/', '/arts/', '/movie/', '/fashion/', '/book/', '/books/', '/style/', '/music/', '/entertainment/'],
  'lifestyle': ['/food/', '/travel/', '/lifestyle/', '/auto/'],
  'uncategorized': ['/'],
}

In [None]:
# Import the packages we need.

import bz2
import json
import os
import re
import string
from urllib.parse import urlparse

import numpy as np
import pandas as pd

# Prepare the necessary path data and file handlers.

# Years covered by the source dataset; every list below is aligned with this one by index.
years = [
  '2015', '2016', '2017',
  '2018', '2019', '2020',
]

# Per-year input file, per-year Fox News output file, and per-year gender-related output file.
path_to_file_list = ['/content/drive/MyDrive/Quotebank/quotes-' + y + '.json.bz2' for y in years]
path_to_out_list = ['/content/drive/Shareddrives/ADA/foxnews/'+y+'/quotes-' + y + '-fox-all.json.bz2' for y in years]
gender_file_list = ['/content/drive/MyDrive/quotes-' + y + '-fox-gender.json.bz2' for y in years]

# maps[k] is a dict {category -> bz2 file handle opened for binary writing} for years[k].
# The handles are opened here (which creates/truncates the files) and are written to by a later cell.
maps = [
  {
    cat: bz2.open('/content/drive/MyDrive/quotes-' + y + '-fox-' + cat + '.json.bz2', 'wb')
    for cat in categories
  }
  for y in years
]


In [None]:
# Select the quotations according to the rules described above and write them to new files.

def _match_category(urls, domain='foxnews'):
  """Return the category of the first URL on `domain` whose path matches, else None.

  URLs are examined in order. A URL is considered only if `domain` occurs in its
  network location. For such a URL the categories are tried in the order they are
  defined in `categories`, and the first category with a keyword contained in the
  URL path is returned. A URL on `domain` whose path matches no keyword (e.g. an
  empty path) is skipped and the next URL is examined.
  """
  for url in urls:
    parsed = urlparse(url)
    if domain not in parsed.netloc:
      continue
    for cat, keywords in categories.items():
      if any(keyword in parsed.path for keyword in keywords):
        return cat
  return None


for i, year in enumerate(years):
  path_to_file = path_to_file_list[i]
  path_to_out = path_to_out_list[i]

  # Pass 1: keep only the quotations that have a matching Fox News URL and tag them with their category.
  with bz2.open(path_to_file, 'rb') as s_file, bz2.open(path_to_out, 'wb') as d_file:
    for line in s_file:
      instance = json.loads(line) # loading a sample
      category = _match_category(instance['urls'])
      if category is None:
        continue

      instance['category'] = category
      d_file.write((json.dumps(instance)+'\n').encode('utf-8'))

  # Pass 2: distribute the tagged quotations over the per-category files of this year.
  try:
    with bz2.open(path_to_out, 'rb') as s_file:
      for line in s_file:
        instance = json.loads(line)
        maps[i][instance['category']].write((json.dumps(instance)+'\n').encode('utf-8'))
  finally:
    # A bz2 stream is only finalized when the file is closed; leaving these handles open
    # can leave truncated archives on disk. After this point the handles of this year are
    # closed, so the cell that creates `maps` must be re-run before running this cell again.
    for handle in maps[i].values():
      handle.close()


The output dataset of Fox News in 2015:

![WeChat Image_20211111203434](https://user-images.githubusercontent.com/34649843/141358651-64ff392f-a3c6-446f-8282-43053d1855fe.png)

![WeChat Image_20211111203500](https://user-images.githubusercontent.com/34649843/141358659-70f15cfe-f79e-4c6e-af1e-57e9cc712d57.png)

The datasets of other media outlets and other years are organized similarly.

We also want to extract the gender-related quotations from the dataset. We prepared a list of gender-related words: if a quotation or its URL contains any word from the list, we assume it is gender-related.

In [None]:
# Load the gender word list.

# Read the word list file line by line; `words` keeps the raw lines (including their newlines).
with open('/content/drive/Shareddrives/ADA/gender related words.txt', 'r') as infile:
  words = list(infile)

# Strip surrounding whitespace from every line and keep the distinct entries for fast membership tests.
gender_word_set = {entry.strip() for entry in words}

In [None]:
# Preprocess the text.

def clean_text(text):
    """Split `text` into a list of lowercase, purely alphabetic tokens.

    Every character that is not an ASCII letter (punctuation, digits, whitespace
    including newlines) acts as a token separator, so a URL path such as
    '/politics/women-in-congress' yields ['politics', 'women', 'in', 'congress']
    instead of being fused into one unmatchable token. ASCII apostrophes are the
    one exception: they are deleted so that contractions and possessives stay a
    single token ("women's" -> 'womens', "don't" -> 'dont').

    Parameters:
        text: the string to tokenize (a quotation or a URL path).

    Returns:
        The list of tokens, in order of appearance; empty if `text` has no letters.
    """
    # Remove apostrophes first so they do not split a word in two.
    text = text.replace("'", "")
    # Keep letters only; everything else (digits included) becomes a separator.
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Normalize case, then split on the runs of separators.
    return text.lower().split()

In [None]:
# Select the quotations according to the rules described above and write them to new files.

# For every year, scan the Fox News dataset and keep the quotations that are gender related:
# either one of the URL paths or the quotation itself contains a word from `gender_word_set`.
for src_path, dst_path in zip(path_to_out_list, gender_file_list):
  with bz2.open(src_path, 'rb') as s_file, bz2.open(dst_path, 'wb') as d_file:
    for raw_line in s_file:
      instance = json.loads(raw_line)
      urls = instance['urls']
      quotation = instance['quotation']

      # The URLs are checked first, in order, stopping at the first one whose path has a listed word.
      is_gender_related = any(
        not gender_word_set.isdisjoint(clean_text(urlparse(url).path))
        for url in urls
      )

      # The quotation text is only inspected when no URL matched.
      if not is_gender_related:
        is_gender_related = not gender_word_set.isdisjoint(clean_text(quotation))

      if not is_gender_related:
        continue

      d_file.write((json.dumps(instance)+'\n').encode('utf-8'))

The output dataset of Fox News is organized as follows:

![WeChat Image_20211111204329](https://user-images.githubusercontent.com/34649843/141359086-7332ac51-6e50-4b85-a7d1-c090be8b3617.png)

The datasets of other media outlets are organized similarly.