In [1]:
import pandas as pd

## Arguments processing 

In [2]:
# Read the procon arguments dump into a frame (the path suggests it is crawler
# output -- TODO confirm the record layout against the crawler).
args_json = pd.read_json(path_or_buf="../crawlers/procon/procon.json")

def format_args_row(row):
    """Expand one record into a frame with one row per list position.

    Every field of ``row`` is passed through ``pd.Series`` and the result is
    transposed, so the record's fields become columns. The ``url`` and
    ``topic`` columns are then overwritten with their position-0 value on
    every row.

    Parameters
    ----------
    row : pandas.Series
        One record; presumably each field holds a list (verify against the
        crawler output).

    Returns
    -------
    pandas.DataFrame
        The expanded record.
    """
    expanded = row.apply(pd.Series).T
    for shared_field in ("url", "topic"):
        # Broadcast the first entry of the field down the whole column.
        expanded[shared_field] = expanded[shared_field][0]
    return expanded

# Expand every record with format_args_row, then stack the per-record frames
# into one frame with a fresh 0..n-1 index.
arg_df_series = args_json.apply(format_args_row, axis=1)
args_df = pd.concat(list(arg_df_series.values), ignore_index=True)



In [3]:
def orientation(pro_con):
    """Return ``"Pro"`` when ``pro_con`` contains ``"Pro"``, otherwise ``"Con"``.

    Anything that does not contain ``"Pro"`` -- including an empty value --
    is reported as ``"Con"``.
    """
    return "Pro" if "Pro" in pro_con else "Con"

# Derive each argument's orientation from its pro_con label. Only that one
# column is needed, so map orientation() over it element-wise instead of
# materialising every row as a Series through DataFrame.apply(axis=1).
args_df = args_df.assign(argument_orient=args_df["pro_con"].map(orientation))

## Documents processing

#### Convert from JSON to a DataFrame and add the topic and source-stance information

In [4]:
import glob
import json

def create_df(path_list, topic, source_stance):
    """Load JSON files into one DataFrame tagged with a topic and a stance.

    Parameters
    ----------
    path_list : iterable of str
        Glob patterns. Every file matching any of the patterns is parsed as
        one JSON document and becomes one row.
    topic : str
        Value written to the ``text_topic`` column of every row.
    source_stance : str
        Value written to the ``text_source_stance`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per matched file plus the two tag columns. When
        ``path_list`` is empty or nothing matches, an empty frame that still
        carries the two tag columns is returned.

    Raises
    ------
    json.JSONDecodeError
        If a matched file does not contain valid JSON.
    """
    records = []
    for pattern in path_list:
        # glob returns matches in arbitrary, filesystem-dependent order;
        # sorting keeps the row order reproducible across runs and machines.
        for file_name in sorted(glob.glob(pattern)):
            # JSON interchange text is UTF-8, so decode it explicitly rather
            # than with the platform's default encoding.
            with open(file_name, encoding="utf-8") as json_file:
                records.append(json.load(json_file))

    # Build the frame once, after all patterns have been read. Doing it
    # inside the loop rebuilt it per pattern and left `df` undefined when
    # path_list was empty.
    df = pd.DataFrame(records)
    df['text_topic'] = topic
    df['text_source_stance'] = source_stance
    return df

In [5]:
# abort73.com crawl, tagged as a "Con" source on the "Abortion" topic.
path_list = [
    "../../../news-please-repo/data/2020/02/06/abort73.com/*.json",
]
topic, source_stance = "Abortion", "Con"

abort73 = create_df(path_list=path_list, topic=topic, source_stance=source_stance)

In [6]:
# mpp.org crawl (main site and blog), tagged as a "Pro" source on the
# "Marijuana" topic.
path_list = [
    "../../../news-please-repo/data/2020/02/06/mpp.org/*.json",
    "../../../news-please-repo/data/2020/02/06/blog.mpp.org/*.json",
]
topic, source_stance = "Marijuana", "Pro"

mpp = create_df(path_list=path_list, topic=topic, source_stance=source_stance)

In [7]:
# prochoiceamerica.org crawl, tagged as a "Pro" source on the "Abortion" topic.
path_list = [
    "../../../news-please-repo/data/2020/02/06/prochoiceamerica.org/*.json",
]
topic, source_stance = "Abortion", "Pro"

prochoice = create_df(path_list=path_list, topic=topic, source_stance=source_stance)

In [8]:
# calmca.org crawl, tagged as a "Con" source on the "Marijuana" topic.
path_list = [
    "../../../news-please-repo/data/2020/02/06/calmca.org/*.json",
]
topic, source_stance = "Marijuana", "Con"

calmca = create_df(path_list=path_list, topic=topic, source_stance=source_stance)

In [9]:
# Stack the four per-source frames into one corpus; ignore_index renumbers
# the rows 0..n-1 instead of keeping each source's own index.
document_df = pd.concat(
    objs=[abort73, prochoice, mpp, calmca],
    ignore_index=True,
)

#### Select the DataFrame columns to keep

In [10]:
# Keep only the fields used downstream, then prefix the document-side names
# with "text_" so they are distinguishable from the argument columns.
# NOTE(review): re-running this cell on the already-renamed frame would
# presumably leave the renamed source columns empty, because the original
# names no longer exist -- run it once per fresh `document_df`.
columns = [
    'authors',
    'date_modify',
    'date_publish',
    'title',
    'source_domain',
    'text',
    'url',
    'text_topic',
    'text_source_stance',
]
document_df = pd.DataFrame(document_df, columns=columns).rename(
    columns={
        "authors": "text_authors",
        "title": "text_title",
        "source_domain": "text_source_domain",
        "url": "text_url",
    }
)

#### Merge documents and arguments by topic 

In [11]:
# Pair every argument with every document of the same topic. No `how` is
# given, so merge's default join type applies.
args_docs = pd.merge(
    args_df,
    document_df,
    left_on="topic",
    right_on="text_topic",
)

In [12]:
# Write the argument/document pairs to disk. index_label=False only drops the
# header field for the index column; the index values themselves are still
# written because `index` keeps its default.
args_docs.to_csv(path_or_buf="args_docs.csv", index_label=False)

## Paragraphs

In [13]:
# Read the wiki paragraphs dump into a frame (the path suggests it is crawler
# output -- TODO confirm the record layout against the crawler).
wiki_json = pd.read_json(path_or_buf="../crawlers/procon/wiki.json")

In [14]:
def format_wiki_row(row):
    """Expand one wiki record into a frame with one row per list position.

    Every field of ``row`` is passed through ``pd.Series`` and the result is
    transposed, so the record's fields become columns. The ``source``,
    ``title``, ``topic``, ``stance`` and ``url`` columns are then overwritten
    with their position-0 value on every row.

    Parameters
    ----------
    row : pandas.Series
        One record; presumably each field holds a list (verify against the
        crawler output).

    Returns
    -------
    pandas.DataFrame
        The expanded record.
    """
    expanded = row.apply(pd.Series).T
    for shared_field in ("source", "title", "topic", "stance", "url"):
        # Broadcast the first entry of the field down the whole column.
        expanded[shared_field] = expanded[shared_field][0]
    return expanded

# Expand every wiki record with format_wiki_row, stack the per-record frames
# with a fresh 0..n-1 index, and rename the fields to "text_*" column names.
wiki_df_series = wiki_json.apply(format_wiki_row, axis=1)
wiki_df = pd.concat(list(wiki_df_series.values), ignore_index=True).rename(
    columns={
        "topic": "text_topic",
        "title": "text_title",
        "content": "text",
        "source": "text_source_domain",
        "stance": "text_source_stance",
        "url": "text_url",
    }
)

In [15]:
# Pair every argument with every wiki paragraph of the same topic. No `how`
# is given, so merge's default join type applies.
args_prg = pd.merge(
    args_df,
    wiki_df,
    left_on="topic",
    right_on="text_topic",
)

In [16]:
# Write the argument/paragraph pairs to disk. index_label=False only drops the
# header field for the index column; the index values themselves are still
# written because `index` keeps its default.
args_prg.to_csv(path_or_buf="args_prg.csv", index_label=False)