# Positive Count Grabber

In [34]:
# ---- Run configuration: edit these values ----------
SCHOOL_NAME = "York"   # school whose articles are processed
PIPELINE = []          # handed unchanged to the query helper
STRINGS = [            # search terms, processed one at a time
    "China",
    "India",
    "Israel",
    "Palestine",
]
# -----------------------------------------------------

# JSON file of article pages, named after the lower-cased school name.
ARTICLE_JSON_PATH = SCHOOL_NAME.lower() + '_article_pages.json'

# Directory prefix that get_csv_file prepends to every per-term CSV name.
csv_path = 'grouped_data/csv/'
def get_csv_file(string):
    """Return the CSV path for ``string``: ``<csv_path><SCHOOL_NAME>_<string>.csv``.

    Reads the module-level ``csv_path`` and ``SCHOOL_NAME`` at call time.
    """
    return "{}{}_{}.csv".format(csv_path, SCHOOL_NAME, string)

In [49]:
import re
import json
import json
from helpers.queries import count_query
from helpers.processors import csv_to_positive_articles, schools_pipeline_query_to_csv

def runner(school_name,
           pipeline,
           string,
           csv_file,
           article_json_path):
    """Build and save the table of articles mentioning ``string`` for one school.

    Steps, in order:
      1. Run the count query into ``csv_file`` and load its positive rows.
      2. Attach each row's links, looked up by date in ``article_json_path``.
      3. Split each row's text file into individual article texts.
      4. Print a line for every row whose text and link counts differ.
      5. Keep only the links / texts whose article text matches ``string``.
      6. Write the frame to ``{school_name}_{string}.csv`` (relative path).

    Args:
        school_name: School identifier. Only "UCF" and "York" have a text
            splitting rule.
        pipeline: Passed unchanged to ``schools_pipeline_query_to_csv``.
        string: Search term handed to ``count_query``.
        csv_file: CSV path written by the query helper and then read back.
        article_json_path: JSON file keyed by the row date with ``-`` replaced
            by ``_``; each value is the collection of links for that date.

    Raises:
        ValueError: If ``school_name`` has no text splitting rule.
        KeyError: If a row's date has no entry in the article JSON.
    """
    def generate_positive_csv():
        """Run the count query into ``csv_file`` and load the positive rows."""
        schools_pipeline_query_to_csv(school_names=[school_name],
                                      pipeline=pipeline,
                                      query=count_query(string),
                                      csv_file=csv_file)
        return csv_to_positive_articles(csv_file)

    def set_column(df, name, per_row):
        """Assign ``df[name]`` from ``per_row`` evaluated on every row."""
        # Built with a plain loop so an empty frame simply gets an empty column.
        df[name] = [per_row(row) for _, row in df.iterrows()]

    def make_links_field(df):
        """Add 'links': the JSON entry for each row's date."""
        # Explicit encoding, consistent with how the text files are read below.
        with open(article_json_path, 'r', encoding='utf-8') as f:
            article_pages = json.load(f)

        set_column(df, 'links',
                   lambda row: article_pages[row['date'].replace('-', '_')])

    def make_text_field(df):
        """Add 'text': the row's text file split into article texts."""
        def read_segments(row):
            # Articles are separated by runs of three or more newlines.
            with open(row['txt_file'], 'r', encoding='utf-8') as f:
                return re.split(r'\n{3,}', f.read())

        def split_text_ucf(row):
            # UCF: the final segment is always dropped.
            return read_segments(row)[:-1]

        def split_text_york(row):
            # York: drop the final segment only when more than one exists,
            # so a single-segment file keeps its only text.
            texts = read_segments(row)
            if len(texts) > 1:
                texts = texts[:-1]
            return texts

        if school_name == "UCF":
            splitter = split_text_ucf
        elif school_name == 'York':
            splitter = split_text_york
        else:
            raise ValueError(f"School {school_name} not yet supported.")

        set_column(df, 'text', splitter)

    def text_link_checker(df):
        """Print every row whose number of texts and links disagree."""
        for index, row in df.iterrows():
            if len(row['text']) != len(row['links']):
                print(f"Date {row['date']}: Length of 'text' is {len(row['text'])}, while length of 'links' is {len(row['links'])}")

    # Build the matcher once and reuse it for every article text.
    matcher = count_query(string)

    def keep_matching(row, column):
        """Return the entries of ``row[column]`` paired with a matching text."""
        counts = [matcher(text) for text in row['text']]
        # zip stops at the shorter sequence, so mismatched rows are truncated.
        return [value for value, count in zip(row[column], counts) if count != 0]

    def make_matching_text_field(df):
        """Add 'matching_text': article texts whose count is non-zero."""
        set_column(df, 'matching_text', lambda row: keep_matching(row, 'text'))

    def make_matching_links_field(df):
        """Add 'matching_links': links paired with a matching article text."""
        set_column(df, 'matching_links', lambda row: keep_matching(row, 'links'))

    def save_df(df):
        """Write the finished frame to ``{school_name}_{string}.csv``."""
        df.to_csv(f'{school_name}_{string}.csv')

    df = generate_positive_csv()
    make_links_field(df)
    make_text_field(df)
    text_link_checker(df)
    make_matching_links_field(df)
    make_matching_text_field(df)
    save_df(df)
    # Report the school actually processed, not the notebook-level constant.
    print(f'Finished working with {school_name} for {string}.')

In [50]:
# Process every configured search term for the selected school.
for STRING in STRINGS:
    runner(
        school_name=SCHOOL_NAME,
        pipeline=PIPELINE,
        string=STRING,
        csv_file=get_csv_file(STRING),
        article_json_path=ARTICLE_JSON_PATH,
    )

grouped_data/csv/York_China.csv already exists.
Date 2011-04-18: Length of 'text' is 2, while length of 'links' is 1
Date 2011-09-28: Length of 'text' is 11, while length of 'links' is 9
Date 2020-10-29: Length of 'text' is 1, while length of 'links' is 2
Finished working with York for China.
grouped_data/csv/York_India.csv already exists.
Date 2014-08-27: Length of 'text' is 24, while length of 'links' is 10
Date 2020-10-30: Length of 'text' is 1, while length of 'links' is 2
Date 2021-01-29: Length of 'text' is 1, while length of 'links' is 2
Date 2021-02-25: Length of 'text' is 1, while length of 'links' is 2
Date 2023-01-30: Length of 'text' is 3, while length of 'links' is 4
Finished working with York for India.
grouped_data/csv/York_Israel.csv already exists.
Date 2010-10-27: Length of 'text' is 13, while length of 'links' is 10
Date 2011-01-26: Length of 'text' is 10, while length of 'links' is 9
Date 2011-02-17: Length of 'text' is 6, while length of 'links' is 4
Date 2013-12-0

Theoretical text processing, to be used when the article text comes from PDFs.

In [2]:
# # Load in the data
# import nltk
# from helpers.processors import positive_articles_to_sentences
# from helpers.processors import preprocess_text
# import pandas as pd
# import numpy as np
# nltk.download('wordnet')

# txt_list = positive_articles_to_sentences(df=df, string=string)
# proc_txt_list = [preprocess_text(txt) for txt in txt_list]
# process_checker = pd.DataFrame([txt_list, proc_txt_list]).transpose()
# process_checker.columns = ["Raw", "Processed"]
# process_checker.to_csv("example_processed_text.csv", index=False)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\adaml\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Fetching all associated text: