# Positive Count Grabber

In [32]:
SCHOOL_NAME = "UCF"
PIPELINE=[]
STRING = "China"
ARTICLE_JSON_PATH = 'ucf_article_pages.json'

csv_path = 'grouped_data/csv/'
csv_file = f'{csv_path}{SCHOOL_NAME}_{STRING}.csv'

In [25]:
import re
import json
import json
from helpers.queries import count_query
from helpers.processors import csv_to_positive_articles, schools_pipeline_query_to_csv

def generate_positive_csv():
    print("Generating positive CSV.")
    schools_pipeline_query_to_csv(school_names=[SCHOOL_NAME], 
                                pipeline=PIPELINE, 
                                query=count_query(STRING), 
                                csv_file=csv_file)
    df = csv_to_positive_articles(csv_file)
    return df

def make_links_field(df):
    print("Making links field.")
    with open(ARTICLE_JSON_PATH, 'r') as f:
        ucf_article_pages = json.load(f)

    df['links'] = df.apply(lambda row: ucf_article_pages[row['date'].replace('-','_')], axis=1)

def make_text_field(df):
    print("Making text field.")
    def split_text(school_name):
        if school_name == "UCF":
            def split_text_ucf(row):
                with open(row['txt_file'], 'r', encoding='utf-8') as f:
                    text = f.read()
                return re.split(r'\n{3,}', text)[:-1]
            return split_text_ucf
        else:
            raise ValueError(f"School {school_name} not yet supported.")

    df['text'] = df.apply(split_text(school_name=SCHOOL_NAME), axis=1)

def text_link_checker(df):
    print("Checking for inconsistencies between df['text'] and df['links'].")
    for index, row in df.iterrows():
        if len(row['text']) != len(row['links']):
            print(f"Row {index}: Length of 'text' does not match length of 'links'")
            print(row['text'], row['links'])

def make_matching_text_field(df):
    print("Making matching text field.")
    def matching_text(row):
        indices = [count_query(STRING)(text) for text in row['text']]
        new_text = [value for value, index in zip(row['text'], indices) if index != 0]
        return new_text
    df['matching_text'] = df.apply(matching_text, axis=1)

def make_matching_links_field(df):
    print("Making matching links field. May nisbehave if any incosistencies found prior.")
    def matching_links(row):
        indices = [count_query(STRING)(text) for text in row['text']]
        new_text = [value for value, index in zip(row['links'], indices) if index != 0]
        return new_text
    df['matching_links'] = df.apply(matching_links, axis=1)

def save_df(df):
    print("Saving df.")
    df.to_csv(f'{SCHOOL_NAME}_{STRING}.csv')

In [33]:
df = generate_positive_csv()
make_links_field(df)
make_text_field(df)
text_link_checker(df)
make_matching_links_field(df)
make_matching_text_field(df)
save_df(df)

Generating positive CSV.
grouped_data/csv/UCF_China.csv created.
Making links field.
Making text field.
Checking for inconsistencies between df['text'] and df['links'].
Row 665: Length of 'text' does not match length of 'links'
['READ FULL STORY ON JOE HORNSTEIN. \nWe obtained this information from the Associate Vice President of UCF.\nJoe Hornstein has been released from his duties as the Associate Athletics Director for Public Relations and Communications. We thank Joe for his dedicated service and many contributions to our program and wish him well in all future endeavors.\nUntil a replacement is hired and in place, Chad Binette (Associate Director, UCF News and Information) will serve as the Acting Athletics PR and Communications\nDirector — effective immediately and until further notice.\nChad has the university’s complete confidence in his new assignment.\nCheck back as we gather more details on this breaking news.', 'The FSU SGA elections scandal is headed to the university’s Su

Theoretical text processing, to be used in the case of pdf text.

In [2]:
# # Load in the data
# import nltk
# from helpers.processors import positive_articles_to_sentences
# from helpers.processors import preprocess_text
# import pandas as pd
# import numpy as np
# nltk.download('wordnet')

# txt_list = positive_articles_to_sentences(df=df, string=string)
# proc_txt_list = [preprocess_text(txt) for txt in txt_list]
# process_checker = pd.DataFrame([txt_list, proc_txt_list]).transpose()
# process_checker.columns = ["Raw", "Processed"]
# process_checker.to_csv("example_processed_text.csv", index=False)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\adaml\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Fetching all associated text: