In [1]:
import os
import pickle
import random

import numpy as np
import pandas as pd
import pytz
from rake_nltk import Rake
from tqdm.auto import tqdm

from IPython.display import clear_output

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
INPUT_DEDUP_FILES_BASE_PATH = (
    "/home/pranavgoel/trans-fer-entropy/obtaining_news_collections/data/"
)

In [3]:
def create_and_save_combined_dataframe(input_file_names):
    """
    Loads all the dedup files representing various media groups (the 6 states + nytimes+foxnews) and concatenates them as one dataframe, while adding a column for the media group.
    """
    all_dfs = []
    for f in input_file_names:
        df = pd.read_csv(INPUT_DEDUP_FILES_BASE_PATH + f)
        media_group = f.split("_article_")[0]
        df["media_group"] = [media_group] * len(df)
        all_dfs.append(df)
    combined_df = pd.concat(all_dfs)
    return combined_df

In [4]:
input_file_names = [
        x for x in os.listdir(INPUT_DEDUP_FILES_BASE_PATH) if "dedup.csv" in x
    ]

In [5]:
combined_df = create_and_save_combined_dataframe(input_file_names)
print(combined_df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15883 entries, 0 to 666
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   media_name    15883 non-null  object
 1   publish_date  15883 non-null  object
 2   title         15883 non-null  object
 3   url           15883 non-null  object
 4   subtitle      13214 non-null  object
 5   text          15883 non-null  object
 6   sent_count    15883 non-null  int64 
 7   media_group   15883 non-null  object
dtypes: int64(1), object(7)
memory usage: 1.1+ MB
None


In [6]:
titles_and_texts = list(zip(combined_df["title"], 
                            combined_df["text"]))
titles_and_texts = list(map(lambda x: x[0] + " " + x[1], titles_and_texts))

### Obtain all texts with `trans` present in them, but not `transgender`

In [7]:
trans_only_texts = [x.lower() for x in tqdm(titles_and_texts) if 'trans' in x.lower() and 'transgender' not in x.lower()]
print(len(trans_only_texts))

100%|██████████| 15883/15883 [00:02<00:00, 7329.45it/s]

4477





In [9]:
N = len(titles_and_texts)
print(len(trans_only_texts)/N)

0.28187370144179313


### Create a random sample of 100, then manually label them as 1/0 for relevance (talking about transgender people and/or issues)

In [24]:
sample_texts_for_annotation = random.Random(42).sample(trans_only_texts, 100)

#### Below we label in notebook itself -- basically labeling if the term trans has been used to mean transgender (1) or something else (0)

In [38]:
labels = []
for text in sample_texts_for_annotation:
    show_text = text.replace('trans', 'TRANS') #to highlight and focus attention when labeling
    print(show_text, flush=True)
    label = int(input())
    labels.append(label)
    if len(labels) >= 100:
        print('DONE')
        break
    #print('\n---\n')
    
    clear_output(wait=True) #this will clear the console of previous outputs and show current text to be labeled

man’s anti-woke temper tantrum against jack daniel’s goes viral a new jersey man’s beef with jack daniel’s tennessee whiskey turning too “woke” has gone viral after he posted a video of himself torching a comically massive load of merchandise and booze to protest the brand’s support for the lgbtq+ community.

in the early morning of april 7, pauly michaelis posted a video of the defiant burn session to facebook.

“for those of you who know me well my drink has always been jack daniels,” he wrote. “for over 150 years since 1866 this drink was always associated with cowboys, warriors, bikers, savage rock bands and all american badass people.”

“they went woke,” he said with the middle finger emoji.

“jasper newton is turning over in his grave. they took a classic tradition of americana that was the total [definition] of masculinity and made it woke and for this i say fuck you jd and all of your products,” michaelis wrote. “you will never have the honor of touching my lips ever again. #ja

KeyboardInterrupt: Interrupted by user

In [39]:
print(len(labels))

13


In [40]:
#resuming below since some notebook display thing above hung up
for text in sample_texts_for_annotation[13:]:
    show_text = text.replace('trans', 'TRANS') #to highlight and focus attention when labeling
    print(show_text, flush=True)
    label = int(input())
    labels.append(label)
    if len(labels) >= 100:
        print('DONE')
        break
    #print('\n---\n')
    
    clear_output(wait=True) #this will clear the console of previous outputs and show current text to be labeled

brisket & rice is twice as nice “live-fire cooking next to a gas station: who would have thought?” brisket & rice co-owner hong tran asked. “only in texas,” he added with a laugh. it’s in this unlikely location in northwest houston that he; his wife, michelle; and his brother phong opened their barbecue joint just over a year ago.

the space inside the phillips 66 had been a church’s chicken and a chinese restaurant. tran, who got a cup of coffee at the gas station every morning on the way to work, realized it was the perfect place to park his pair of five-hundred-gallon smokers mounted on a trailer. “it was sitting in front of my face the whole time,” he said. even better, the location was just outside the city of houston’s health department jurisdiction, making it easier to get a permit. harris county only required an enclosed pit room in which to park them. “it’s ridiculous how lucky i got,” tran said.

the TRANS closed the drive-through to build the pit room, thankful their $20,000

KeyboardInterrupt: Interrupted by user

In [41]:
print(len(labels))

24


In [42]:
#resuming below since some notebook display thing above hung up
for text in sample_texts_for_annotation[24:]:
    show_text = text.replace('trans', 'TRANS') #to highlight and focus attention when labeling
    print(show_text, flush=True)
    label = int(input())
    labels.append(label)
    if len(labels) >= 100:
        print('DONE')
        break
    #print('\n---\n')
    
    clear_output(wait=True) #this will clear the console of previous outputs and show current text to be labeled

the braided life provides an integral gathering space for marginalized communities the building blocks for milly fotso’s community-centric east austin salon, the braided life, were tied to memories of her cameroonian roots—or, as she calls it, her “hairitage.” at age 5, her family relocated from the central african country to manhattan, kansas, where she became acutely aware of a sense of “otherness.”

her mom sought to become a safe harbor for her daughter, as well as help make ends meet, by braiding hair professionally out of their home. two years later, as fotso coped with the devastating death of her mother, her father stepped in to learn. driving her to a braiding studio in nearby junction city, which housed a larger african population, he sat in a space occupied only by women to observe. while embarrassed at the time, she now recalls the memory as a bonding moment.

the story of black hair weaves together themes like self-expression, social status, oppression, and community ident

KeyboardInterrupt: Interrupted by user

In [43]:
print(len(labels))

46


In [44]:
#resuming below since some notebook display thing above hung up
for text in sample_texts_for_annotation[46:]:
    show_text = text.replace('trans', 'TRANS') #to highlight and focus attention when labeling
    print(show_text, flush=True)
    label = int(input())
    labels.append(label)
    if len(labels) >= 100:
        print('DONE')
        break
    #print('\n---\n')
    
    clear_output(wait=True) #this will clear the console of previous outputs and show current text to be labeled

desantis and musk team up to 'make america florida' on wednesday evening, florida gov. ron desantis will make official what was already blindingly obvious: he’s running for president in 2024. but rather than putting up his own campaign video or stepping in front of the cameras on fox news, desantis will first go to a place where he feels even more comfortable: twitter. there he will enter the race in “conversation” with ceo elon musk.

then he’ll go to fox news, which is still willing to host his non-announcement despite being spurned in favor of musk because fox has made promoting desantis into one of their chief reasons to exist. sometime in the future, desantis will reportedly make a more standard announcement, complete with a video pushing his “make america florida” tagline (no, seriously, that’s his campaign slogan). but for now, desantis is very appropriately starting with a stunt announcement for a campaign that has totally eschewed the idea of government as a force for good.

f

KeyboardInterrupt: Interrupted by user

In [45]:
print(len(labels))

54


In [46]:
#resuming below since some notebook display thing above hung up
for text in sample_texts_for_annotation[54:]:
    show_text = text.replace('trans', 'TRANS') #to highlight and focus attention when labeling
    print(show_text, flush=True)
    label = int(input())
    labels.append(label)
    if len(labels) >= 100:
        print('DONE')
        break
    #print('\n---\n')
    
    clear_output(wait=True) #this will clear the console of previous outputs and show current text to be labeled

bentley bentayga airline spec is your private jet on wheels the bentayga offers a wide variety of luxury and performance. from the sportier w12-equipped speed model to the plug-in hybrid, there's a bentayga for your sophisticated motoring needs.

after nearly a week rolling around town in the 2023 bentley bentayga, enjoying its power, comfort and the ability to make a grand entrance basically anywhere, it dawns on me that i’m doing this wrong. yes, the v8 is great, but of course it is. the story is in the back seat.

i’m testing an extended wheelbase bentayga azure first edition outfitted with the airline seat specification. it’s business class, plain and simple. as darkness falls, i make up a cheese plate and head to the backseat, placing my charcuterie on the veneered picnic trays below the 10.1-inch screens. leaning back on the pillowy headrest, i feel like i’m about to settle in for a cushy TRANS-atlantic flight.

there’s so much room. the wheelbase extends 7 inches beyond the stan

KeyboardInterrupt: Interrupted by user

In [47]:
print(len(labels))

85


In [48]:
#resuming below since some notebook display thing above hung up
for text in sample_texts_for_annotation[85:]:
    show_text = text.replace('trans', 'TRANS') #to highlight and focus attention when labeling
    print(show_text, flush=True)
    label = int(input())
    labels.append(label)
    if len(labels) >= 100:
        print('DONE')
        break
    #print('\n---\n')
    
    clear_output(wait=True) #this will clear the console of previous outputs and show current text to be labeled

the end times for the officious figureheads commentary

later in this piece i discuss the hbo series “succession.” if you are allergic to anything like a very mild spoiler—i don’t think what i’m going to say spoils anything really—you should not read this. it’s about real and fake in the workplace and how to deal with the difference.

tell me if this has ever happened to you. you have a boss—a person perhaps one ladder rung ahead of you but not really in charge of the enterprise itself—and he is an entitled jerk, sort of like the legendary one in “office space.”

it’s not clear how he got there but there he is. you figure he must have worked his way up, knows the business, has a range of skills, earned the respect of the owners or board or ceo, and eventually was widely trusted.

but then you notice over time that something is off. he is always giving orders, strutting around, acting the part. but in quiet ways, he keeps handing out favors to friends and friends of friends. he is also 

 1


DONE


In [49]:
print(len(labels))

100


In [50]:
print(labels.count(1))

61


## Result: For 61/100 of articles that have 'trans' but not 'transgender', the term trans was used to mean transgender. We cannot remove all texts with 'trans' but no 'transgender' -- too many cases of relevant 'trans' including terms like transphobic or transphobia etc. without mentioning transgender -- surely don't wanna remove those! 

### Do we just live with the data, acknowledge this drawback and only analyze topics (including topic labels provided by humans) that are transgender-related?