## **Import Libraries**

In [1]:
import pandas as pd # type: ignore
# Show the full text of every cell instead of truncating long strings.
# The fully qualified option name is used because abbreviated keys are
# resolved by pattern matching and can become ambiguous in later pandas versions.
pd.set_option('display.max_colwidth', None)

In [297]:
import cohere
from cohere import ClassifyExample

In [2]:
from getpass import getpass
import os

# Prompt for the key without echoing it, so it never appears in the notebook
# source or output. Whitespace picked up while pasting is stripped, and an
# empty entry fails here instead of at the first API call further down.
co_api_key = getpass("Enter Cohere API key: ").strip()
if not co_api_key:
    raise ValueError("Cohere API key must not be empty.")

# Export the key to the process environment and build the client that the
# classification cells below use.
os.environ['COHERE_API_KEY'] = co_api_key
cohere_client = cohere.Client(api_key=co_api_key)

In [6]:
# List the files present in the Clean_CSV data directory.
os.listdir('Data/Clean_CSV/')

['ClimateOffensive_comments_clean.csv',
 'climatechange_comments_clean.csv',
 'sustainability_comments_clean.csv',
 'roblox_comments_clean.csv',
 'ClimateOffensive_submissions_clean.csv',
 'climatechange_submissions_clean.csv',
 'Anxietyhelp_submissions_clean.csv',
 'sustainability_submissions_clean.csv',
 'Anxietyhelp_comments_clean.csv',
 'roblox_submissions_clean.csv']

## **Dataset**

In [18]:
# Load the submissions file of each subreddit into its own DataFrame:
# co = ClimateOffensive, cc = climatechange, st = sustainability,
# ax = Anxietyhelp, rb = roblox.
df_co = pd.read_csv('Data/Clean_CSV/ClimateOffensive_submissions_clean.csv')
df_cc = pd.read_csv('Data/Clean_CSV/climatechange_submissions_clean.csv')
df_st = pd.read_csv('Data/Clean_CSV/sustainability_submissions_clean.csv')
df_ax = pd.read_csv('Data/Clean_CSV/Anxietyhelp_submissions_clean.csv')
df_rb = pd.read_csv('Data/Clean_CSV/roblox_submissions_clean.csv')

#### **Anxiety Help**

In [336]:
# Display one randomly chosen Anxietyhelp submission to inspect the columns.
df_ax.sample(n=1)

Unnamed: 0,subreddit,created_datetime,year,score,title,selftext,link_flair_text,num_comments,num_crossposts
33925,Anxietyhelp,2022-01-11 18:34:59,2022,3,The Healing Effects of Jin Shin Jyutsu,,Article,0,0.0


In [29]:
# Count Anxietyhelp submissions per flair (missing flairs are not counted).
df_ax['link_flair_text'].value_counts()

link_flair_text
Need Advice              6449
Need Help                5105
Question                 1025
Video                    1020
Self Help Strategy        878
Personal Experience       748
Discussion                716
Giving Advice             706
Music                     515
Article                   353
Personal Achievement!     163
Research Study            123
Poll / Survey              55
Name: count, dtype: int64

In [343]:
# Build the Anxietyhelp part of the "not youth" pool: submissions flaired
# 'Article' or 'Research Study' whose text is not a deleted/removed
# placeholder, keeping only rows with no missing value in any column.
df_ax_non_youth = (
    df_ax
    .loc[lambda d: d['link_flair_text'].isin(['Article', 'Research Study'])]
    .loc[lambda d: ~d['selftext'].isin(['[deleted]', '[removed]'])]
    .dropna()
)

print(len(df_ax_non_youth))

93


#### **Sustainability**

In [26]:
# Display the first sustainability submission to inspect the columns.
df_st.head(n=1)

Unnamed: 0,subreddit,created_datetime,year,score,title,selftext,link_flair_text,num_comments,num_crossposts
0,sustainability,2008-07-13 08:17:41,2008,1,G8 goes electric with Subaru,,,0,0.0


In [27]:
# Count sustainability submissions per flair (missing flairs are not counted).
df_st['link_flair_text'].value_counts()

link_flair_text
Misleading                         8
Misleading: it WILL, not it HAS    1
Name: count, dtype: int64

#### **Climate Change**

In [24]:
# Display the first climatechange submission to inspect the columns.
df_cc.head(n=1)

Unnamed: 0,subreddit,created_datetime,year,score,title,selftext,link_flair_text,num_comments,num_crossposts
0,climatechange,2010-01-04 01:45:11,2010,0,Obama Misses The Green,,,0,0.0


In [25]:
# Count climatechange submissions per flair (missing flairs are not counted).
df_cc['link_flair_text'].value_counts()

Series([], Name: count, dtype: int64)

#### **Climate Offensive**

In [19]:
# Display the first ClimateOffensive submission to inspect the columns.
df_co.head(n=1)

Unnamed: 0,subreddit,created_datetime,year,score,title,selftext,link_flair_text,num_comments,num_crossposts
0,ClimateOffensive,2018-10-29 05:03:40,2018,39,Discussion,Let's brainstorm and recruit. A lot of people ...,,71,0


In [20]:
# Count ClimateOffensive submissions per flair (missing flairs are not counted).
df_co['link_flair_text'].value_counts()

link_flair_text
News                               1076
Discussion/Question                 985
Sustainability Tips &amp; Tools     611
Action - Other                      565
Idea                                548
Action - Political                  509
Question                            476
Action - USA üá∫üá∏                     475
Action - Event                      388
Action - International üåç            279
Motivation Monday                   238
Action - Volunteering               215
Action - Petition                   215
Action                              163
Action - Fundraiser                 159
Action - Share                      142
Climate Politics                    129
Climate News                        128
Discussion                           99
Action - Europe üá™üá∫                   64
Action - United Kingdom üá¨üáß           62
Community Update                     58
Action - Australia üá¶üá∫                43
Technology                           

In [341]:
# Build the ClimateOffensive part of the "not youth" pool: submissions flaired
# 'News' or 'Climate News' whose text is not a deleted/removed placeholder,
# keeping only rows with no missing value in any column.
df_co_non_youth = (
    df_co
    .loc[lambda d: d['link_flair_text'].isin(['News', 'Climate News'])]
    .loc[lambda d: ~d['selftext'].isin(['[deleted]', '[removed]'])]
    .dropna()
)

print(len(df_co_non_youth))

43


## **Making Labeled Dataset**

#### **Not Youth**

In [344]:
# Stack the two "not youth" pools into one frame with a fresh 0..n-1 index,
# which the example-building loop relies on for positional lookups.
non_youth = pd.concat(
    (df_co_non_youth, df_ax_non_youth),
    axis=0,
    ignore_index=True,
)
print(non_youth.shape[0])

136


In [352]:
# Display one randomly chosen "not youth" example row.
non_youth.sample(n=1)

Unnamed: 0,subreddit,created_datetime,year,score,title,selftext,link_flair_text,num_comments,num_crossposts
27,ClimateOffensive,2019-08-12 19:26:17,2019,14,"Air Pollution is a Bigger Problem than You Think, but Solutions are Coming","Air pollution is responsible for the deaths of around 7 million people each year, causes a myriad of long-term health issues, hinders brain development in children, and costs governments around the world roughly $5 trillion per year, easily making it one of the biggest problems for humanity.\n\n[http://thehappyneuron.com/2019/08/12/air-pollution-is-a-bigger-problem-than-you-think-but-solutions-are-coming/](http://thehappyneuron.com/2019/08/12/air-pollution-is-a-bigger-problem-than-you-think-but-solutions-are-coming/)",News,0,0.0


#### **Youth**

In [177]:
# Display the first roblox submission to inspect the columns.
df_rb.head(n=1)

Unnamed: 0,subreddit,created_datetime,year,score,title,selftext,link_flair_text,num_comments,num_crossposts
0,roblox,2010-04-10 03:57:55,2010,0,Free Games at ROBLOX.com,[deleted],,0,0.0


In [179]:
# Count missing values per column of the roblox submissions.
df_rb.isnull().sum()

subreddit                0
created_datetime         0
year                     0
score                    0
title                    0
selftext            171473
link_flair_text      60680
num_comments             0
num_crossposts       38732
dtype: int64

In [180]:
# Keep only roblox submissions that still have real text: drop the
# deleted/removed placeholders, then rows whose selftext is missing.
df_rb = (
    df_rb
    .loc[lambda d: ~d['selftext'].isin(['[deleted]', '[removed]'])]
    .dropna(subset=['selftext'])
)

In [244]:
# Draw 150 roblox submissions to serve as the 'youth' examples. The seed is
# fixed so the labeled example set is the same on every run; the index is
# reset so rows can be addressed as 0..149.
Youth = df_rb.sample(n=150, random_state=42).reset_index(drop=True)

In [353]:
# Display one randomly chosen "youth" example row.
Youth.sample(n=1)

Unnamed: 0,subreddit,created_datetime,year,score,title,selftext,link_flair_text,num_comments,num_crossposts
123,roblox,2020-07-13 20:23:30,2020,3,Amateur creative designer,10 robux and I'll work on a fame design send me your user name and if you have a group I would like you to send me it\n\n\nKeep in mind I'm not very good but the more times I get hired the more experienced I get,Discussion,11,0.0


#### **Labeled Dataset**

In [246]:
# Build the labeled examples for the classifier. Each example's text is the
# submission title and body joined by ', '. Youth examples come first,
# followed by the not-youth ones.
examples = [
    ClassifyExample(text=title + ', ' + body, label='youth')
    for title, body in zip(Youth['title'], Youth['selftext'])
]
examples.extend(
    ClassifyExample(text=title + ', ' + body, label='not youth')
    for title, body in zip(non_youth['title'], non_youth['selftext'])
)

## **Classify Comments from Sustainability**

In [252]:
st_comments = pd.read_csv('Data/Clean_CSV/sustainability_comments_clean.csv')

# Remove deleted/removed placeholders and rows with a missing body; a NaN body
# would otherwise be sent to the classifier as a float instead of text.
st_comments = (
    st_comments
    .loc[lambda d: ~d['body'].isin(['[deleted]', '[removed]'])]
    .dropna(subset=['body'])
    .reset_index(drop=True)
)

# Fixed seed so the saved labeled file is reproducible across runs.
# NOTE(review): 95 presumably keeps the request under the API's per-call
# input limit - confirm against the Cohere classify documentation.
df_test_st = st_comments.sample(n=95, random_state=42).reset_index(drop=True)

inputs = df_test_st['body'].tolist()

response = cohere_client.classify(
    inputs=inputs,
    examples=examples,
)

# Attach the predicted label and its confidence (rounded to 2 decimals);
# classifications are matched to the sampled rows by position.
predictions = [item.prediction for item in response.classifications]
df_test_st['prediction'] = predictions

confidence = [round(item.confidence,2) for item in response.classifications]
df_test_st['confidence'] = confidence

df_test_st.to_csv('Data/Labeled/Sustainability_Labeled.csv',index=False)

In [253]:
# Display five randomly chosen labeled sustainability comments.
df_test_st.sample(n=5)

Unnamed: 0,subreddit,created_datetime,year,score,body,prediction,confidence
41,sustainability,2021-02-04 21:37:43,2021,2,Now I can rest easy about micro plastics. I love Reddit!,not youth,1.0
35,sustainability,2022-11-02 01:40:33,2022,1,"Not that long ago, people could not imagine a horse free transportation system. We will be car free when the thing that makes sense shows up. Until then we are just doing a square peg round hole exercise.",not youth,1.0
7,sustainability,2019-12-04 13:35:55,2019,3,That is so sad :(,youth,0.98
73,sustainability,2022-11-11 00:12:40,2022,10,"Yes, that‚Äôs well known. If you take marketing courses, this is frequently mentioned. The purpose of marketing is to change peoples‚Äô spending behavior, usually in favor of a brand or product. That does not require an ounce of truth. \n\nRegulations prohibit marketing campaigns from lying (eg ‚Äúcontains calcium‚Äù with none present) or making unfounded medical claims (eg ‚Äúcures cancer‚Äù). However these rules are limited to proveable claims, or regulated words / phrases (eg trademark). \n\nThat leaves A LOT of room to abuse language and make misleading, vague claims. In short, you shouldn‚Äôt take *any* marketing seriously, *especially* if the claims appeal to you. \n\nSource: studied marketing, communication, and social media for my job in Sustainability for large organizations (non-profit). I asked every one of my professors if the marketing profession had a ‚Äúcode of ethics‚Äù and they all laughed.\n\nEdit: proveable claims such as ‚Äú99% recycled material‚Äù or similar are usually fairly good. They will stretch the math to make that claim, but they‚Äôre also on the hook if someone sues them for false advertising. However that doesn‚Äôt make all number-based claims reliable. It‚Äôs common practice for companies to reduce the quantity of a product by say 40% and a short time later increase it back up. They‚Äôll proudly put ‚Äúnow with 30% more‚Äù on the box, even though it is less than the original quantity. Naturally the price has stayed the same throughout.",not youth,1.0
91,sustainability,2021-02-24 15:32:35,2021,11,"The picture is an example of filtering, not overfishing. Wtf",youth,0.74


## **Classify Comments from Climate Change**

In [254]:
cc_comments = pd.read_csv('Data/Clean_CSV/climatechange_comments_clean.csv')

# Remove deleted/removed placeholders and rows with a missing body; a NaN body
# would otherwise be sent to the classifier as a float instead of text.
cc_comments = (
    cc_comments
    .loc[lambda d: ~d['body'].isin(['[deleted]', '[removed]'])]
    .dropna(subset=['body'])
    .reset_index(drop=True)
)

# Fixed seed so the saved labeled file is reproducible across runs.
# NOTE(review): 95 presumably keeps the request under the API's per-call
# input limit - confirm against the Cohere classify documentation.
df_test_cc = cc_comments.sample(n=95, random_state=42).reset_index(drop=True)

inputs = df_test_cc['body'].tolist()

response = cohere_client.classify(
    inputs=inputs,
    examples=examples,
)

# Attach the predicted label and its confidence (rounded to 2 decimals);
# classifications are matched to the sampled rows by position.
predictions = [item.prediction for item in response.classifications]
df_test_cc['prediction'] = predictions

confidence = [round(item.confidence,2) for item in response.classifications]
df_test_cc['confidence'] = confidence

df_test_cc.to_csv('Data/Labeled/ClimateChange_Labeled.csv',index=False)

In [290]:
# Display three randomly chosen labeled climatechange comments.
df_test_cc.sample(n=3)

Unnamed: 0,subreddit,created_datetime,year,score,body,prediction,confidence
75,climatechange,2021-11-12 01:36:19,2021.0,1,in what possible way ?,youth,0.63
57,climatechange,2022-10-05 17:24:06,2022.0,0,"Billions. You can run from a tide gone wild, a tsunami warning can protect many, there are very strong constructions some communities can built that can last and protect for centuries. But running from everything else, pandemics of any origin for instance, or hunger, and collapsing eco systems, is going to be impossible for so incredibly many people.",not youth,1.0
72,climatechange,2020-01-09 17:55:55,2020.0,-1,That is indeed censorship.,youth,0.94


## **Classify Comments from ClimateOffensive**

In [291]:
co_comments = pd.read_csv('Data/Clean_CSV/ClimateOffensive_comments_clean.csv')

# Remove deleted/removed placeholders and rows with a missing body; a NaN body
# would otherwise be sent to the classifier as a float instead of text.
co_comments = (
    co_comments
    .loc[lambda d: ~d['body'].isin(['[deleted]', '[removed]'])]
    .dropna(subset=['body'])
    .reset_index(drop=True)
)

# Fixed seed so the saved labeled file is reproducible across runs.
# NOTE(review): 95 presumably keeps the request under the API's per-call
# input limit - confirm against the Cohere classify documentation.
df_test_co = co_comments.sample(n=95, random_state=42).reset_index(drop=True)

inputs = df_test_co['body'].tolist()

response = cohere_client.classify(
    inputs=inputs,
    examples=examples,
)

# Attach the predicted label and its confidence (rounded to 2 decimals);
# classifications are matched to the sampled rows by position.
predictions = [item.prediction for item in response.classifications]
df_test_co['prediction'] = predictions

confidence = [round(item.confidence,2) for item in response.classifications]
df_test_co['confidence'] = confidence

df_test_co.to_csv('Data/Labeled/ClimateOffensive_Labeled.csv',index=False)

In [296]:
# Display three randomly chosen labeled ClimateOffensive comments.
df_test_co.sample(n=3)

Unnamed: 0,subreddit,created_datetime,year,score,body,prediction,confidence
27,ClimateOffensive,2019-06-07 20:09:41,2019,3,Why not just bring your own coffee cups? Don‚Äôt offer any cups.,not youth,0.82
24,ClimateOffensive,2019-02-13 12:44:23,2019,1,"I'm going to make my own sub, one where freedom of speech is prevalent.",youth,0.73
80,ClimateOffensive,2022-05-29 04:31:39,2022,5,"Thats a highly flawed study. Voters tend to be older and more conservative compared to non-voters. This study is claiming (or at least strongly implying) a causative affect that cant be shown, and is very likely false due to the populations theyre comparing. If older and more conservative people's policy opinions align better with corporate interest, then their opinions will be better represented.\n\nhttps://www.pewresearch.org/2010/10/29/the-party-of-nonvoters/",not youth,1.0


## **Classify Comments from Anxiety Help**

In [298]:
ax_comments = pd.read_csv('Data/Clean_CSV/Anxietyhelp_comments_clean.csv')

# Remove deleted/removed placeholders and rows with a missing body; a NaN body
# would otherwise be sent to the classifier as a float instead of text.
ax_comments = (
    ax_comments
    .loc[lambda d: ~d['body'].isin(['[deleted]', '[removed]'])]
    .dropna(subset=['body'])
    .reset_index(drop=True)
)

# Fixed seed so the saved labeled file is reproducible across runs.
# NOTE(review): 95 presumably keeps the request under the API's per-call
# input limit - confirm against the Cohere classify documentation.
df_test_ax = ax_comments.sample(n=95, random_state=42).reset_index(drop=True)

inputs = df_test_ax['body'].tolist()

response = cohere_client.classify(
    inputs=inputs,
    examples=examples,
)

# Attach the predicted label and its confidence (rounded to 2 decimals);
# classifications are matched to the sampled rows by position.
predictions = [item.prediction for item in response.classifications]
df_test_ax['prediction'] = predictions

confidence = [round(item.confidence,2) for item in response.classifications]
df_test_ax['confidence'] = confidence

df_test_ax.to_csv('Data/Labeled/Anxietyhelp_Labeled.csv',index=False)

In [326]:
# Display three randomly chosen labeled Anxietyhelp comments.
df_test_ax.sample(n=3)

Unnamed: 0,subreddit,created_datetime,year,score,body,prediction,confidence
36,Anxietyhelp,2020-12-17 01:03:44,2020,1,"Thank you! We‚Äôve briefly talked about how I have anxiety and how I started therapy this year. But yes, one thing my therapist said was to speak more on how I feel and be a bit more vulnerable so others could understand better.",not youth,1.0
26,Anxietyhelp,2021-05-24 03:11:42,2021,1,I just got my license the week of derby so I‚Äôm going through the same adjustment. I drive the roads I‚Äôve driven with my parents a few times then I drive them by myself and slowly expand to new streets and directions.,youth,0.61
66,Anxietyhelp,2022-07-14 04:51:37,2022,2,I'm 41 and I still have no license. Fear of driving and and being used to Uber.,not youth,0.96
