# Init

In [1]:
import pandas as pd
import numpy as np
import sidetable 
import os

np.random.seed(67)

In [2]:
def top_terms_per_segment(summary_table, segment, top_n):
    list_of_segments = np.unique(summary_table[segment])
    top_terms = []
    
    for group in list_of_segments:
        filtered_table = summary_table[summary_table[segment] == group].sort_values('count', ascending = False)
        most_frequent_terms = pd.Series(filtered_table.Label.values)
        most_frequent_terms.drop_duplicates(inplace = True)
        #print(most_frequent_terms[0:top_n])
        most_frequent_terms = most_frequent_terms[0:top_n] 
        most_frequent_terms.reset_index(inplace=True, drop=True)
        top_terms.append(most_frequent_terms)
        
    table_out = pd.DataFrame(top_terms).transpose()
    table_out.columns = list_of_segments
    
    return table_out


## Load files 

In [3]:
data = pd.read_csv('data_backup.csv')

In [4]:
community_appended_adj_list = pd.read_csv('adjacency list export with modularity class.csv')

In [5]:
community_appended_adj_list.rename({'source':'id','Target':'Label'}, axis=1, inplace=True)

In [6]:
community_appended_adj_list.head()

Unnamed: 0,id,Label,community
0,1540683731630927874,increase,0
1,1540683731630927874,new,0
2,1540683731630927874,reduce,0
3,1540683731630927874,study,0
4,1540366146423095296,check,1


In [7]:
adj_list_with_brand_label = pd.read_csv('adjacency_list_for_merging.csv')

In [8]:
adj_list_with_brand_label.head()

Unnamed: 0,id,Target,brand
0,1540683731630927874,increase,@ewg
1,1540683731630927874,new,@ewg
2,1540683731630927874,reduce,@ewg
3,1540683731630927874,study,@ewg
4,1540366146423095296,check,@ewg


In [9]:
adj_list_with_brand_label.rename({'Target':'Label'}, axis=1, inplace=True)

# Append Community Labels and summarize

## Append labels

In [10]:
adj_list_with_brand_label = adj_list_with_brand_label.merge(community_appended_adj_list, how = 'left', on=['id','Label'])

In [11]:
adj_list_with_brand_label

Unnamed: 0,id,Label,brand,community
0,1540683731630927874,increase,@ewg,0
1,1540683731630927874,new,@ewg,0
2,1540683731630927874,reduce,@ewg,0
3,1540683731630927874,study,@ewg,0
4,1540366146423095296,check,@ewg,1
...,...,...,...,...
85083,1397962840724496386,human,@NRDC,0
85084,1397962840724496386,learn,@NRDC,0
85085,1397962839894085633,000,@NRDC,0
85086,1397962839894085633,new,@NRDC,0


## summary table of brands, modularity class, and terms

### Get rid of the outlier segment

In [12]:
adj_list_with_brand_label.rename({'community':'modularity_class'},axis='columns',inplace=True)

In [13]:
adj_list_with_brand_label.modularity_class.value_counts(normalize=True)

3    0.191484
0    0.179332
6    0.166910
2    0.162737
4    0.103622
1    0.087086
5    0.074006
7    0.034823
Name: modularity_class, dtype: float64

In [14]:
segments_to_include = adj_list_with_brand_label.modularity_class.value_counts(normalize=True)
segments_to_include = segments_to_include[segments_to_include.values > 0.08].index.values
segments_to_include

array([3, 0, 6, 2, 4, 1])

In [15]:
adj_list_with_brand_label = adj_list_with_brand_label[adj_list_with_brand_label.modularity_class.isin(segments_to_include)]

In [16]:
adj_list_with_brand_label.modularity_class.value_counts()

3    16293
0    15259
6    14202
2    13847
4     8817
1     7410
Name: modularity_class, dtype: int64

In [17]:
modularity_class_sort_order = adj_list_with_brand_label.modularity_class.value_counts().index

In [18]:
adj_list_with_brand_label.brand.value_counts(normalize = True)

@foe_us            0.215105
@Earthjustice      0.211571
@NRDC              0.200968
@EnvDefenseFund    0.199504
@ewg               0.172852
Name: brand, dtype: float64

In [19]:
# same process as modularity class sort order
author_sort_order = adj_list_with_brand_label.brand.value_counts().index.values

### Summarizing top terms for each topic  

In [20]:
adj_list_with_brand_label.stb.freq(['brand', 'modularity_class', 'Label']).sort_values('brand')

Unnamed: 0,brand,modularity_class,Label,count,percent,cumulative_count,cumulative_percent
4917,@Earthjustice,0,2021,1,0.001319,75828,100.000000
1725,@Earthjustice,6,come,12,0.015825,60399,79.652635
1724,@Earthjustice,6,lease,12,0.015825,60387,79.636810
1619,@Earthjustice,0,day,13,0.017144,59127,77.975154
1618,@Earthjustice,0,fuel,13,0.017144,59114,77.958010
...,...,...,...,...,...,...,...
1761,@foe_us,6,continue,11,0.014507,60828,80.218389
1760,@foe_us,6,like,11,0.014507,60817,80.203882
1759,@foe_us,6,toxic,11,0.014507,60806,80.189376
2075,@foe_us,4,potus,9,0.011869,64100,84.533418


In [21]:
top_terms_by_brand_ModClass = adj_list_with_brand_label.stb.freq(['brand', 'modularity_class', 'Label' ]).sort_values('brand')
top_terms_by_brand_ModClass.to_csv('top_terms_by_brand_ModClass.csv', index=False)

In [22]:
top_terms_by_brand_ModClass[top_terms_by_brand_ModClass.modularity_class == 0].sort_values('count', ascending = False)[:20]

Unnamed: 0,brand,modularity_class,Label,count,percent,cumulative_count,cumulative_percent
2,@EnvDefenseFund,0,climate,493,0.650156,1585,2.090257
9,@EnvDefenseFund,0,change,265,0.349475,3889,5.128712
19,@EnvDefenseFund,0,emission,203,0.267711,6152,8.113098
30,@EnvDefenseFund,0,new,189,0.249248,8300,10.945825
32,@EnvDefenseFund,0,global,188,0.24793,8676,11.441684
37,@NRDC,0,climate,182,0.240017,9600,12.660231
38,@EnvDefenseFund,0,warming,180,0.237379,9780,12.89761
50,@Earthjustice,0,climate,147,0.19386,11720,15.456032
54,@EnvDefenseFund,0,methane,132,0.174078,12274,16.186633
56,@foe_us,0,climate,129,0.170122,12534,16.529514


In [23]:
top_terms = top_terms_per_segment(top_terms_by_brand_ModClass, 'modularity_class', 25)
top_terms = top_terms.reindex(columns=modularity_class_sort_order)
top_terms

Unnamed: 0,3,0,6,2,4,1
0,toxic,climate,fuel,oil,community,climate
1,pfas,change,clean,big,environmental,year
2,food,emission,fossil,amp,nrdc,river
3,safe,new,energy,biden,climate,change
4,foreverchemical,global,pollution,climate,justice,000
5,chemical,warming,climate,land,work,high
6,product,methane,community,administration,read,amp
7,exposure,gas,air,public,amp,specie
8,vote,world,electric,potus,fight,time
9,lead,carbon,industry,gas,learn,large


### Topic names

In [24]:
communities = top_terms.columns
modclass_names = ['pollution','climate change','clean energy','drilling rights','community outreach','natural habits']

name_dict = dict(zip(communities, modclass_names))
name_dict

{3: 'pollution',
 0: 'climate change',
 6: 'clean energy',
 2: 'drilling rights',
 4: 'community outreach',
 1: 'natural habits'}

In [25]:
top_terms.columns = modclass_names
top_terms.to_csv('top_terms_by_modclass.csv', index = False)

In [26]:
top_terms

Unnamed: 0,pollution,climate change,clean energy,drilling rights,community outreach,natural habits
0,toxic,climate,fuel,oil,community,climate
1,pfas,change,clean,big,environmental,year
2,food,emission,fossil,amp,nrdc,river
3,safe,new,energy,biden,climate,change
4,foreverchemical,global,pollution,climate,justice,000
5,chemical,warming,climate,land,work,high
6,product,methane,community,administration,read,amp
7,exposure,gas,air,public,amp,specie
8,vote,world,electric,potus,fight,time
9,lead,carbon,industry,gas,learn,large


In [27]:
adj_list_with_brand_label['topic'] = [name_dict.get(n, n) for n in adj_list_with_brand_label.modularity_class]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adj_list_with_brand_label['topic'] = [name_dict.get(n, n) for n in adj_list_with_brand_label.modularity_class]


In [28]:
adj_list_with_brand_label.topic.value_counts()

pollution             16293
climate change        15259
clean energy          14202
drilling rights       13847
community outreach     8817
natural habits         7410
Name: topic, dtype: int64

In [29]:
adj_list_with_brand_label

Unnamed: 0,id,Label,brand,modularity_class,topic
0,1540683731630927874,increase,@ewg,0,climate change
1,1540683731630927874,new,@ewg,0,climate change
2,1540683731630927874,reduce,@ewg,0,climate change
3,1540683731630927874,study,@ewg,0,climate change
4,1540366146423095296,check,@ewg,1,natural habits
...,...,...,...,...,...
85083,1397962840724496386,human,@NRDC,0,climate change
85084,1397962840724496386,learn,@NRDC,0,climate change
85085,1397962839894085633,000,@NRDC,0,climate change
85086,1397962839894085633,new,@NRDC,0,climate change


## Exporting files with top terms appended

In [30]:
adj_list_with_brand_label.to_csv('adj_list_with_brand_label.csv', index = False)

In [31]:
data.id = data.id.astype('str')
adj_list_with_brand_label.id = adj_list_with_brand_label.id.astype('str')

data_with_modclass = data.merge(adj_list_with_brand_label[['id', 'topic']], on = 'id', how = 'left')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [32]:
data.shape

(13619, 7)

In [33]:
data_with_modclass.drop_duplicates(subset='id', inplace=True)

In [34]:
data_with_modclass.shape

(13619, 8)

In [35]:
data_with_modclass.head()

Unnamed: 0,id,created_at,text,brand,polarity_score,subjectivity_score,processed_text,topic
0,1540683731630927874,2022-06-25 13:10:00+00:00,A new study in @nature shows that increasing t...,@ewg,0.136364,0.454545,new study @nature increase sustainability scho...,climate change
4,1540366146423095296,2022-06-24 16:08:02+00:00,"No matter how you celebrate, we're wishing you...",@ewg,1.0,1.0,matter celebrate wish wonderful independence d...,natural habits
8,1540089567218470918,2022-06-23 21:49:00+00:00,Farmers and ranchers can take important steps ...,@ewg,0.6,0.575,farmer rancher important step lower emission h...,climate change
18,1540063656586756100,2022-06-23 20:06:03+00:00,A handful of recent FDA decisions allowed seve...,@ewg,0.2,0.33,handful recent fda decision allow type phthala...,pollution
22,1540047792634224642,2022-06-23 19:03:01+00:00,Provisions to tackle #PFAS are included in the...,@ewg,0.1375,0.4875,provision tackle pfa include ndaa fy 2023 repr...,


In [36]:
data_with_modclass.topic.value_counts(normalize=True)

pollution             0.210047
climate change        0.204938
clean energy          0.166454
drilling rights       0.158025
community outreach    0.134440
natural habits        0.126096
Name: topic, dtype: float64

In [37]:
topic_sort_order = data_with_modclass.topic.value_counts().index.values

In [38]:
data_with_modclass.to_csv('data_with_topic_added.csv', index = False)

### Summary of the Topics (e.g. ModClass) by Brand

In [39]:
adj_list_with_brand_label.pivot_table(index = 'topic', columns = 'brand', aggfunc='size')

brand,@Earthjustice,@EnvDefenseFund,@NRDC,@ewg,@foe_us
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
clean energy,4009,3572,3485,654,2482
climate change,2046,6083,2747,1406,2977
community outreach,2070,1347,2913,818,1669
drilling rights,4111,1400,2122,453,5761
natural habits,1683,1758,1242,497,2230
pollution,2124,968,2730,9279,1192


In [40]:
adj_list_with_brand_label.pivot_table(index = 'topic', columns = 'brand', aggfunc='size').reindex(topic_sort_order, columns=author_sort_order).to_csv('author mentions by topic.csv', index = True)

In [41]:
top_terms_per_segment(top_terms_by_brand_ModClass, 'brand', 25)

Unnamed: 0,@Earthjustice,@EnvDefenseFund,@NRDC,@ewg,@foe_us
0,biden,climate,climate,toxic,oil
1,fuel,change,clean,pfas,big
2,oil,emission,energy,food,amp
3,clean,clean,community,safe,biden
4,administration,new,water,foreverchemical,climate
5,fossil,global,environmental,chemical,land
6,pollution,warming,nrdc,product,fuel
7,energy,energy,fuel,exposure,fossil
8,climate,methane,justice,vote,public
9,federal,gas,fossil,lead,potus


In [42]:
top_terms_per_segment(top_terms_by_brand_ModClass, 'brand', 25).reindex(columns=author_sort_order).to_csv('top terms by author.csv', index=True)