# Init

In [1]:
import pandas as pd
import numpy as np
import sidetable 
import os

np.random.seed(67)

In [2]:
def top_terms_per_segment(summary_table, segment, top_n):
    list_of_segments = np.unique(summary_table[segment])
    top_terms = []
    
    for group in list_of_segments:
        filtered_table = summary_table[summary_table[segment] == group].sort_values('count', ascending = False)
        most_frequent_terms = pd.Series(filtered_table.Label.values)
        most_frequent_terms.drop_duplicates(inplace = True)
        #print(most_frequent_terms[0:top_n])
        most_frequent_terms = most_frequent_terms[0:top_n] 
        most_frequent_terms.reset_index(inplace=True, drop=True)
        top_terms.append(most_frequent_terms)
        
    table_out = pd.DataFrame(top_terms).transpose()
    table_out.columns = list_of_segments
    
    return table_out


## Load files 

In [3]:
data = pd.read_csv('data_backup.csv')

In [4]:
community_appended_adj_list = pd.read_csv('adjacency list export with modularity class.csv')

In [5]:
community_appended_adj_list.rename({'source':'id','Target':'Label'}, axis=1, inplace=True)

In [6]:
community_appended_adj_list.head()

Unnamed: 0,id,Label,community
0,1534282318311051264,change,0
1,1534282318311051264,climate,0
2,1534282318311051264,far,0
3,1534282318311051264,global,0
4,1534282318311051264,impact,0


In [7]:
adj_list_with_brand_label = pd.read_csv('adjacency_list_for_merging.csv')

In [8]:
adj_list_with_brand_label.head()

Unnamed: 0,id,Target,brand
0,1534282318311051264,change,@SierraClub
1,1534282318311051264,climate,@SierraClub
2,1534282318311051264,far,@SierraClub
3,1534282318311051264,global,@SierraClub
4,1534282318311051264,impact,@SierraClub


In [9]:
adj_list_with_brand_label.rename({'Target':'Label'}, axis=1, inplace=True)

# Append Community Labels and summarize

## Append labels

In [10]:
adj_list_with_brand_label = adj_list_with_brand_label.merge(community_appended_adj_list, how = 'left', on=['id','Label'])

In [11]:
adj_list_with_brand_label

Unnamed: 0,id,Label,brand,community
0,1534282318311051264,change,@SierraClub,0
1,1534282318311051264,climate,@SierraClub,0
2,1534282318311051264,far,@SierraClub,0
3,1534282318311051264,global,@SierraClub,0
4,1534282318311051264,impact,@SierraClub,0
...,...,...,...,...
153065,1395052880063762438,good,@NRDC,3
153066,1395052880063762438,help,@NRDC,3
153067,1395052880063762438,infrastructure,@NRDC,3
153068,1395052880063762438,job,@NRDC,3


## summary table of brands, modularity class, and terms

### Get rid of the outlier segment

In [12]:
adj_list_with_brand_label.rename({'community':'modularity_class'},axis='columns',inplace=True)

In [13]:
adj_list_with_brand_label.modularity_class.value_counts(normalize=True)

0     0.168701
6     0.153152
9     0.139511
1     0.126197
4     0.110910
3     0.080270
7     0.066349
8     0.059724
10    0.046475
2     0.025916
5     0.022793
Name: modularity_class, dtype: float64

In [14]:
segments_to_include = adj_list_with_brand_label.modularity_class.value_counts(normalize=True)
segments_to_include = segments_to_include[segments_to_include.values > 0.08].index.values
segments_to_include

array([0, 6, 9, 1, 4, 3])

In [15]:
adj_list_with_brand_label = adj_list_with_brand_label[adj_list_with_brand_label.modularity_class.isin(segments_to_include)]

In [16]:
adj_list_with_brand_label.modularity_class.value_counts()

0    25823
6    23443
9    21355
1    19317
4    16977
3    12287
Name: modularity_class, dtype: int64

In [17]:
adj_list_with_brand_label.brand.value_counts(normalize = True)

@Earthjustice      0.138412
@foe_us            0.137431
@NRDC              0.127775
@ewg               0.117590
@greenpeaceusa     0.098237
@SierraClub        0.093883
@UCSUSA            0.087230
@OurOcean          0.084428
@RnfrstAlliance    0.062558
@earthisland       0.052457
Name: brand, dtype: float64

### Summarizing top terms for each topic  

In [18]:
adj_list_with_brand_label.stb.freq(['brand', 'modularity_class', 'Label']).sort_values('brand')

Unnamed: 0,brand,modularity_class,Label,count,percent,cumulative_count,cumulative_percent
12411,@Earthjustice,0,50,1,0.000839,119202,100.000000
3562,@Earthjustice,3,job,9,0.007550,90191,75.662321
3561,@Earthjustice,3,new,9,0.007550,90182,75.654771
3560,@Earthjustice,3,tackle,9,0.007550,90173,75.647221
3559,@Earthjustice,4,country,9,0.007550,90164,75.639670
...,...,...,...,...,...,...,...
3591,@greenpeaceusa,6,trump,8,0.006711,90442,75.872888
3592,@greenpeaceusa,6,report,8,0.006711,90450,75.879599
3593,@greenpeaceusa,6,learn,8,0.006711,90458,75.886311
3585,@greenpeaceusa,9,today,8,0.006711,90394,75.832620


In [19]:
top_terms_by_brand_ModClass = adj_list_with_brand_label.stb.freq(['brand', 'modularity_class', 'Label' ]).sort_values('brand')
top_terms_by_brand_ModClass.to_csv('top_terms_by_brand_ModClass.csv', index=False)

In [20]:
top_terms_by_brand_ModClass[top_terms_by_brand_ModClass.modularity_class == 0].sort_values('count', ascending = False)[:20]

Unnamed: 0,brand,modularity_class,Label,count,percent,cumulative_count,cumulative_percent
12,@greenpeaceusa,0,fuel,300,0.251674,4908,4.117381
13,@greenpeaceusa,0,climate,294,0.24664,5202,4.364021
15,@greenpeaceusa,0,fossil,285,0.23909,5774,4.843878
16,@Earthjustice,0,climate,278,0.233218,6052,5.077096
20,@NRDC,0,climate,256,0.214761,7087,5.94537
21,@foe_us,0,climate,232,0.194628,7319,6.139998
31,@foe_us,0,fuel,195,0.163588,9461,7.936947
32,@foe_us,0,fossil,191,0.160232,9652,8.09718
34,@SierraClub,0,climate,190,0.159393,10033,8.416805
37,@Earthjustice,0,fuel,186,0.156038,10595,8.888274


In [21]:
top_terms = top_terms_per_segment(top_terms_by_brand_ModClass, 'modularity_class', 25)
top_terms

Unnamed: 0,0,1,3,4,6,9
0,fuel,ocean,climate,earth,oil,toxic
1,climate,sea,act,island,amp,pfas
2,fossil,forest,pass,ocean,biden,food
3,energy,rainforest,justice,friend,gas,foreverchemical
4,clean,know,job,ucs,big,safe
5,crisis,world,congress,environmental,climate,chemical
6,need,people,amp,learn,administration,product
7,action,nature,investment,today,land,exposure
8,amp,work,community,week,public,vote
9,time,community,infrastructure,look,million,lead


### Topic names

In [22]:
communities = top_terms.columns
modclass_names = ['fossil_fuels','natural_habitats','climate_legislation','earth_community','biden_administration','pollution']

name_dict = dict(zip(communities, modclass_names))
name_dict

{0: 'fossil_fuels',
 1: 'natural_habitats',
 3: 'climate_legislation',
 4: 'earth_community',
 6: 'biden_administration',
 9: 'pollution'}

In [23]:
top_terms.columns = modclass_names
top_terms.to_csv('top_terms_by_modclass.csv', index = False)

In [24]:
top_terms

Unnamed: 0,fossil_fuels,natural_habitats,climate_legislation,earth_community,biden_administration,pollution
0,fuel,ocean,climate,earth,oil,toxic
1,climate,sea,act,island,amp,pfas
2,fossil,forest,pass,ocean,biden,food
3,energy,rainforest,justice,friend,gas,foreverchemical
4,clean,know,job,ucs,big,safe
5,crisis,world,congress,environmental,climate,chemical
6,need,people,amp,learn,administration,product
7,action,nature,investment,today,land,exposure
8,amp,work,community,week,public,vote
9,time,community,infrastructure,look,million,lead


In [25]:
adj_list_with_brand_label['topic'] = [name_dict.get(n, n) for n in adj_list_with_brand_label.modularity_class]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adj_list_with_brand_label['topic'] = [name_dict.get(n, n) for n in adj_list_with_brand_label.modularity_class]


In [26]:
adj_list_with_brand_label.topic.value_counts()

fossil_fuels            25823
biden_administration    23443
pollution               21355
natural_habitats        19317
earth_community         16977
climate_legislation     12287
Name: topic, dtype: int64

In [27]:
adj_list_with_brand_label

Unnamed: 0,id,Label,brand,modularity_class,topic
0,1534282318311051264,change,@SierraClub,0,fossil_fuels
1,1534282318311051264,climate,@SierraClub,0,fossil_fuels
2,1534282318311051264,far,@SierraClub,0,fossil_fuels
3,1534282318311051264,global,@SierraClub,0,fossil_fuels
4,1534282318311051264,impact,@SierraClub,0,fossil_fuels
...,...,...,...,...,...
153065,1395052880063762438,good,@NRDC,3,climate_legislation
153066,1395052880063762438,help,@NRDC,3,climate_legislation
153067,1395052880063762438,infrastructure,@NRDC,3,climate_legislation
153068,1395052880063762438,job,@NRDC,3,climate_legislation


## Exporting files with top terms appended

In [28]:
adj_list_with_brand_label.to_csv('adj_list_with_brand_label.csv', index = False)

In [29]:
data.id = data.id.astype('str')
adj_list_with_brand_label.id = adj_list_with_brand_label.id.astype('str')

data_with_modclass = data.merge(adj_list_with_brand_label[['id', 'topic']], on = 'id', how = 'left')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [30]:
data.shape

(23817, 7)

In [31]:
data_with_modclass.drop_duplicates(subset='id', inplace=True)

In [32]:
data_with_modclass.shape

(23817, 8)

In [33]:
data_with_modclass.head()

Unnamed: 0,id,created_at,text,brand,polarity_score,subjectivity_score,processed_text,topic
0,1534282318311051264,2022-06-07 21:13:04+00:00,Global Impact's a podcast about amplifying voi...,@SierraClub,0.025,0.25,global impact podcast amplify voice global gra...,fossil_fuels
6,1534252609468518402,2022-06-07 19:15:01+00:00,"Important piece from @VFWHQ 👇👇👇\n\n""Nature-bas...",@SierraClub,0.257143,0.657143,important piece @vfwhq 👇 👇 👇 nature base progr...,
7,1534246717301874688,2022-06-07 18:51:36+00:00,"Paid for by Sierra Club Independent Action, ht...",@SierraClub,0.05,0.1125,pay sierra club independent action authorize c...,fossil_fuels
8,1534218707739746304,2022-06-07 17:00:18+00:00,The attack on our democracy didn’t end on Janu...,@SierraClub,-0.3,0.4,attack democracy end january 6 2021 voter supp...,pollution
14,1534193123110289410,2022-06-07 15:18:39+00:00,"The communities in and around El Paso, TX have...",@SierraClub,0.166667,0.333333,community el paso tx work conserve castner ran...,natural_habitats


In [34]:
data_with_modclass.topic.value_counts(normalize=True)

natural_habitats        0.206362
earth_community         0.186769
fossil_fuels            0.182043
pollution               0.173227
biden_administration    0.164755
climate_legislation     0.086844
Name: topic, dtype: float64

In [35]:
data_with_modclass.to_csv('data_with_topic_added.csv', index = False)

### Summary of the Topics (e.g. ModClass) by Brand

In [36]:
adj_list_with_brand_label.pivot_table(index = 'topic', columns = 'brand', aggfunc='size')

brand,@Earthjustice,@NRDC,@OurOcean,@RnfrstAlliance,@SierraClub,@UCSUSA,@earthisland,@ewg,@foe_us,@greenpeaceusa
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
biden_administration,5284,2320,816,315,2173,1638,746,681,7266,2204
climate_legislation,1128,2717,1108,583,2215,2266,186,282,882,920
earth_community,1251,1721,2955,1255,1251,1885,3436,719,1002,1502
fossil_fuels,4113,4039,1406,989,3344,2546,659,840,3199,4688
natural_habitats,2493,2189,3322,3442,1311,878,830,991,2082,1779
pollution,2230,2245,457,873,897,1185,396,10504,1951,617


In [37]:
adj_list_with_brand_label.pivot_table(index = 'topic', columns = 'brand', aggfunc='size').to_csv('author mentions by topic.csv', index = True)

In [38]:
top_terms_per_segment(top_terms_by_brand_ModClass, 'brand', 25)

Unnamed: 0,@Earthjustice,@NRDC,@OurOcean,@RnfrstAlliance,@SierraClub,@UCSUSA,@earthisland,@ewg,@foe_us,@greenpeaceusa
0,climate,climate,ocean,forest,climate,climate,earth,toxic,oil,fuel
1,oil,energy,sea,rainforest,energy,ucs,island,pfas,amp,climate
2,biden,clean,friend,world,clean,act,environmental,food,biden,fossil
3,administration,need,know,people,amp,energy,week,foreverchemical,gas,oil
4,fuel,fuel,climate,nature,fuel,change,join,safe,climate,need
5,gas,action,learn,climate,fossil,pass,new,chemical,big,action
6,fossil,time,today,work,justice,congress,plastic,product,fuel,energy
7,energy,pass,look,community,gas,need,climate,exposure,fossil,change
8,clean,fossil,marine,tree,job,clean,issue,vote,land,crisis
9,crisis,future,day,sustainable,pass,biden,work,lead,public,time


In [39]:
top_terms_per_segment(top_terms_by_brand_ModClass, 'brand', 25).to_csv('top terms by author.csv', index=True)