# Init

In [1]:
import pandas as pd
import numpy as np
import sidetable 
import os

np.random.seed(67)

In [2]:
def top_terms_per_segment(summary_table, segment, top_n):
    list_of_segments = np.unique(summary_table[segment])
    top_terms = []
    
    for group in list_of_segments:
        filtered_table = summary_table[summary_table[segment] == group].sort_values('count', ascending = False)
        most_frequent_terms = pd.Series(filtered_table.Label.values)
        most_frequent_terms.drop_duplicates(inplace = True)
        #print(most_frequent_terms[0:top_n])
        most_frequent_terms = most_frequent_terms[0:top_n] 
        most_frequent_terms.reset_index(inplace=True, drop=True)
        top_terms.append(most_frequent_terms)
        
    table_out = pd.DataFrame(top_terms).transpose()
    table_out.columns = list_of_segments
    
    return table_out


## Load files 

In [3]:
data = pd.read_csv('data_backup.csv')

In [49]:
community_appended_adj_list = pd.read_csv('adjacency list export with modularity class.csv')

In [50]:
community_appended_adj_list.rename({'source':'id','Target':'Label'}, axis=1, inplace=True)

In [51]:
community_appended_adj_list.head()

Unnamed: 0,id,Label,community
0,1534282318311051264,change,0
1,1534282318311051264,climate,0
2,1534282318311051264,far,0
3,1534282318311051264,global,0
4,1534282318311051264,impact,0


In [52]:
adj_list_with_brand_label = pd.read_csv('adjacency_list_for_merging.csv')

In [53]:
adj_list_with_brand_label.head()

Unnamed: 0,id,Target,brand
0,1534282318311051264,change,@SierraClub
1,1534282318311051264,climate,@SierraClub
2,1534282318311051264,far,@SierraClub
3,1534282318311051264,global,@SierraClub
4,1534282318311051264,impact,@SierraClub


In [55]:
adj_list_with_brand_label.rename({'Target':'Label'}, axis=1, inplace=True)

# Append Community Labels and summarize

## Append labels

In [56]:
adj_list_with_brand_label = adj_list_with_brand_label.merge(community_appended_adj_list, how = 'left', on=['id','Label'])

In [57]:
adj_list_with_brand_label

Unnamed: 0,id,Label,brand,community
0,1534282318311051264,change,@SierraClub,0
1,1534282318311051264,climate,@SierraClub,0
2,1534282318311051264,far,@SierraClub,0
3,1534282318311051264,global,@SierraClub,0
4,1534282318311051264,impact,@SierraClub,0
...,...,...,...,...
153065,1395052880063762438,good,@NRDC,3
153066,1395052880063762438,help,@NRDC,3
153067,1395052880063762438,infrastructure,@NRDC,3
153068,1395052880063762438,job,@NRDC,3


## summary table of brands, modularity class, and terms

### Get rid of the outlier segment

In [9]:
adj_list_with_brand_label.modularity_class.value_counts(normalize=True)

7.0    0.219655
8.0    0.149447
0.0    0.131031
1.0    0.130235
5.0    0.123152
3.0    0.121116
2.0    0.088004
6.0    0.035060
4.0    0.002302
Name: modularity_class, dtype: float64

In [10]:
segments_to_include = adj_list_with_brand_label.modularity_class.value_counts(normalize=True)
segments_to_include = segments_to_include[segments_to_include.values > 0.08].index.values
segments_to_include

array([7., 8., 0., 1., 5., 3., 2.])

In [11]:
adj_list_with_brand_label = adj_list_with_brand_label[adj_list_with_brand_label.modularity_class.isin(segments_to_include)]

In [12]:
adj_list_with_brand_label.modularity_class.value_counts()

7.0    2481
8.0    1688
0.0    1480
1.0    1471
5.0    1391
3.0    1368
2.0     994
Name: modularity_class, dtype: int64

In [13]:
adj_list_with_brand_label.brand.value_counts(normalize = True)

@Refugees           0.167663
@WCKitchen          0.163984
@UNICEF             0.163432
@ICRC               0.108894
@RedCross           0.106042
@SavetheChildren    0.079187
@GlobalGiving       0.061437
@MSF                0.060241
@RESCUEorg          0.047917
@UNHumanRights      0.041203
Name: brand, dtype: float64

### Summarizing top terms for each topic  

In [14]:
adj_list_with_brand_label.stb.freq(['brand', 'modularity_class', 'Label']).sort_values('brand')

Unnamed: 0,brand,modularity_class,Label,count,percent,cumulative_count,cumulative_percent
2648,@GlobalGiving,0.0,15,1,0.009197,10873,100.000000
2540,@GlobalGiving,8.0,response,1,0.009197,10765,99.006714
2539,@GlobalGiving,8.0,romania,1,0.009197,10764,98.997517
2538,@GlobalGiving,8.0,russian,1,0.009197,10763,98.988320
2537,@GlobalGiving,8.0,shelter,1,0.009197,10762,98.979123
...,...,...,...,...,...,...,...
1632,@WCKitchen,5.0,russia,1,0.009197,9857,90.655753
1631,@WCKitchen,5.0,thing,1,0.009197,9856,90.646556
1630,@WCKitchen,5.0,use,1,0.009197,9855,90.637359
1628,@WCKitchen,7.0,16,1,0.009197,9853,90.618964


In [15]:
top_terms_by_brand_ModClass = adj_list_with_brand_label.stb.freq(['brand', 'modularity_class', 'Label' ]).sort_values('brand')
top_terms_by_brand_ModClass.to_csv('top_terms_by_brand_ModClass.csv', index=False)

In [16]:
top_terms_by_brand_ModClass[top_terms_by_brand_ModClass.modularity_class == 0].sort_values('count', ascending = False)[:20]

Unnamed: 0,brand,modularity_class,Label,count,percent,cumulative_count,cumulative_percent
7,@Refugees,0.0,refugee,71,0.652994,1020,9.381036
26,@Refugees,0.0,unhcr,33,0.303504,1853,17.042215
74,@UNICEF,0.0,19,19,0.174745,3049,28.041939
73,@UNICEF,0.0,covid,19,0.174745,3030,27.867194
98,@UNICEF,0.0,foreverychild,16,0.147153,3469,31.904718
109,@Refugees,0.0,world,15,0.137956,3637,33.44983
137,@Refugees,0.0,high,13,0.119562,4024,37.009105
135,@UNICEF,0.0,climate,13,0.119562,3998,36.769981
146,@UNICEF,0.0,school,12,0.110365,4136,38.03918
145,@UNICEF,0.0,world,12,0.110365,4124,37.928814


In [17]:
top_terms = top_terms_per_segment(top_terms_by_brand_ModClass, 'modularity_class', 25)
top_terms

Unnamed: 0,0.0,1.0,2.0,3.0,5.0,7.0,8.0
0,refugee,thank,need,red,civilian,child,wck
1,unhcr,support,home,cross,war,ukraine,people
2,19,chefjoseandre,relief,learn,rule,flee,meal
3,covid,sambloch1,life,blood,humanitarian,unicef,chefsforukraine
4,foreverychild,natemook,health,hi,right,conflict,amp
5,world,grateful,displace,team,human,year,serve
6,high,effort,crisis,food,protect,family,irc
7,climate,appreciate,fund,provide,mbachelet,old,restaurant
8,school,continue,unicefchief,donation,safe,help,fresh
9,commissioner,ongoing,worker,international,target,country,hot


### Topic names

In [18]:
communities = top_terms.columns
modclass_names = ['refugee_assistance','solidarity','humanitarian','medical','military','family','meals']

name_dict = dict(zip(communities, modclass_names))
name_dict

{0.0: 'refugee_assistance',
 1.0: 'solidarity',
 2.0: 'humanitarian',
 3.0: 'medical',
 5.0: 'military',
 7.0: 'family',
 8.0: 'meals'}

In [19]:
top_terms.columns = modclass_names
top_terms.to_csv('top_terms_by_modclass.csv', index = False)

In [20]:
top_terms

Unnamed: 0,refugee_assistance,solidarity,humanitarian,medical,military,family,meals
0,refugee,thank,need,red,civilian,child,wck
1,unhcr,support,home,cross,war,ukraine,people
2,19,chefjoseandre,relief,learn,rule,flee,meal
3,covid,sambloch1,life,blood,humanitarian,unicef,chefsforukraine
4,foreverychild,natemook,health,hi,right,conflict,amp
5,world,grateful,displace,team,human,year,serve
6,high,effort,crisis,food,protect,family,irc
7,climate,appreciate,fund,provide,mbachelet,old,restaurant
8,school,continue,unicefchief,donation,safe,help,fresh
9,commissioner,ongoing,worker,international,target,country,hot


In [21]:
adj_list_with_brand_label['topic'] = [name_dict.get(n, n) for n in adj_list_with_brand_label.modularity_class]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adj_list_with_community_label['topic'] = [name_dict.get(n, n) for n in adj_list_with_community_label.modularity_class]


In [22]:
adj_list_with_brand_label.topic.value_counts()

family                2481
meals                 1688
refugee_assistance    1480
solidarity            1471
military              1391
medical               1368
humanitarian           994
Name: topic, dtype: int64

In [23]:
adj_list_with_brand_label

Unnamed: 0,id,Label,brand,Id,timeset,modularity_class,topic
0,1500215577817886725,20,@GlobalGiving,20,,0.0,refugee_assistance
1,1500215577817886725,critical,@GlobalGiving,critical,,2.0,humanitarian
3,1500215577817886725,emergency,@GlobalGiving,emergency,,3.0,medical
4,1500215577817886725,hungary,@GlobalGiving,hungary,,8.0,meals
5,1500215577817886725,learn,@GlobalGiving,learn,,3.0,medical
...,...,...,...,...,...,...,...
11309,1493639941867200514,refugee,@WCKitchen,refugee,,0.0,refugee_assistance
11310,1493639941867200514,relief,@WCKitchen,relief,,2.0,humanitarian
11311,1493639941867200514,team,@WCKitchen,team,,3.0,medical
11312,1493639941867200514,travel,@WCKitchen,travel,,7.0,family


## Exporting files with top terms appended

In [24]:
adj_list_with_brand_label.to_csv('adj_list_with_brand_label.csv', index = False)

In [25]:
data.id = data.id.astype('str')
adj_list_with_brand_label.id = adj_list_with_brand_label.id.astype('str')

data_with_modclass = data.merge(adj_list_with_brand_label[['id', 'topic']], on = 'id', how = 'left')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [26]:
data.shape

(1338, 7)

In [27]:
data_with_modclass.drop_duplicates(subset='id', inplace=True)

In [28]:
data_with_modclass.shape

(1338, 8)

In [29]:
data_with_modclass.head()

Unnamed: 0,id,created_at,text,brand,polarity_score,subjectivity_score,processed_text,topic
0,1500215577817886725,2022-03-05 21:04:01+00:00,UPDATE: Tabletochki Charity Foundation is one ...,@GlobalGiving,0.34375,0.5125,update tabletochki charity foundation 20 nonpr...,refugee_assistance
13,1499909560030662661,2022-03-05 00:48:00+00:00,The link between #war and #hunger is clear—yet...,@GlobalGiving,0.0625,0.5,link war hunger clear — overlook ukraine russi...,military
23,1499834444349386758,2022-03-04 19:49:31+00:00,@ItsFangs Thank you so much for showing your s...,@GlobalGiving,0.075,0.325,@itsfang thank support help community ukraine ...,humanitarian
35,1499833815874818052,2022-03-04 19:47:01+00:00,@Kellyrei007 Thank you so much for showing you...,@GlobalGiving,0.25,0.2,@kellyrei007 thank support standwithukraine,solidarity
38,1499833431584350216,2022-03-04 19:45:30+00:00,@jerryg125 Thank you for sharing! 🙌 🙌 #StandWi...,@GlobalGiving,0.0,0.0,@jerryg125 thank share 🙌 🙌 standwithukraine,solidarity


In [30]:
data_with_modclass.topic.value_counts(normalize=True)

solidarity            0.292998
family                0.159817
meals                 0.143075
medical               0.110350
refugee_assistance    0.108828
humanitarian          0.098174
military              0.086758
Name: topic, dtype: float64

In [31]:
data_with_modclass.to_csv('data_with_topic_added.csv', index = False)

### Summary of the Topics (e.g. ModClass) by Brand

In [32]:
adj_list_with_brand_label.pivot_table(index = 'topic', columns = 'brand', aggfunc='size')

brand,@GlobalGiving,@ICRC,@MSF,@RESCUEorg,@RedCross,@Refugees,@SavetheChildren,@UNHumanRights,@UNICEF,@WCKitchen
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
family,82,243,106,124,165,478,343,56,758,126
humanitarian,117,130,96,55,67,165,70,26,214,54
meals,78,142,128,95,115,295,76,50,148,561
medical,75,134,148,44,565,160,59,23,88,72
military,38,424,105,91,117,156,89,197,147,27
refugee_assistance,97,78,61,79,57,484,152,66,345,61
solidarity,181,33,11,33,67,85,72,30,77,882


In [33]:
adj_list_with_brand_label.pivot_table(index = 'topic', columns = 'brand', aggfunc='size').to_csv('author mentions by topic.csv', index = True)

In [34]:
top_terms_per_segment(top_terms_by_brand_ModClass, 'brand', 25)

Unnamed: 0,@GlobalGiving,@ICRC,@MSF,@RESCUEorg,@RedCross,@Refugees,@SavetheChildren,@UNHumanRights,@UNICEF,@WCKitchen
0,thank,ukraine,ukraine,ukraine,red,ukraine,child,support,child,thank
1,support,people,team,thank,cross,refugee,ukraine,right,ukraine,support
2,ukraine,civilian,people,irc,ukraine,people,support,human,unicef,chefjoseandre
3,generosity,war,emergency,humanitarian,learn,flee,family,amp,support,wck
4,relief,conflict,msf,crisis,blood,unhcr,conflict,mbachelet,conflict,sambloch1
5,fund,rule,medical,need,help,help,kid,ukraine,need,meal
6,standwithukraine,humanitarian,need,people,hi,support,thank,civilian,year,grateful
7,people,help,response,support,team,country,help,rights,family,natemook
8,share,need,conflict,help,thank,force,learn,people,old,chefsforukraine
9,partner,aid,access,refugeeswelcome,food,million,education,action,help,effort


In [35]:
top_terms_per_segment(top_terms_by_brand_ModClass, 'brand', 25).to_csv('top terms by author.csv', index=True)