# Init: Load Libraries and Functions

In [1]:
from collections import defaultdict 
import pandas as pd
import numpy as np
import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import string
from textblob import TextBlob  
import os
import re


#random seed for reproducibility
np.random.seed(67)

In [2]:
def polarity_scorer(input_text):
    """This function operates on a column in a data frame using apply().
    Takes a column as an input and returns a tuple of the polarity score and subjectivity score
    use .tolist() to split into separate columns, like here: https://stackoverflow.com/questions/29550414/how-to-split-column-of-tuples-in-pandas-dataframe"""
    text = TextBlob(input_text)
    polarity_score = text.sentiment.polarity
    subjectivity_score = text.sentiment.subjectivity
    
    return polarity_score, subjectivity_score

In [3]:
nlp = spacy.load('en_core_web_lg')
punctuations = string.punctuation #this is a python module which contains all the punctuations characters in English (and probably other languages too)
stopwords = list(STOP_WORDS)

def spacy_tokenizer(input_text):
    """removes stop words and punctuation from a document, converts all tokens to lower case
    and combines all tokens into one string.
    used in this example it appends a new column to a dataframe through apply()"""
    processed_text = re.sub(r"http\S+", '', input_text) # remove URLS, https://stackoverflow.com/questions/24399820/expression-to-remove-url-links-from-twitter-tweet
    mytokens = nlp(processed_text)
    mytokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [word for word in mytokens if word not in stopwords and word not in punctuations ]
    mytokens = " ".join([i for i in mytokens])
    return mytokens

In [4]:
# removing stopwords, it's this easy: https://medium.com/@makcedward/nlp-pipeline-stop-words-part-5-d6770df8a936

def token_parser(input_text):
    """This function creates a table with the text token and parts of a speech for a piece of text
    Builds one to many relationships by expanding all the tokens within a single document"""
    table = pd.DataFrame()
    doc = nlp(input_text)
    text = [token.text for token in doc if not token.is_stop]
    pos = [token.pos_ for token in doc if not token.is_stop]
    table['text'] = text
    table['pos'] = pos
    
    return table

In [5]:
def wordmatrix_to_dataframe(wordmatrix, feature_names):

    """This function takes a word matrix from SK-Learn and turns into a dataframe"""
    # create an index for each row
    doc_names = ['Doc{:d}'.format(idx) for idx, _ in enumerate(wordmatrix)]
    df = pd.DataFrame(data=wordmatrix.toarray(), #index = doc_names,
                     columns = feature_names)
    
    return df

In [6]:
def create_adjacency_list(adjacency_matrix):
    
    """This function takes a dataframe of  a dataframe"""
    table_out = pd.DataFrame(columns = ['id', 'target'])
    
    for i in range(len(adjacency_matrix)):

        filtered_table = pd.DataFrame(adjacency_matrix.iloc[i])
        filtered_table = filtered_table.transpose()
        filtered_table = filtered_table.loc[:, ~(filtered_table==0).any(axis=0)]
        
        
        #for column in filtered_table.columns:
        temp_table = pd.DataFrame(columns = ['id', 'target'])
        source = [filtered_table.index.values[0] for column in filtered_table.columns]
        target = [column for column in filtered_table.columns]
        
        temp_table['id'] = source
        temp_table['target'] = target
            
        table_out = table_out.append(temp_table)
 
    return table_out
    

# Load files and tokenize

In [7]:
path = 'exports/'
files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path,f))]


In [8]:
files

['@SierraClub_tweets.csv',
 '@greenpeaceusa_tweets.csv',
 '@ewg_tweets.csv',
 '@Earthjustice_tweets.csv',
 '@earthisland_tweets.csv',
 '@RnfrstAlliance_tweets.csv',
 '@OurOcean_tweets.csv',
 '@foe_us_tweets.csv',
 '@UCSUSA_tweets.csv',
 '@NRDC_tweets.csv']

In [9]:
text_to_drop = '_tweets.csv'
all_files = pd.DataFrame(columns = ['id', 'created_at', 'text', 'brand'])

for file in files:
    each_file = pd.read_csv(path+file)
    each_file['brand'] = file.replace(text_to_drop, '') #https://www.journaldev.com/23674/python-remove-character-from-string
    all_files = all_files.append(each_file)
    
all_files.shape

(23817, 4)

In [10]:
all_files.created_at = pd.to_datetime(all_files.created_at)

In [11]:
# all_files = all_files.loc[all_files.created_at > "2022-02-15"] use this cell if filterting by date

In [12]:
all_files.to_csv('data_backup.csv', index = False)

In [13]:
data = all_files.copy()

In [14]:
data.head()

Unnamed: 0,id,created_at,text,brand
0,1534282318311051264,2022-06-07 21:13:04+00:00,Global Impact's a podcast about amplifying voi...,@SierraClub
1,1534252609468518402,2022-06-07 19:15:01+00:00,"Important piece from @VFWHQ 👇👇👇\n\n""Nature-bas...",@SierraClub
2,1534246717301874688,2022-06-07 18:51:36+00:00,"Paid for by Sierra Club Independent Action, ht...",@SierraClub
3,1534218707739746304,2022-06-07 17:00:18+00:00,The attack on our democracy didn’t end on Janu...,@SierraClub
4,1534193123110289410,2022-06-07 15:18:39+00:00,"The communities in and around El Paso, TX have...",@SierraClub


In [15]:
data.brand.value_counts()

@foe_us            2872
@Earthjustice      2808
@OurOcean          2801
@NRDC              2677
@ewg               2511
@RnfrstAlliance    2453
@greenpeaceusa     2305
@UCSUSA            2170
@SierraClub        1844
@earthisland       1376
Name: brand, dtype: int64

## Sentiment Analysis 

In [16]:
# calls functions through apply(), returns a tuple then splits the results into 2 columns
data.text = data.text.astype('str') 

data[['polarity_score', 'subjectivity_score']] = pd.DataFrame(data.text.apply(polarity_scorer).tolist(), index = data.index)

data.head()

Unnamed: 0,id,created_at,text,brand,polarity_score,subjectivity_score
0,1534282318311051264,2022-06-07 21:13:04+00:00,Global Impact's a podcast about amplifying voi...,@SierraClub,0.025,0.25
1,1534252609468518402,2022-06-07 19:15:01+00:00,"Important piece from @VFWHQ 👇👇👇\n\n""Nature-bas...",@SierraClub,0.257143,0.657143
2,1534246717301874688,2022-06-07 18:51:36+00:00,"Paid for by Sierra Club Independent Action, ht...",@SierraClub,0.05,0.1125
3,1534218707739746304,2022-06-07 17:00:18+00:00,The attack on our democracy didn’t end on Janu...,@SierraClub,-0.3,0.4
4,1534193123110289410,2022-06-07 15:18:39+00:00,"The communities in and around El Paso, TX have...",@SierraClub,0.166667,0.333333


## Tokenizer

In [17]:
data['processed_text'] = data.text.apply(spacy_tokenizer)

In [18]:
data.head()

Unnamed: 0,id,created_at,text,brand,polarity_score,subjectivity_score,processed_text
0,1534282318311051264,2022-06-07 21:13:04+00:00,Global Impact's a podcast about amplifying voi...,@SierraClub,0.025,0.25,global impact podcast amplify voice global gra...
1,1534252609468518402,2022-06-07 19:15:01+00:00,"Important piece from @VFWHQ 👇👇👇\n\n""Nature-bas...",@SierraClub,0.257143,0.657143,important piece @vfwhq 👇 👇 👇 nature base progr...
2,1534246717301874688,2022-06-07 18:51:36+00:00,"Paid for by Sierra Club Independent Action, ht...",@SierraClub,0.05,0.1125,pay sierra club independent action authorize c...
3,1534218707739746304,2022-06-07 17:00:18+00:00,The attack on our democracy didn’t end on Janu...,@SierraClub,-0.3,0.4,attack democracy end january 6 2021 voter supp...
4,1534193123110289410,2022-06-07 15:18:39+00:00,"The communities in and around El Paso, TX have...",@SierraClub,0.166667,0.333333,community el paso tx work conserve castner ran...


In [19]:
data.to_csv('data_backup.csv', index = False)

## creating entities out of tokenized text

In [20]:
entities_table = pd.DataFrame(columns=['text', 'pos', 'id'])

for i in range(len(data)):
    tokens_table = token_parser(data.iloc[i].processed_text)  
    tokens_table['id'] = data.iloc[i].id  
    entities_table = entities_table.append(tokens_table)

entities_table = entities_table[-entities_table['pos'].str.contains('SPACE')] # removing spaces, punctuation
entities_table = entities_table[-entities_table['pos'].str.contains('PUNCT')]
entities_table.reset_index(drop = True, inplace = True)

entities_table.shape

(17949, 3)

In [21]:
entities_table.head()

Unnamed: 0,text,pos,id
0,update,VERB,1500215577817886725
1,tabletochki,PROPN,1500215577817886725
2,charity,PROPN,1500215577817886725
3,foundation,PROPN,1500215577817886725
4,20,NUM,1500215577817886725


In [22]:
entities_table.to_csv('data_entities.csv', index = False)

## Vectorizer to build matrix

In [20]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score 


In [21]:
#processed_tweets = pd.read_csv('analysis output v2.csv')
#processed_tweets.processed_text = processed_tweets.processed_text.astype('str')

In [36]:
# Creating a vectorizer
vectorizer = CountVectorizer(min_df=0.01, max_df=0.9, stop_words='english', 
                             lowercase=True, token_pattern='[a-z0-9]{2,}')  

# reference to using min and max df arguments
# https://stackoverflow.com/questions/27697766/understanding-min-df-and-max-df-in-scikit-countvectorizer

# token pattern captures any non-whitespace character, e.g. hashtags starting with numbers

data_vectorized = vectorizer.fit_transform(data["processed_text"])

In [37]:
#https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html#
    
n_components = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

for i in n_components:
    clusterer = KMeans(n_clusters=i, random_state = 37)
    cluster_labels = clusterer.fit_predict(data_vectorized)

    silhouette_avg = silhouette_score(data_vectorized, cluster_labels)
    print("For number of topics: ", i,
          "the average silhouette score is: ", silhouette_avg)
    


For number of topics:  2 the average silhouette score is:  0.10689090974783871
For number of topics:  3 the average silhouette score is:  0.09491649595794838
For number of topics:  4 the average silhouette score is:  0.08518945906743987
For number of topics:  5 the average silhouette score is:  0.09109741083628307
For number of topics:  6 the average silhouette score is:  0.05747394751436125
For number of topics:  7 the average silhouette score is:  0.0902741397796428
For number of topics:  8 the average silhouette score is:  0.05680386138054413
For number of topics:  9 the average silhouette score is:  0.07148534603886826
For number of topics:  10 the average silhouette score is:  -0.002104684080951529
For number of topics:  11 the average silhouette score is:  0.038597881108507634


# Export matrix to adjacency list

In [38]:
# calling word matrix to dataframe function
features = vectorizer.get_feature_names()
len(features)

275

In [39]:
adjacency_matrix = wordmatrix_to_dataframe(data_vectorized, features)
documents_ids = data['id']
adjacency_matrix.index = documents_ids

In [40]:
adjacency_matrix

Unnamed: 0_level_0,000,10,100,2021,50,access,act,action,activist,address,...,water,way,week,wildlife,win,woman,work,worker,world,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1534282318311051264,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1534252609468518402,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1534246717301874688,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1534218707739746304,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1534193123110289410,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1395141735274135552,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1395099463069360139,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1395085616828895234,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1395063475966185476,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
adjacency_matrix.to_csv('adjacency_matrix.csv', index=True)

In [42]:
adjacency_list = create_adjacency_list(adjacency_matrix)
adjacency_list

Unnamed: 0,id,target
0,1534282318311051264,change
1,1534282318311051264,climate
2,1534282318311051264,far
3,1534282318311051264,global
4,1534282318311051264,impact
...,...,...
6,1395052880063762438,good
7,1395052880063762438,help
8,1395052880063762438,infrastructure
9,1395052880063762438,job


In [43]:
adjacency_list.rename(columns={'id':'Source','target':'Target'}, inplace = True)

In [44]:
adjacency_list.to_csv('adjacency_list_v1.csv', index = False)

In [45]:
adjacency_list_for_merging = adjacency_list.rename(columns={'Source':'id','target':'Target'}).merge(data[['id','brand']], on = 'id')

In [46]:
adjacency_list_for_merging

Unnamed: 0,id,Target,brand
0,1534282318311051264,change,@SierraClub
1,1534282318311051264,climate,@SierraClub
2,1534282318311051264,far,@SierraClub
3,1534282318311051264,global,@SierraClub
4,1534282318311051264,impact,@SierraClub
...,...,...,...
153065,1395052880063762438,good,@NRDC
153066,1395052880063762438,help,@NRDC
153067,1395052880063762438,infrastructure,@NRDC
153068,1395052880063762438,job,@NRDC


In [47]:
adjacency_list_for_merging.to_csv('adjacency_list_for_merging.csv', index = False)

# Community Detection in NetworkX to enable Gephi Visualization

In [48]:
import networkx as nx
import community as community_louvain

## Creating network and detecting communities

In [49]:
# https://networkx.org/documentation/stable/reference/generated/networkx.convert_matrix.from_pandas_edgelist.html
graph_from_edgelist = nx.from_pandas_edgelist(adjacency_list, source = 'Source', target = 'Target')

In [50]:
# https://python-louvain.readthedocs.io/en/latest/api.html
partition = community_louvain.best_partition(graph_from_edgelist)

In [51]:
# this returns a dictionary of nodes as keys and community assignment as values
partition

{1534282318311051264: 0,
 'change': 0,
 'climate': 0,
 'far': 1,
 'global': 2,
 'impact': 7,
 'leader': 4,
 1534252609468518402: 5,
 'base': 8,
 'environment': 5,
 'important': 1,
 'nature': 1,
 'program': 8,
 'want': 4,
 1534246717301874688: 0,
 'action': 0,
 1534218707739746304: 9,
 '2021': 10,
 'end': 0,
 'law': 3,
 'risk': 9,
 'state': 7,
 'year': 10,
 1534193123110289410: 1,
 'community': 7,
 'national': 6,
 'potus': 6,
 'stand': 1,
 'work': 4,
 1534187742598512641: 6,
 'activist': 4,
 'amp': 6,
 'big': 6,
 'invest': 0,
 'oil': 6,
 'stop': 0,
 'time': 0,
 1534183407135207425: 0,
 'crisis': 0,
 'demand': 8,
 'join': 4,
 'partner': 7,
 1534160758338113536: 7,
 'low': 7,
 'people': 1,
 'policy': 8,
 'way': 1,
 1533883932550340609: 0,
 'come': 1,
 'court': 6,
 'dangerous': 9,
 'epa': 9,
 'future': 0,
 'mean': 1,
 1533866188115169281: 9,
 'air': 8,
 'check': 4,
 'decision': 6,
 'limit': 9,
 'month': 4,
 'plant': 8,
 'pollution': 8,
 'power': 0,
 'release': 10,
 1533838504874151945: 3,


In [52]:
community_columns = [value for value in partition.values()]

## Appending community to adjacency list

In [53]:
nodes_communities_df = pd.DataFrame(data=zip(list(graph_from_edgelist.nodes), community_columns), columns=['source','community'])

In [54]:
adjacency_list.rename(columns={'Source':'source'},inplace=True)
adjacency_list_with_communities = adjacency_list.merge(nodes_communities_df, how='left', on='source')

NameError: name 'nodes_and_communities_df' is not defined

In [55]:
adjacency_list_with_communities = pd.to_csv('adjacency list export with modularity class.csv', index=False)

AttributeError: module 'pandas' has no attribute 'to_csv'

In [None]:
# needed for appending edge attributes to the graph
attributes_dict = dict(list(zip(list(graph_from_edgelist.edges), adjacency_list_with_communities.community)))

## Appending attributes to graph and exporting

In [None]:
# https://networkx.org/documentation/stable/reference/generated/networkx.classes.function.set_edge_attributes.html

nx.set_edge_attributes(graph_from_edgelist, attributes_dict, "modularity class")
nx.set_node_attributes(graph_from_edgelist, partition, "modularity class")

In [None]:
nx.write_graphml(graph_from_edgelist, 'graph_for_viz.graphml')

pull this into gephi for visualization! 👆🏻

## Optional Step -- sample the adjacency list down to 1/10 size for viz (10 - 30k posts ideal)