# Init: Load Libraries and Functions

In [1]:
from collections import defaultdict 
import pandas as pd
import numpy as np
import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import string
from textblob import TextBlob  
import os
import re


#random seed for reproducibility
np.random.seed(67)

In [2]:
def polarity_scorer(input_text):
    """Score one piece of text with TextBlob.

    Meant to be handed to ``Series.apply`` so that every value of a text
    column is scored on its own. The resulting column of tuples can be split
    into two columns with ``.tolist()``, see
    https://stackoverflow.com/questions/29550414/how-to-split-column-of-tuples-in-pandas-dataframe

    Args:
        input_text: the text to analyse; passed straight to ``TextBlob``.

    Returns:
        tuple: ``(polarity, subjectivity)`` taken from the blob's ``sentiment``.
    """
    sentiment = TextBlob(input_text).sentiment
    return sentiment.polarity, sentiment.subjectivity

In [3]:
# Load the 'en_core_web_lg' spaCy pipeline once; the helper functions below reuse this object.
nlp = spacy.load('en_core_web_lg')
# string.punctuation is a single str of ASCII punctuation characters (a constant, not a module),
# so `token in punctuations` is a substring test rather than a per-character set lookup.
punctuations = string.punctuation
# Materialise spaCy's English stop-word collection as a list.
stopwords = list(STOP_WORDS)

def spacy_tokenizer(input_text):
    """Clean one document and return it as a single space-separated string.

    Steps, in order:
      1. strip anything that looks like a URL (``http`` followed by non-space characters),
         https://stackoverflow.com/questions/24399820/expression-to-remove-url-links-from-twitter-tweet
      2. run the text through the module-level ``nlp`` pipeline,
      3. replace each token by its lower-cased, stripped lemma; tokens whose lemma
         is the ``-PRON-`` placeholder keep their lower-cased surface form instead,
      4. drop terms found in ``stopwords`` or in ``punctuations``.

    Typically used through ``Series.apply`` to add a processed-text column.

    Args:
        input_text: raw text of one document.

    Returns:
        str: the surviving terms joined by single spaces.
    """
    without_urls = re.sub(r"http\S+", '', input_text)

    normalized = []
    for token in nlp(without_urls):
        if token.lemma_ == "-PRON-":
            normalized.append(token.lower_)
        else:
            normalized.append(token.lemma_.lower().strip())

    # `punctuations` is a str, so the second check is a substring test; this also
    # discards empty strings left behind by whitespace-only tokens.
    kept = [term for term in normalized
            if not (term in stopwords or term in punctuations)]
    return " ".join(kept)

In [4]:
# removing stopwords, it's this easy: https://medium.com/@makcedward/nlp-pipeline-stop-words-part-5-d6770df8a936

def token_parser(input_text):
    """Expand one document into a token table.

    The text is parsed with the module-level ``nlp`` pipeline and every token
    that is not flagged as a stop word becomes one row, which gives a
    one-to-many (document -> tokens) relationship.

    Args:
        input_text: text of a single document.

    Returns:
        DataFrame with two columns: ``text`` (the token text) and ``pos``
        (the token's coarse part-of-speech tag).
    """
    # Walk the parsed document once and keep (text, pos) pairs together.
    kept_tokens = [(token.text, token.pos_)
                   for token in nlp(input_text)
                   if not token.is_stop]

    table = pd.DataFrame()
    table['text'] = [token_text for token_text, _ in kept_tokens]
    table['pos'] = [token_pos for _, token_pos in kept_tokens]
    return table

In [5]:
def wordmatrix_to_dataframe(wordmatrix, feature_names):
    """Turn a scikit-learn word matrix into a dense DataFrame.

    Args:
        wordmatrix: matrix returned by a vectorizer; it must provide
            ``toarray()`` (one row per document, one column per feature).
        feature_names: column labels, in the same order as the matrix columns.

    Returns:
        DataFrame holding the dense matrix values, with ``feature_names`` as
        columns and the default integer index.
    """
    # The previous version also built a list of 'Doc<n>' labels by iterating the
    # matrix row by row, but never used it; that dead pass has been removed.
    return pd.DataFrame(data=wordmatrix.toarray(), columns=feature_names)

In [6]:
def create_adjacency_list(adjacency_matrix):
    """Convert a document-term matrix into a long edge-list table.

    Every cell of ``adjacency_matrix`` that is not equal to 0 becomes one
    output row pairing the row label (document id) with the column label (term).

    Args:
        adjacency_matrix: DataFrame whose index holds the document ids and
            whose columns hold the terms; a cell equal to 0 means "no edge".

    Returns:
        DataFrame with columns ``id`` and ``target``. Rows follow the matrix
        row order and, within a row, the matrix column order. The index
        restarts at 0 for every source row, exactly as before. An empty frame
        with the two columns is returned when the matrix has no non-zero cell.
    """
    output_columns = ['id', 'target']

    # One boolean row per document: True wherever the cell does not equal 0.
    has_edge = ~(adjacency_matrix == 0).to_numpy()

    per_row_tables = []
    for row_label, row_mask in zip(adjacency_matrix.index, has_edge):
        targets = adjacency_matrix.columns[row_mask]
        if len(targets) == 0:
            # Document without any retained term: contributes no edges.
            continue
        per_row_tables.append(
            pd.DataFrame({'id': [row_label] * len(targets),
                          'target': list(targets)})
        )

    if not per_row_tables:
        return pd.DataFrame(columns=output_columns)

    # DataFrame.append was removed in pandas 2.0 and re-copied the whole table on
    # every call; collecting the pieces and concatenating once avoids both problems.
    return pd.concat(per_row_tables)
    

# Load files and tokenize

In [7]:
path = 'exports/'

# Names (without the directory prefix) of the regular files directly inside `path`;
# sub-directories are skipped.
files = list(filter(lambda name: os.path.isfile(os.path.join(path, name)),
                    os.listdir(path)))


In [8]:
files

['@ewg_tweets.csv',
 '@Earthjustice_tweets.csv',
 '@foe_us_tweets.csv',
 '@EnvDefenseFund_tweets.csv',
 '@NRDC_tweets.csv']

In [9]:
text_to_drop = '_tweets.csv'
expected_columns = ['id', 'created_at', 'text', 'brand']

# Read every export and tag its rows with the account the file belongs to.
# The frames are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0 and copied the accumulated table on every iteration.
loaded_files = []
for file in files:
    each_file = pd.read_csv(os.path.join(path, file))
    # brand = file name without the '_tweets.csv' suffix, e.g. '@ewg'
    # https://www.journaldev.com/23674/python-remove-character-from-string
    each_file['brand'] = file.replace(text_to_drop, '')
    loaded_files.append(each_file)

if loaded_files:
    all_files = pd.concat(loaded_files)
    # Keep the expected columns first (creating any that are missing), followed by
    # whatever extra columns the CSVs carried - the same layout the append-based
    # version produced.
    extra_columns = [column for column in all_files.columns
                     if column not in expected_columns]
    all_files = all_files.reindex(columns=expected_columns + extra_columns)
else:
    # No input files: fall back to an empty table with the expected schema.
    all_files = pd.DataFrame(columns=expected_columns)

all_files.shape

(13619, 4)

In [10]:
#all_files.created_at = pd.to_datetime(all_files.created_at)

In [11]:
# Use this cell if filtering by date:
# all_files = all_files.loc[all_files.created_at > "2022-02-15"]

In [12]:
all_files.to_csv('data_backup.csv', index = False)

In [13]:
data = all_files.copy()

In [14]:
data.head()

Unnamed: 0,id,created_at,text,brand
0,1540683731630927874,2022-06-25 13:10:00+00:00,A new study in @nature shows that increasing t...,@ewg
1,1540366146423095296,2022-06-24 16:08:02+00:00,"No matter how you celebrate, we're wishing you...",@ewg
2,1540089567218470918,2022-06-23 21:49:00+00:00,Farmers and ranchers can take important steps ...,@ewg
3,1540063656586756100,2022-06-23 20:06:03+00:00,A handful of recent FDA decisions allowed seve...,@ewg
4,1540047792634224642,2022-06-23 19:03:01+00:00,Provisions to tackle #PFAS are included in the...,@ewg


In [15]:
data.brand.value_counts()

@foe_us            2875
@Earthjustice      2817
@EnvDefenseFund    2749
@NRDC              2670
@ewg               2508
Name: brand, dtype: int64

## Sentiment Analysis 

In [16]:
# Score every tweet with polarity_scorer (one (polarity, subjectivity) tuple per row),
# then spread the tuples over two new columns aligned on data's own index.
data['text'] = data['text'].astype('str')

sentiment_pairs = data['text'].apply(polarity_scorer).tolist()
data[['polarity_score', 'subjectivity_score']] = pd.DataFrame(sentiment_pairs, index = data.index)

data.head()

Unnamed: 0,id,created_at,text,brand,polarity_score,subjectivity_score
0,1540683731630927874,2022-06-25 13:10:00+00:00,A new study in @nature shows that increasing t...,@ewg,0.136364,0.454545
1,1540366146423095296,2022-06-24 16:08:02+00:00,"No matter how you celebrate, we're wishing you...",@ewg,1.0,1.0
2,1540089567218470918,2022-06-23 21:49:00+00:00,Farmers and ranchers can take important steps ...,@ewg,0.6,0.575
3,1540063656586756100,2022-06-23 20:06:03+00:00,A handful of recent FDA decisions allowed seve...,@ewg,0.2,0.33
4,1540047792634224642,2022-06-23 19:03:01+00:00,Provisions to tackle #PFAS are included in the...,@ewg,0.1375,0.4875


## Tokenizer

In [17]:
# Clean every tweet with spacy_tokenizer (one pass of the spaCy pipeline per row).
data['processed_text'] = data['text'].apply(spacy_tokenizer)

In [18]:
data.head()

Unnamed: 0,id,created_at,text,brand,polarity_score,subjectivity_score,processed_text
0,1540683731630927874,2022-06-25 13:10:00+00:00,A new study in @nature shows that increasing t...,@ewg,0.136364,0.454545,new study @nature increase sustainability scho...
1,1540366146423095296,2022-06-24 16:08:02+00:00,"No matter how you celebrate, we're wishing you...",@ewg,1.0,1.0,matter celebrate wish wonderful independence d...
2,1540089567218470918,2022-06-23 21:49:00+00:00,Farmers and ranchers can take important steps ...,@ewg,0.6,0.575,farmer rancher important step lower emission h...
3,1540063656586756100,2022-06-23 20:06:03+00:00,A handful of recent FDA decisions allowed seve...,@ewg,0.2,0.33,handful recent fda decision allow type phthala...
4,1540047792634224642,2022-06-23 19:03:01+00:00,Provisions to tackle #PFAS are included in the...,@ewg,0.1375,0.4875,provision tackle pfa include ndaa fy 2023 repr...


In [19]:
# Overwrites the raw backup written earlier to the same 'data_backup.csv' path,
# this time including the sentiment scores and the processed_text column.
data.to_csv('data_backup.csv', index = False)

## creating entities out of tokenized text

In [20]:
entity_columns = ['text', 'pos', 'id']

# Build one token table per tweet and tag it with the tweet id. The tables are
# collected in a list and concatenated once: DataFrame.append was removed in
# pandas 2.0 and copied the growing table on every iteration.
token_tables = []
for processed_text, tweet_id in zip(data['processed_text'], data['id']):
    tokens_table = token_parser(processed_text)
    tokens_table['id'] = tweet_id
    token_tables.append(tokens_table)

if token_tables:
    entities_table = pd.concat(token_tables)
else:
    # No tweets at all: keep the expected schema so the steps below still work.
    entities_table = pd.DataFrame(columns=entity_columns)

# Remove whitespace and punctuation tokens. `~` is the boolean negation operator
# for a mask; the earlier unary minus only worked through a pandas special case.
is_space_or_punct = entities_table['pos'].str.contains('SPACE|PUNCT')
entities_table = entities_table[~is_space_or_punct].reset_index(drop = True)

entities_table.shape

(17949, 3)

In [21]:
entities_table.head()

Unnamed: 0,text,pos,id
0,update,VERB,1500215577817886725
1,tabletochki,PROPN,1500215577817886725
2,charity,PROPN,1500215577817886725
3,foundation,PROPN,1500215577817886725
4,20,NUM,1500215577817886725


In [22]:
entities_table.to_csv('data_entities.csv', index = False)

## Vectorizer to build matrix

In [20]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score 


In [21]:
#processed_tweets = pd.read_csv('analysis output v2.csv')
#processed_tweets.processed_text = processed_tweets.processed_text.astype('str')

In [28]:
# Bag-of-words vectorizer over the cleaned tweets.
# min_df / max_df bound how rare / how common a term may be across documents:
# https://stackoverflow.com/questions/27697766/understanding-min-df-and-max-df-in-scikit-countvectorizer
# token_pattern keeps runs of two or more lowercase letters or digits, so tokens
# that start with a number are kept, while characters such as '#' or '@' are
# never part of a token.
vectorizer = CountVectorizer(
    min_df=0.015,
    max_df=0.9,
    stop_words='english',
    lowercase=True,
    token_pattern='[a-z0-9]{2,}',
)

# Learn the vocabulary and build the document-term count matrix in one step.
data_vectorized = vectorizer.fit_transform(data["processed_text"])

In [29]:
#https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html#
    
n_components = list(range(2, 12))  # candidate cluster counts: 2 through 11

for i in n_components:
    # Cluster the count matrix into i groups and label every tweet.
    clusterer = KMeans(n_clusters=i, random_state = 37)
    cluster_labels = clusterer.fit_predict(data_vectorized)

    # Mean silhouette coefficient over all tweets for this cluster count.
    silhouette_avg = silhouette_score(data_vectorized, cluster_labels)
    print("For number of topics: ", i, "the average silhouette score is: ", silhouette_avg)
    


For number of topics:  2 the average silhouette score is:  0.06950906861393022
For number of topics:  3 the average silhouette score is:  0.0668946513643088
For number of topics:  4 the average silhouette score is:  0.06275139090648188
For number of topics:  5 the average silhouette score is:  0.061699755948167324
For number of topics:  6 the average silhouette score is:  0.05898155116996679
For number of topics:  7 the average silhouette score is:  0.06259873541185147
For number of topics:  8 the average silhouette score is:  0.05853989049744102
For number of topics:  9 the average silhouette score is:  0.06332041106247181
For number of topics:  10 the average silhouette score is:  0.04752110162757471
For number of topics:  11 the average silhouette score is:  0.04665362862881646


# Export matrix to adjacency list

In [30]:
# Vocabulary learned by the vectorizer, in matrix column order.
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and fall back to the old method only on versions that do not have it yet.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = vectorizer.get_feature_names_out().tolist()
else:
    features = vectorizer.get_feature_names()
len(features)

194

In [31]:
# Dense document-term table: one row per tweet, one column per vocabulary term.
adjacency_matrix = wordmatrix_to_dataframe(data_vectorized, features)
# data_vectorized was built from data["processed_text"], so its rows are in the same
# order as `data`; label each row with the corresponding tweet id.
documents_ids = data['id']
adjacency_matrix.index = documents_ids

In [32]:
adjacency_matrix

Unnamed: 0_level_0,000,2021,act,action,address,administration,air,allow,amp,bad,...,warming,waste,water,way,week,wildlife,win,work,world,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1540683731630927874,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1540366146423095296,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1540089567218470918,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1540063656586756100,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1540047792634224642,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1397991062170390531,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1397975970792054787,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1397962841622122504,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1397962840724496386,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
adjacency_matrix.to_csv('adjacency_matrix.csv', index=True)

In [34]:
adjacency_list = create_adjacency_list(adjacency_matrix)
adjacency_list

Unnamed: 0,id,target
0,1540683731630927874,increase
1,1540683731630927874,new
2,1540683731630927874,reduce
3,1540683731630927874,study
0,1540366146423095296,check
...,...,...
1,1397962840724496386,human
2,1397962840724496386,learn
0,1397962839894085633,000
1,1397962839894085633,new


In [35]:
adjacency_list.rename(columns={'id':'Source','target':'Target'}, inplace = True)

In [36]:
adjacency_list.to_csv('adjacency_list_v1.csv', index = False)

In [37]:
# Attach each tweet's brand to its edges by joining on the tweet id.
# rename() is called without inplace, so adjacency_list itself keeps its columns.
# NOTE(review): after the earlier rename the columns are already 'Source'/'Target',
# so the 'target' entry in this mapping matches nothing - confirm whether it is
# only there to tolerate out-of-order runs.
adjacency_list_for_merging = adjacency_list.rename(columns={'Source':'id','target':'Target'}).merge(data[['id','brand']], on = 'id')

In [38]:
adjacency_list_for_merging

Unnamed: 0,id,Target,brand
0,1540683731630927874,increase,@ewg
1,1540683731630927874,new,@ewg
2,1540683731630927874,reduce,@ewg
3,1540683731630927874,study,@ewg
4,1540366146423095296,check,@ewg
...,...,...,...
85083,1397962840724496386,human,@NRDC
85084,1397962840724496386,learn,@NRDC
85085,1397962839894085633,000,@NRDC
85086,1397962839894085633,new,@NRDC


In [39]:
adjacency_list_for_merging.to_csv('adjacency_list_for_merging.csv', index = False)

# Community Detection in NetworkX to enable Gephi Visualization

In [40]:
import networkx as nx
import community as community_louvain

## Creating network and detecting communities

In [41]:
# https://networkx.org/documentation/stable/reference/generated/networkx.convert_matrix.from_pandas_edgelist.html
graph_from_edgelist = nx.from_pandas_edgelist(adjacency_list, source = 'Source', target = 'Target')

In [42]:
# Louvain community detection (python-louvain):
# https://python-louvain.readthedocs.io/en/latest/api.html
# Tweet ids and terms are nodes of the same graph, so both kinds of node receive a community.
# NOTE(review): no random_state is passed here - presumably the partition can change
# between runs or depend on earlier RNG use; confirm and pin it if results must be stable.
partition = community_louvain.best_partition(graph_from_edgelist)

In [43]:
# this returns a dictionary of nodes as keys and community assignment as values
partition

{1540683731630927874: 0,
 'increase': 0,
 'new': 0,
 'reduce': 0,
 'study': 0,
 1540366146423095296: 1,
 'check': 7,
 'day': 1,
 'help': 1,
 'safe': 3,
 1540089567218470918: 0,
 'carbon': 0,
 'emission': 0,
 'farmer': 4,
 'food': 3,
 'good': 7,
 'important': 5,
 'industry': 6,
 'plant': 6,
 'step': 7,
 1540063656586756100: 3,
 'allow': 3,
 'fda': 3,
 'late': 0,
 1540047792634224642: 7,
 'address': 0,
 'community': 4,
 'exposure': 3,
 'family': 3,
 'foreverchemical': 3,
 'include': 3,
 1540020623581347840: 3,
 'harm': 3,
 'harmful': 5,
 'know': 3,
 'pesticide': 3,
 1539990169062936577: 1,
 1539713088261431297: 7,
 'environment': 5,
 'like': 4,
 'news': 7,
 1539686663575801858: 7,
 'come': 1,
 1539620733017624576: 3,
 'chemical': 3,
 'people': 4,
 'pfas': 3,
 'protect': 3,
 'toxic': 3,
 'water': 3,
 1539349945223888901: 3,
 'climate': 0,
 'impact': 0,
 'land': 2,
 'low': 4,
 'use': 3,
 1539333838173876225: 3,
 'high': 1,
 'level': 3,
 1539274956088942597: 0,
 'report': 0,
 15378491578928

In [44]:
# Community ids in the partition dict's own iteration order.
community_columns = list(partition.values())

## Appending community to adjacency list

In [45]:
# One row per graph node with its community. Each community is looked up by node key
# rather than paired positionally, so the table stays correct even if the partition
# dict and graph_from_edgelist.nodes ever iterate in different orders.
graph_nodes = list(graph_from_edgelist.nodes)
nodes_communities_df = pd.DataFrame({
    'source': graph_nodes,
    'community': [partition[node] for node in graph_nodes],
})

In [46]:
# Rename the id column to 'source' so it matches nodes_communities_df.
# The rename mutates adjacency_list in place, so cells that expect a 'Source' column
# (e.g. the nx.from_pandas_edgelist call) must be run before this one.
adjacency_list.rename(columns={'Source':'source'},inplace=True)
# how='left' keeps every edge row and adds the community of the edge's source node.
adjacency_list_with_communities = adjacency_list.merge(nodes_communities_df, how='left', on='source')

In [48]:
adjacency_list_with_communities.to_csv('adjacency list export with modularity class.csv', index=False)

In [49]:
# Edge attribute lookup for nx.set_edge_attributes: {(tweet id, term): community}.
# Keys are built from each row's own endpoints. Zipping graph_from_edgelist.edges
# against the table rows paired them purely by position, but the graph reports edges
# grouped by node adjacency, not in edge-list row order, so communities ended up on
# the wrong edges.
attributes_dict = dict(zip(
    zip(adjacency_list_with_communities['source'],
        adjacency_list_with_communities['Target']),
    adjacency_list_with_communities['community'],
))

## Appending attributes to graph and exporting

In [50]:
# https://networkx.org/documentation/stable/reference/generated/networkx.classes.function.set_edge_attributes.html

nx.set_edge_attributes(graph_from_edgelist, attributes_dict, "modularity class")
nx.set_node_attributes(graph_from_edgelist, partition, "modularity class")

In [51]:
nx.write_graphml(graph_from_edgelist, 'graph_for_viz.graphml')

Pull this into Gephi for visualization! 👆🏻

## Optional Step -- sample the adjacency list down to 1/10 size for viz (10 - 30k posts ideal)