# Making the Graphml File

## Taking a Twitter dataset and creating the GraphML file that we will load into Graphia for visualisation

In [1]:
#import libraries
import pandas as pd
import numpy as np
import networkx as nx
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import string
import random
import re
from emoji import UNICODE_EMOJI
import emoji
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm


In [None]:
#First we load in the dataset and inspect it
data = pd.read_csv("dataset/gotTwitter.csv")

#Dataset has way too many columns that we don't care about
print(data.head())

#Let's keep only the columns we need and save the result
columns_to_keep = [
    'created_at',
    'screen_name',
    'text',
    'favorite_count',
    'retweet_count',
    'is_retweet',
    'hashtags',
    'mentions_screen_name',
    'lang',
    'country',
    'location',
    'followers_count',
    'friends_count',
    'verified',
]
data = data[columns_to_keep]

#index=False: otherwise the row index is written as an extra unnamed column,
#which comes back as a junk "Unnamed: 0" column when the file is reloaded
data.to_csv("dataset/thrones_filtered.csv", index=False)

In [None]:
#Work from the filtered file since it loads faster, keep only the tweets
#that have at least one mention, and renumber the remaining rows from 0
data = (
    pd.read_csv("dataset/thrones_filtered.csv")
    .dropna(subset=['mentions_screen_name'])
    .reset_index(drop=True)
)

In [45]:
#Function for removing emojis
def remove_emojis(text):
    """Return ``text`` with every emoji removed.

    Older releases of the emoji package expose ``get_emoji_regexp``; it was
    deprecated and later removed in favour of ``replace_emoji``. The original
    call is kept whenever it exists so results do not change on those
    releases, and ``replace_emoji`` is used only when it is gone.
    """
    if hasattr(emoji, "get_emoji_regexp"):
        return emoji.get_emoji_regexp().sub(r'', text)
    return emoji.replace_emoji(text, replace='')

#POS tags of the closed-class words that are dropped from the text
#Keep digits for now as would be nice to see season progression details
_CLOSED_CLASS_TAGS = frozenset(['CC', 'DT', 'IN', 'PRP', 'PRP$', 'WDT', 'WP', 'WP$', "LS", "MD", "PDT"])

#Filled on first use so the stopword corpus is read and the lemmatizer built only once
_PREPROCESSING_RESOURCES = {}


def _get_preprocessing_resources():
    """Return the (stopword set, lemmatizer) pair, building it on the first call."""
    if not _PREPROCESSING_RESOURCES:
        _PREPROCESSING_RESOURCES["stopwords"] = frozenset(stopwords.words("english"))
        _PREPROCESSING_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _PREPROCESSING_RESOURCES["stopwords"], _PREPROCESSING_RESOURCES["lemmatizer"]


#Function for preparing the text for SeaNMF topic modeling
def topic_modeling_preprocessing(tweet):
    """Clean one tweet and return it as a space-separated string of lemmas.

    Steps, in order: drop mentions, hashtags and links; remove emojis; strip
    the characters in ``string.punctuation`` and lower-case; POS-tag the
    tokens; drop closed-class words and English stopwords; lemmatize what
    is left.

    Parameters:
        tweet (str): raw tweet text.

    Returns:
        str: the cleaned tokens joined by single spaces (may be empty).
    """
    stopword_set, lemmatizer = _get_preprocessing_resources()

    #First remove mentions, hashtags and links ("http" also matches "https")
    tweet_list_filtered = [
        word for word in tweet.split()
        if word[0] != '@' and word[0] != '#' and "http" not in word
    ]

    #Remove emojis
    filtered_tweet = remove_emojis(" ".join(tweet_list_filtered))

    #Remove punctuation and lower-case
    tweet_no_punct = filtered_tweet.translate(str.maketrans('', '', string.punctuation)).lower()

    #POS tag the words so the closed class words can be removed
    tweet_pos = nltk.pos_tag(tweet_no_punct.split())

    open_class_tweet = [
        word for word, tag in tweet_pos
        if tag not in _CLOSED_CLASS_TAGS and word not in stopword_set
    ]

    #Final step: lemmatize all the words that are left
    lemmatized_tweet = [lemmatizer.lemmatize(word) for word in open_class_tweet]

    return " ".join(lemmatized_tweet)


#Let's set up the lists that collect the extracted data, one entry per (tweet, mention) pair
#They become the columns of the mentions DF
tweets = []
sources = []
targets = []
favorite_counts = []
retweet_count = []
hashtags = []
lang = []
location = []
followers_count = []
friends_count = []
verified = []
sentiment_scores = []
text_for_topic_modeling = []

#Initialize VADER sentiment score analyzer
sid_obj = SentimentIntensityAnalyzer()

#Iterate over our original dataframe row by row
#itertuples does not rely on the frame having a 0..n-1 index
for row in data.itertuples(index=False):

    #Get the mentions as a list for each tweet
    mentions_list = row.mentions_screen_name.split()

    #The VADER score and the topic modeling text depend only on the tweet,
    #so compute them once per tweet instead of once per mention
    sentiment = sid_obj.polarity_scores(row.text)['compound']
    topic_text = topic_modeling_preprocessing(row.text)

    #Now we need to iterate over the mentions and add one entry per mentioned account
    for mention in mentions_list:
        tweets.append(row.text)
        sources.append(row.screen_name)
        targets.append(mention)
        favorite_counts.append(row.favorite_count)
        retweet_count.append(row.retweet_count)
        hashtags.append(row.hashtags)
        lang.append(row.lang)
        location.append(row.location)
        followers_count.append(row.followers_count)
        friends_count.append(row.friends_count)
        verified.append(row.verified)
        sentiment_scores.append(sentiment)
        text_for_topic_modeling.append(topic_text)

#Make final mentions DF from which we will get graphml file
#retweet_count was collected above but never stored; it is appended last so
#the existing columns keep their positions
mentions_df = pd.DataFrame({
        'tweets' : tweets,
        'sources' : sources,
        'targets' : targets,
        'favorite_counts' : favorite_counts,
        'hashtags' : hashtags,
        'lang' : lang,
        'location':location,
        'followers_count':followers_count,
        'friends_count':friends_count,
        'verified':verified,
        'sentiment_scores': sentiment_scores,
        'topic_modeling_text' : text_for_topic_modeling,
        'retweet_count' : retweet_count,
    })

mentions_df.to_csv("dataset/mentions.csv")
    

In [2]:
#Let's run some topic modeling too so we can attach topic labels to our graph
mentions_df = pd.read_csv("dataset/mentions.csv")

#Now need to make the text file for SeaNMF
#Expanding the tweets into one row per mention duplicated the text, so de-duplicate it
#dict.fromkeys keeps first-seen order, so the file has the same line order on every run
#(a set would order the lines differently from run to run)
text_list = list(dict.fromkeys(mentions_df['topic_modeling_text'].to_list()))

#Explicit encoding so non-ASCII characters are written the same way on every platform
with open("topic_modeling.txt", 'w', encoding="utf-8") as f:

    for tweet in text_list:
        #Missing text is read back from the CSV as NaN (a float); skip it
        if not isinstance(tweet, float):
            f.write(tweet + "\n")

In [None]:
#Now need to assign the topic number and at same time topic label to our data
#Load in some of the SeaNMF matrices and structures

#Function for loading in the vocab (from SeaNMF)
def read_vocab(file_name):
    """Read a vocabulary file and return the first token of every line.

    Each line is split on single whitespace characters and only the first
    field is kept. An empty line contributes an empty string, so the
    position of a word in the returned list equals its line number in
    the file.

    Parameters:
        file_name (str): path of the vocabulary file.

    Returns:
        list[str]: one entry per line of the file, in file order.
    """
    print('read vocabulary')
    print('-'*50)
    vocab = []
    with open(file_name, 'r') as fp:
        for line in fp:
            #Strip only the line terminator: line[:-1] would cut a real
            #character off a final line that has no trailing newline
            arr = re.split(r'\s', line.rstrip('\n'))
            vocab.append(arr[0])

    return vocab

#Vocabulary list written by SeaNMF
vocab = read_vocab('SeaNMF/data/vocab.txt')

#W matrix from the model results
W = np.loadtxt('SeaNMF/seanmf_results/60/W.txt', dtype=float)

#Topic names and numbers, turned into a {topic number: topic name} lookup
topic_names = pd.read_csv("dataset/topic_word_lists.csv")
map_topics = topic_names[["Topic Name", "Topic Number"]]

mapping_dict = {
    topic_number: topic_name
    for topic_number, topic_name in zip(map_topics['Topic Number'], map_topics['Topic Name'])
}

#Function to get the most likely topic for a given tweet, using the resulting matrices and vocab file
def get_most_likely_topic(tweet, vocab, W, mapping_dict):
    """Return the name of the best-scoring, non-filtered topic for a tweet.

    The rows of ``W`` that belong to the tweet's known words are summed
    column by column; columns are ranked by that score and the first of the
    15 best whose topic number (column index + 1) is not in the filter list
    is looked up in ``mapping_dict``.

    Parameters:
        tweet (str | float): preprocessed, space-separated tweet text;
            a float (NaN) stands for missing text.
        vocab (list[str]): words; a word's position selects its row of ``W``.
        W (numpy.ndarray): 2-D array with one row per vocab word and one
            column per topic.
        mapping_dict (dict): maps 1-based topic number to topic name.

    Returns:
        str: the topic name, or "not found" when the text is missing, none
        of its words are in ``vocab``, or every candidate topic is filtered.

    Raises:
        KeyError: if the chosen topic number is not a key of ``mapping_dict``.
    """
    #Missing text arrives as NaN, which is a float
    if isinstance(tweet, float):
        return "not found"

    #Get the indexes from vocab list for the words of the tweet
    #A single .index() call per token instead of an "in" scan followed by .index()
    vocab_indexes = []
    for token in tweet.split():
        try:
            vocab_indexes.append(vocab.index(token))
        except ValueError:
            continue

    if not vocab_indexes:
        return "not found"

    #Using the indexes, subset W so it just contains all those rows
    subset_W = W[vocab_indexes, :]
    if subset_W.size == 0:
        return "not found"

    #Column sums, scaled by the number of columns (a constant, so ranking is unaffected)
    column_scores = subset_W.sum(axis=0) / subset_W.shape[1]

    #Indexes of the 15 best-scoring topics, best first
    #A stable sort keeps the original ordering between equal scores
    top_topics = np.argsort(column_scores, kind="stable")[-15:][::-1]

    #Filter out these topics (MISC, TV Service)
    filter_topics = {4, 3, 5, 2, 1, 38, 45, 26, 25, 29, 11}

    #Take the best topic that is not filtered out
    for topic_index in top_topics:
        topic_number = int(topic_index) + 1
        if topic_number not in filter_topics:
            return mapping_dict[topic_number]

    #Every candidate was filtered out (previously an IndexError)
    return "not found"


#Make progress_apply available on pandas objects
tqdm.pandas()


#Look up the most likely topic for one preprocessed tweet
def _label_topic(text):
    return get_most_likely_topic(text, vocab, W, mapping_dict)


#Apply to the topic modelling text for each mention pair and save the result
mentions_df['Highest_topic'] = mentions_df['topic_modeling_text'].progress_apply(_label_topic)
mentions_df.to_csv("dataset/final_df.csv")

In [4]:
#Now let's make the graph
def make_graph_file(mentions_data, output_path="dataset/final_graph.graphml"):
    """Build the mention graph and write its largest connected component as GraphML.

    Every row adds an undirected edge between ``sources`` and ``targets``.
    The source node gets the row's topic, polarity, followers, friends and
    likes as attributes; a source that appears in several rows keeps the
    values of its last row. Target nodes get no attributes of their own.

    Parameters:
        mentions_data (pandas.DataFrame): needs the columns ``sources``,
            ``targets``, ``Highest_topic``, ``sentiment_scores``,
            ``followers_count``, ``friends_count`` and ``favorite_counts``.
        output_path (str): where the GraphML file is written.

    Raises:
        ValueError: if ``mentions_data`` has no rows, so no graph can be built.
    """
    #initialize the graph
    graph = nx.Graph()

    #Make the graph and attach the attributes to the source nodes
    for _, relation in mentions_data.iterrows():
        #add_edge creates both end points, so the target needs no separate add_node
        graph.add_edge(relation["sources"], relation["targets"])
        graph.add_node(
            relation["sources"],
            topic=relation['Highest_topic'],
            polarity=relation['sentiment_scores'],
            followers=relation["followers_count"],
            friends=relation["friends_count"],
            likes=relation["favorite_counts"],
        )

    if graph.number_of_nodes() == 0:
        raise ValueError("mentions_data has no rows, so there is no graph to write")

    #Now lets get the largest component subgraph
    #max finds it without sorting every component
    largest_component = graph.subgraph(max(nx.connected_components(graph), key=len))

    #Save the graphml file
    nx.write_graphml(largest_component, output_path)


make_graph_file(mentions_df)

