# Victim Blaming Project 

# Data collection, data preprocessing and network building

#### Author: Carmen Martin Turrero

In [None]:
#imports
import os
import requests 
import pandas as pd 
import time
import itertools
import networkx as nx
import igraph as ig
import ast
import cairocffi as cairo
import random
import math
import matplotlib.pyplot as plt

# import our functions
import download_utils
import clean_utils

In [None]:
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Base folder on the mounted Drive. File names are appended to it by plain string
# concatenation (e.g. path+"tweets2017.pkl"), so the trailing slash is required.
path = '/content/drive/MyDrive/TwitterData/'

## Download tweets

In [None]:
# Choose token
# The bearer token is stored in the process environment and read back when the
# request headers are created.
# NOTE(review): "..." is a placeholder - supply the real token at runtime and
# avoid committing it with the notebook.
os.environ['TOKEN'] = "..."

In [None]:
# Headers
def create_headers(bearer_token):
    """Return the HTTP headers carrying *bearer_token* as a Bearer credential."""
    return {"Authorization": f"Bearer {bearer_token}"}

In [None]:
# Build the request headers from the token stored in the TOKEN environment variable.
headers = create_headers(os.environ['TOKEN'])

In [None]:
# Set query
lang = 'lang:en'

# Hashtags of interest, OR-ed together
hashtag_terms = [
    '#metoo', '#victimblaming', '#rapevictim', '#beingrapedneverreported',
    '#streetharassment', '#rapeculture', '#slutshame', '#rape',
    '(#abuse -sibling)', '#sexualabuse', '#harassment',
    '(#survivor abuse)', '(#survivor rape)', '#sexualharassment',
]
key_hashtags = ' OR '.join(hashtag_terms)

# Exact phrases, each wrapped in double quotes and OR-ed together
question_phrases = [
    'did you try to stop it', 'He ought to have enjoyed it', 'Did you yell',
    'Do you have proofs', 'What were you wearing', 'flirting with him',
    'now after so many years',
]
questions = ' OR '.join(f'"{phrase}"' for phrase in question_phrases)

# Hashtags to exclude; every term is followed by a space, including the last one
excluded_hashtags = ['#porn', '#horny', '#cancer', '#breastcancer', '#bullying']
not_wanted = ''.join(f'-{tag} ' for tag in excluded_hashtags)

tweet_specifications = '-has:links -is:retweet  -has:media -has:images -has:video_link '
# Defined but currently not appended to the query
geolocation = '(place_country:US OR place_country:GB) '

query_text = f'{lang} ({key_hashtags} OR {questions}) {not_wanted}{tweet_specifications}'
query_text

In [None]:
#endpoint
# URL passed as the `endpoint` argument when the tweets are downloaded.
endpoint = "https://api.twitter.com/2/tweets/search/all/"

# Max results per request
max_results = 500

In [None]:
# Total number of tweets matching the query in the same period
# (1 July to 1 October) of each studied year.
# NOTE(review): get_data_counts is called unqualified although the imports cell
# only does `import download_utils` - presumably download_utils.get_data_counts; confirm.
counts_endpoint = "https://api.twitter.com/2/tweets/counts/all"
for year in ("2017", "2018", "2022"):
    # Set period of time
    start_time = f"{year}-07-01T00:00:00Z"
    end_time = f"{year}-10-01T00:00:00Z"
    results, _, count = get_data_counts(query_text, start_time, end_time, "", counts_endpoint)
    print(f'Tweets in {year}:', results)

In [None]:
# Set period of time and download tweets
start_time = "2017-07-01T00:00:00Z"
end_time = "2017-10-01T00:00:00Z"

# Expansions and per-object fields requested with every search call
requested_fields = {
    "expansions": 'author_id,in_reply_to_user_id,geo.place_id',
    "tweet_fields": 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source,entities',
    "user_fields": 'id,name,username,created_at,description,public_metrics,verified',
    "place_fields": 'full_name,id,country,country_code,geo,name,place_type',
}

# NOTE(review): get_data is called unqualified although the imports cell only
# does `import download_utils` - presumably download_utils.get_data; confirm.
tweets = get_data(
    query_text,
    start_time=start_time,
    end_time=end_time,
    max_results=max_results,
    endpoint=endpoint,
    next_token='',
    **requested_fields,
)

In [None]:
# Persist the downloaded tweets as a pickled DataFrame on Drive
df_tweets = pd.DataFrame(data=tweets)
df_tweets.to_pickle(f"{path}tweets2017.pkl")

## Sample 10% of tweets

In [None]:
# Draw a reproducible 10% sample of rows (fixed seed, no replacement) and store it
tweets_df = pd.read_pickle(f"{path}tweets2022.pkl")
sampled_tweets = tweets_df.sample(
    frac=0.1,
    replace=False,
    random_state=14538,
    axis=0,
    ignore_index=True,
)
sampled_tweets.to_pickle(f"{path}sampled_tweets2022.pkl")

## Extract clean text from tweets

In [None]:
# Show the cleaner on a hand-written example before applying it to the data
# NOTE(review): tweet2text is called unqualified although the imports cell only
# does `import clean_utils` - presumably clean_utils.tweet2text; confirm.
mocktext = 'Hey Anna!! Have u heard???? @alessio_123 "has been" #reported *for #abuse-d & misconduct https://es.wikipedia.org (1)'
print('Mock tweet', mocktext, 'Clean text', sep='\n')
print(tweet2text(mocktext))

In [None]:
# Apply to our data
# Load the sampled tweets
tweets_df = pd.read_pickle(f"{path}sampled_tweets2022.pkl")

# Work on a copy that keeps only the tweet id and the raw text
column_list = ["id", "text"]
tweets_filtered = tweets_df.copy()[column_list]

# Clean text; rows for which the cleaner yields a missing value get an empty string
cleaned = tweets_filtered["text"].map(tweet2text)
tweets_filtered["clean_text"] = cleaned.where(cleaned.notnull(), "")

# Save
tweets_filtered.to_csv(f"{path}clean_tweets2022.csv")

## Build networks - nodes and edges

In [None]:
# Load the sampled tweets
tweets_df = pd.read_pickle(path+"sampled_tweets2022.pkl")

# The network-building cells below read and extend a frame called `tweets`.
# Without this binding, `tweets` is either undefined or still holds the raw
# result of the download cell. Work on a copy so the loaded frame stays
# untouched, as is done for tweets_filtered.
# NOTE(review): later cells also read 'clean_text' and emotion-score columns that
# this pickle is not shown to contain - confirm where those are merged in.
tweets = tweets_df.copy()

In [None]:
# Add a second cleaned-text column by mapping `cleaner` over the raw tweet text.
# NOTE(review): `cleaner` is not defined or imported by name in the visible
# cells - presumably clean_utils.cleaner; confirm.
tweets["extra_clean_text"] = tweets["text"].map(cleaner)

In [None]:
# nltk is used here but missing from the imports cell, so import it where it is needed
import nltk

tweet_tokenizer = nltk.TweetTokenizer()

# Number of occurrences of every token over all non-empty cleaned tweets
unique_words = {}

# Only one column is needed, so iterate it directly instead of building a
# Series for every row with iterrows()
for text in tweets["extra_clean_text"]:
    if text == "":
        continue
    for word in tweet_tokenizer.tokenize(text):
        unique_words[word] = unique_words.get(word, 0) + 1

### Words + emotions

In [None]:
# nltk is used here but missing from the imports cell, so import it where it is needed
import nltk

tweet_tokenizer = nltk.TweetTokenizer()

# Per-word accumulators, all keyed by token
unique_words = {}   # number of occurrences
words_anger = {}
words_sad = {}
words_tone = {}
words_posemo = {}
words_negemo = {}
words_anx = {}
like_count = {}     # summed weights (likes + 1) of the tweets containing the word
tweets_and_likes = 0

# Row column holding each score -> accumulator receiving the weighted sum.
# NOTE(review): these score columns are not created in the visible cells -
# presumably merged in from an external scoring step; confirm.
emotion_totals = {
    'anger': words_anger,
    'sad': words_sad,
    'Tone': words_tone,
    'posemo': words_posemo,
    'negemo': words_negemo,
    'anx': words_anx,
}

for idx, row in tweets.iterrows():
    text = row["extra_clean_text"]
    if text == "":
        continue
    # Weight of this tweet: its own like count, +1 for the tweet itself.
    # The previous code read tweets_data['public_metrics'][2], i.e. an undefined
    # name at a fixed position, so every tweet would have received the same weight.
    tweet_likes = row['public_metrics']['like_count'] + 1
    tweets_and_likes += tweet_likes  # will be used for averaging
    for word in tweet_tokenizer.tokenize(text):
        # Count word appearances
        unique_words[word] = unique_words.get(word, 0) + 1
        # Accumulate the like-weighted scores and the weights themselves
        for column, totals in emotion_totals.items():
            totals[word] = totals.get(word, 0) + tweet_likes * row[column]
        like_count[word] = like_count.get(word, 0) + tweet_likes

In [None]:
# Turn each per-word dict into a two-column table: the word and a named value
def _word_table(per_word, value_column):
    table = pd.DataFrame.from_dict(per_word, orient='index').reset_index()
    return table.rename(columns={'index': 'Word', 0: value_column})


uw_df = _word_table(unique_words, 'count')
anger_df = _word_table(words_anger, 'anger')
sad_df = _word_table(words_sad, 'sad')
posemo_df = _word_table(words_posemo, 'posemo')
negemo_df = _word_table(words_negemo, 'negemo')
anx_df = _word_table(words_anx, 'anx')
tone_df = _word_table(words_tone, 'tone')
likes_df = _word_table(like_count, 'total_counts')  # tweets+favs

# Combine to one dataframe by joining every table on the word
words_df = uw_df
for table in (anger_df, sad_df, anx_df, posemo_df, negemo_df, tone_df, likes_df):
    words_df = words_df.join(table.set_index('Word'), on='Word')

# Most frequent words first, with a fresh 0..n-1 index
words_df = words_df.sort_values(by=['count'], ascending=False)
words_df = words_df.reset_index(drop=True)

# Turn the weighted sums into weighted averages
for emotion in ('anger', 'sad', 'posemo', 'negemo', 'anx', 'tone'):
    words_df[emotion] = words_df[emotion] / words_df['total_counts']
words_df.head(20)

In [None]:
# Polarity per word: whichever of posemo / negemo is larger
words_df['emotion'] = words_df[['posemo', 'negemo']].idxmax(axis=1)
# Largest of the three negative emotions per word
words_df['negative_emotion'] = words_df[['anger', 'sad', 'anx']].idxmax(axis=1)
# Negative words take their largest negative emotion, all other words 'posemo'.
# Series.where keeps the value where the condition holds and substitutes
# 'posemo' elsewhere; it replaces an np.where call that failed because numpy
# is never imported in this notebook.
is_negative = words_df['emotion'] == 'negemo'
words_df['main_emotion'] = words_df['negative_emotion'].where(is_negative, 'posemo')
words_df

In [None]:
# Compact view: word, frequency and main emotion only
emotions_df = words_df.loc[:, ['Word', 'count', 'main_emotion']]
emotions_df

In [None]:
# Save the three word tables next to the tweet data
for frame, file_name in (
    (uw_df, "words_2022.csv"),
    (words_df, "words_emotionvalues2022.csv"),
    (emotions_df, "words_mainemotion2022.csv"),
):
    frame.to_csv(path + file_name)

In [None]:
# Key view of the word counts; the edge-building loop uses it for membership tests.
uw = unique_words.keys()

In [None]:
# Co-occurrence counts between words: key = (word, word) tuple, value = number of
# co-occurring positions summed over all tweets
network = {}
network_key = 0
for index, row in tweets.iterrows():
    # NOTE(review): this reads 'clean_text' while the word counts were built from
    # 'extra_clean_text' - confirm the intended column.
    combined_list = [word for word in str.split(row["clean_text"], " ") if word in uw]
    # Each unordered pair of positions is visited once
    for first, second in itertools.combinations(combined_list, 2):
        # The graph is undirected and self-loops are not taken into account
        if first == second:
            continue
        # An edge is stored under the orientation in which it was first seen
        flipped = (second, first)
        pair_key = flipped if flipped in network else (first, second)
        network[pair_key] = network.get(pair_key, 0) + 1

network_df = pd.DataFrame.from_dict(network, orient="index")

In [None]:
# Move the pair out of the index, name the columns and show the heaviest edges
network_df = network_df.reset_index()
network_df.columns = ["pair", "weight"]
network_df = network_df.sort_values(by="weight", ascending=False)
network_df.head(20)

In [None]:
# A weighted graph is built from a list of 3-element tuples (u, v, w), where u and v
# are nodes and w is the weight of the edge between them.
# To filter edges by weight, add a condition such as `if weight > 1` to the comprehension.
up_weighted = [(source, target, weight) for (source, target), weight in network.items()]

G = nx.Graph()
G.add_weighted_edges_from(up_weighted)

In [None]:
# Size of the word graph: number of nodes, then number of edges
print(G.number_of_nodes())
print(G.number_of_edges())

In [None]:
# Write the word graph as comma-separated "node,node,weight" lines
filename = f"{path}edgelist_words_2022.csv"
nx.write_weighted_edgelist(G, filename, delimiter=",")

In [None]:
# Node list for the word graph: every word as Id, with a Label column equal to the Id
word_nodes = (
    pd.DataFrame.from_dict(unique_words, orient="index")
    .reset_index()
    .rename(columns={"index": "Id", 0: "delete"})
    .drop(columns=['delete'])
)
word_nodes["Label"] = word_nodes["Id"]

word_nodes

In [None]:
# Save the word node list without the DataFrame index column.
word_nodes.to_csv(path+"nodelist_words_2022.csv",index=False)

### Hashtags

In [None]:
# Replace missing `entities` values with None; the hashtag loop below skips rows
# whose entities are None.
tweets.loc[tweets["entities"].isnull(), "entities"] = None

In [None]:
# Initialise the per-tweet hashtag column with empty strings; the hashtag loop
# overwrites it with a list for tweets that have hashtags.
tweets["hashtags"] = ""

In [None]:
# Count how often each lower-cased hashtag occurs and store every tweet's
# hashtag list on its own row
unique_hashtags = {}

for idx, entities in tweets["entities"].items():
    # Only dict payloads can carry hashtags. Checking the type also skips missing
    # values whether they are stored as None or as NaN; a NaN would otherwise
    # raise on the `in` test.
    if not isinstance(entities, dict) or "hashtags" not in entities:
        continue
    hl = [hashtag["tag"].lower() for hashtag in entities["hashtags"]]
    for tag in hl:
        unique_hashtags[tag] = unique_hashtags.get(tag, 0) + 1

    tweets.at[idx, "hashtags"] = hl

In [None]:
# Re-create the dict with the most frequent hashtags first
unique_hashtags = {
    tag: occurrences
    for tag, occurrences in sorted(unique_hashtags.items(), key=lambda entry: entry[1], reverse=True)
}

In [None]:
# Hashtag frequency table with columns Hashtag and Count
uh_df = (
    pd.DataFrame.from_dict(unique_hashtags, orient='index')
    .reset_index()
    .rename(columns={'index': 'Hashtag', 0: 'Count'})
)

In [None]:
# First 20 rows of the hashtag table
uh_df.iloc[0:20]

In [None]:
# Store the hashtag frequency table on Drive
uh_df.to_csv(f"{path}hashtags_2022.csv")

In [None]:
# Key view of the hashtag counts (not referenced again in the visible cells).
uh = unique_hashtags.keys()

In [None]:
# Co-occurrence counts between hashtags: key = (tag, tag) tuple, value = number of
# co-occurring positions summed over all tweets
network = {}
network_key = 0
for index, row in tweets.iterrows():
    combined_list = list(row["hashtags"])
    # Each unordered pair of positions is visited once
    for first, second in itertools.combinations(combined_list, 2):
        # The graph is undirected and self-loops are not taken into account
        if first == second:
            continue
        # An edge is stored under the orientation in which it was first seen
        flipped = (second, first)
        pair_key = flipped if flipped in network else (first, second)
        network[pair_key] = network.get(pair_key, 0) + 1

network_df = pd.DataFrame.from_dict(network, orient="index")

In [None]:
# Move the pair out of the index, name the columns and show the heaviest edges
network_df = network_df.reset_index()
network_df.columns = ["pair", "weight"]
network_df = network_df.sort_values(by="weight", ascending=False)
network_df.head(20)

In [None]:
# A weighted graph is built from a list of 3-element tuples (u, v, w), where u and v
# are nodes and w is the weight of the edge between them.
# To filter edges by weight, add a condition such as `if weight > 1` to the comprehension.
up_weighted = [(source, target, weight) for (source, target), weight in network.items()]

G = nx.Graph()
G.add_weighted_edges_from(up_weighted)

In [None]:
# Size of the hashtag graph: number of nodes, then number of edges
print(G.number_of_nodes())
print(G.number_of_edges())

In [None]:
# Write the hashtag graph as comma-separated "node,node,weight" lines
filename = f"{path}edgelist_hashtags_2022.csv"
nx.write_weighted_edgelist(G, filename, delimiter=",")

In [None]:
# Node list for the hashtag graph: every hashtag as Id, with a Label column equal to the Id
hashtag_nodes = uh_df.drop(columns=['Count']).rename(columns={"Hashtag": "Id"})
hashtag_nodes["Label"] = hashtag_nodes["Id"]
hashtag_nodes

In [None]:
# Save the hashtag node list without the DataFrame index column.
hashtag_nodes.to_csv(path+"nodelist_hashtags_2022.csv",index=False)

## Extracting main co-occurrences with the word victim

In [None]:
# NOTE(review): the edge list is read from the working directory, whereas the
# edge lists above are written under `path` - confirm the file location.
words_2017 = pd.read_csv('edgelist_words_2017.csv', names=['word1', 'word2', 'count'])
# Keep every edge in which any field, read as text, contains 'victim'
mentions_victim_2017 = words_2017.astype(str).apply(lambda column: column.str.contains('victim')).any(axis=1)
victim_2017 = words_2017[mentions_victim_2017]
# No sorting here: victim_2017 = victim_2017.sort_values('count', axis=0, ascending=False) stays disabled

In [None]:
# NOTE(review): the edge list is read from the working directory, whereas the
# edge lists above are written under `path` - confirm the file location.
words_2018 = pd.read_csv('edgelist_words_2018.csv', names=['word1', 'word2', 'count'])
# Keep every edge in which any field, read as text, contains 'victim'
mentions_victim_2018 = words_2018.astype(str).apply(lambda column: column.str.contains('victim')).any(axis=1)
victim_2018 = words_2018[mentions_victim_2018]
# No sorting here: victim_2018 = victim_2018.sort_values('count', axis=0, ascending=False) stays disabled

In [None]:
# NOTE(review): the edge list is read from the working directory, whereas the
# edge lists above are written under `path` - confirm the file location.
words_2022 = pd.read_csv('edgelist_words_2022.csv', names=['word1', 'word2', 'count'])
# Keep every edge in which any field, read as text, contains 'victim'
mentions_victim_2022 = words_2022.astype(str).apply(lambda column: column.str.contains('victim')).any(axis=1)
victim_2022 = words_2022[mentions_victim_2022]
# No sorting here: victim_2022 = victim_2022.sort_values('count', axis=0, ascending=False) stays disabled

In [None]:
def create_related(df):
    """Rank the words found in an edge table, leaving out the word 'victim' itself.

    *df* needs the columns 'word1', 'word2' and 'count'. Every entry of either
    word column that is not exactly 'victim' is collected together with its
    row's count; a word collected more than once keeps its largest count.

    Returns a DataFrame with the columns 'word' and 'count', ordered by
    descending count.
    """
    from_first = df.loc[df['word1'] != 'victim', ['word1', 'count']].rename(columns={'word1': 'word'})
    from_second = df.loc[df['word2'] != 'victim', ['word2', 'count']].rename(columns={'word2': 'word'})
    strongest = pd.concat([from_first, from_second]).groupby('word', as_index=False).max()
    return strongest.sort_values('count', axis=0, ascending=False)

In [None]:
# relatedwords17 was displayed without ever being assigned: build it from the
# 2017 'victim' edges with create_related, then show its five top-ranked words
relatedwords17 = create_related(victim_2017)
relatedwords17.head(5)

In [None]:
# relatedwords18 was displayed without ever being assigned: build it from the
# 2018 'victim' edges with create_related, then show its five top-ranked words
relatedwords18 = create_related(victim_2018)
relatedwords18.head(5)

In [None]:
# relatedwords22 was displayed without ever being assigned: build it from the
# 2022 'victim' edges with create_related, then show its five top-ranked words
relatedwords22 = create_related(victim_2022)
relatedwords22.head(5)

In [None]:
df1 = relatedwords17
df2 = relatedwords18
df3 = relatedwords22

# First five rows of each year's table
df1_top_5, df2_top_5, df3_top_5 = df1.head(5), df2.head(5), df3.head(5)

# The words of those rows, as arrays
df1_words = df1_top_5['word'].values
df2_words = df2_top_5['word'].values
df3_words = df3_top_5['word'].values

# Words present in all three top-5 lists
common_words = set(df1_words) & set(df2_words) & set(df3_words)

# Per year: top-5 words that are not shared by all three years
df1_not = set(df1_words).difference(common_words)
df2_not = set(df2_words).difference(common_words)
df3_not = set(df3_words).difference(common_words)

# Per year: top-5 words that appear in neither of the other two top-5 lists
df1_not_in_top_5 = set(df1_words) - (set(df2_words) | set(df3_words))
df2_not_in_top_5 = set(df2_words) - (set(df1_words) | set(df3_words))
df3_not_in_top_5 = set(df3_words) - (set(df1_words) | set(df2_words))

# Per year: its own top-5 words plus the non-common words of the other two years
df1_words_to_keep = [*df1_words, *df2_not, *df3_not]
df2_words_to_keep = [*df2_words, *df1_not, *df3_not]
df3_words_to_keep = [*df3_words, *df1_not, *df2_not]

In [None]:
# Keep only the selected words in each table and name its count column after the year
df1 = df1.loc[df1['word'].isin(df1_words_to_keep)].rename(columns={'count': '2017'})
df2 = df2.loc[df2['word'].isin(df2_words_to_keep)].rename(columns={'count': '2018'})
df3 = df3.loc[df3['word'].isin(df3_words_to_keep)].rename(columns={'count': '2022'})

# Outer merges keep a word even when it is missing from one of the tables
result = df1.merge(df2, on='word', how='outer').merge(df3, on='word', how='outer')
result

In [None]:
# Fill NaN values with 0
result = result.fillna(0)
colors = ['slateblue', 'coral', 'orange','red', 'orchid', 'pink', 'lightblue', 'green', 'palevioletred', 'navy', 'firebrick','darkgray', 'lightgreen']

# x positions of the three value columns, defined before the loop so the tick
# call below still works when `result` has no rows.
# Presumably 1, 2, 6 mirror the gaps between 2017, 2018 and 2022 - confirm.
x = [1, 2, 6]
year_columns = result.columns[1:]

# Plot data: one line per word. The palette is cycled because a plain zip stops
# at the shorter input and would silently leave out every word beyond the
# number of colours.
for word, color in zip(result['word'], itertools.cycle(colors)):
    y = result.loc[result['word'] == word, year_columns].values[0]
    plt.plot(x, y, marker='o', linestyle='-', label=word, color=color)

plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)
plt.xticks(x, year_columns)
plt.xlabel('Year')
plt.ylabel('Count')
plt.title('Appearences with the word "victim"')
plt.show()