- The dataset
- Exploratory analysis
- Graph Construction
- Significant Character Tropes
- Most Common Roles, Actions and Descriptions
library(tidyverse)
library(quanteda) #for text cleaning
library(igraph) #for creating graphs
library(visNetwork) #for visualizing graphs
library(gridExtra)
library(gtable)
library(wordcloud)
source("calculatecoocstats.R") #calculate co-occurrence statistics
source("grapher.R") #create graph
source("graphervf.R") #grapher 2
source("grapherdemo.R") #other grapher
source("token_filter.R") #filter tokens
load("token.all.RData")
First I retrieved movie plots from Wikipedia using the rvest package.
library(rvest)
I wrote a function to retrieve ‘n’ number of plots for each year across a specified decade. I made the function return a string of plots for the entire decade.
plot_scraper <- function(decade = 1940, n = 200){
# Scrape up to `n` Wikipedia movie plots per year across one decade.
# decade: first year of the decade (e.g. 1940); n: max plots sampled per year.
# Returns a character vector of all non-empty plot texts for the decade.
# NOTE(review): performs network I/O via rvest and assumes the layout of
# Wikipedia's "List of American films of <year>" pages -- may break if the
# page structure changes.
s <- character() #accumulates plots for the entire decade
for(j in 0 : 9){ #one iteration per year in the decade
year_full = decade + j
print(year_full) #progress indicator
#build the year-list URL directly from the year
#(the original reassembled it from substr() pieces; this is equivalent)
url <- paste("https://en.wikipedia.org",
             "/wiki/List_of_American_films_of_",
             year_full, sep = "")
page <- read_html(url) #read html from url
#film title links sit inside italic cells of the wikitable
links <- html_nodes(page, "table.wikitable td i a")
links.href <- html_attr(links, "href") #extract hyperlinks to movie pages
plot.links <- paste("https://en.wikipedia.org", links.href, sep = "")
#detect and delete dead/red links and known-buggy pages (important)
bad <- str_detect(plot.links, "redlink|Fly_by_Night|Monolith|The_Cruel_Path")
plot.links <- plot.links[!bad]
#take a random sample of size n from obtained links
if(length(plot.links) > n){plot.links = sample(plot.links, n)}
#preallocate; the original looped 1:n and relied on NA indexing when fewer
#than n links existed -- seq_along avoids that entirely
plot <- character(length(plot.links))
for(i in seq_along(plot.links)){
#keep only paragraphs under the "Plot" heading
plot[i] <- plot.links[i] %>% read_html() %>%
html_nodes(xpath = '//p[preceding::h2[1][contains(.,"Plot")]]') %>%
html_text() %>% paste(collapse = "\n")
print(i) #progress
}
#movies without a Plot section yield "" - drop them
#mostly the less popular ones
s_ind <- plot[plot != ""]
s <- c(s, s_ind) #append this year's plots to the decade accumulator
}
return(s)
}
I also wrote a function to tokenise a given string with information about the part of speech of each word using the spacy library. Additionally, I used the genderizeR package to classify character names by gender. Using this information, I converted all male character names to a single category and all female character names to another single category.
#connect your python to the environment where you installed spacy
reticulate::use_virtualenv("~/spacynlp-env", required = TRUE)
reticulate::py_config() #check whether configuration is right
#load libraries
library(spacyr) #for NLP
spacy_initialize(model = "en_core_web_sm") #spacy language model
library(quanteda) #for text cleaning
library(genderizeR) #for assigning gender
film_tokenizer <- function(plot_string, metadata,
                           apikey = "31d8c048c93f385ba2de144836d8d0f5"){
# Tokenise movie plots with POS tags via spacy and collapse recognised
# character names into generic "Male/CHARACTERS" / "Female/CHARACTERS" tokens
# using genderizeR.
# plot_string: character vector of plot texts.
# metadata: document variables attached to the corpus (year/decade per plot).
# apikey: genderize.io API key. Previously hard-coded; now an overridable
#   default for backward compatibility. SECURITY NOTE: avoid committing real
#   API keys -- prefer passing one read from an environment variable.
# Returns a quanteda tokens object with a `decade` docvar.
print("converting into corpus") #print status
# BUG FIX: the original referenced undefined `pllot_strings`; the parameter
# is `plot_string`.
s <- corpus(plot_string, docvars = metadata) #corpus with doc variables
s <- corpus_reshape(s, to = "sentences") #split into sentence documents
docvars_complete <- docvars(s) #per-sentence docvars, reattached at the end
#parse into tokens using spacy - this is where the magic happens
print("starting parse using spacy") #print status
toks.spacy <- spacy_parse(s) %>%
entity_consolidate() %>% #combine multi-word entities into one unit
#by replacing ' ' with '_' e.g. John Locke becomes John_Locke
as.tokens(include_pos = "pos") #include parts of speech information
#extract person entities from the text i.e. movie characters
print("tokens initial parse") #print status
ents.spacy <- spacy_parse(s, entity = TRUE) %>%
entity_extract(concatenator = "_") %>% #extract entities
filter(entity_type == "PERSON") %>% #keep only persons
distinct(entity) #remove duplicate characters
#extract gender information
print("starting gender extract") #print status
#genderizeR expects plain spaces, not the '_' concatenator
ents.spacy.temp <- ents.spacy %>% mutate(entity = str_replace_all(entity, "[_]", " "))
print("finding given names") #print status
givenNames = findGivenNames(ents.spacy.temp$entity, progress = FALSE, apikey = apikey)
#find gender from given names
gender_data = genderize(ents.spacy.temp$entity, genderDB = givenNames, progress = FALSE)
#map gendered results back onto the '_'-joined entity spellings
entity.all <- gender_data %>% mutate(name = ents.spacy$entity) %>% select(name, gender)
print("gender extraction complete") #print status
entity.male <- entity.all %>% filter(gender == "male") #male entities
entity.female <- entity.all %>% filter(gender == "female") #female entities
n.males <- nrow(entity.male) #count male entities
n.female <- nrow(entity.female) #count female entities
#replacement vectors: one generic label per matched character name
m.repl <- rep("Male/CHARACTERS", n.males) #males
f.repl <- rep("Female/CHARACTERS", n.female) #females
#create final tokens
print("creating tokens") #print status
toks.all <- toks.spacy %>%
tokens_select(pattern = c("*/NOUN", "*/VERB", "*/ENTITY", "*/ADJ")) %>% #keep nouns, verbs, adjectives, entities
tokens_replace(pattern = paste(entity.male$name, "/ENTITY", sep = ""), replacement = m.repl) %>% #generalize male characters
tokens_replace(pattern = paste(entity.female$name, "/ENTITY", sep = ""), replacement = f.repl) %>% #generalize female characters
tokens_remove(c("", "'s", "-", "ex", "-/NOUN", "*/ENTITY", "-/ADJ", "-/VERB")) #remove buggy tokens
print("done") #print status
toks.all$decade = docvars_complete #assign decade docvars to tokens
return(toks.all)
}
I created the dataset for my analysis using the two functions above. First I used the plot scraper function to extract movie plots from all decades.
#scrape all plots across all decades
s_all.i <- character() #all plot texts across all decades
year_plots <- data.frame() #bookkeeping: number of plots scraped per decade
for(i in seq(from=1940, to=2010, by=10)){ #each decade from 1940 to 2010
print(paste(i, "start", sep = " - ")) #print status
s_this.i <- plot_scraper(i, 200) #up to 200 plots per year across decade i
s_all.i <- c(s_all.i, s_this.i) #append to overall data
n_plots = length(s_this.i) #number of plots obtained for this decade
year_plots.temp <- data.frame(year = as.character(i), times = n_plots) #organise
year_plots <- rbind(year_plots, year_plots.temp) #bind to per-decade counts
}
# BUG FIX: the original referenced `year_plots.final`, which is never defined;
# the accumulator built above is `year_plots`.
s_docvars <- rep(year_plots$year, times = year_plots$times) #docvars for tokenizing (year info per plot)
After extracting all the movie plots, I used the film tokenizer function to tokenize the movie plots.
#tokenise data - POS-tagged tokens with per-sentence decade docvars
token.all <- film_tokenizer(s_all.i, s_docvars)
A look at a sample of the final dataset
head(token.all, 5)
## Tokens consisting of 5 documents and 1 docvar.
## text1.1 :
## [1] "father/NOUN" "Male/CHARACTERS" "Male/CHARACTERS" "joins/VERB"
## [5] "promised/VERB" "land/NOUN"
##
## text1.2 :
## [1] "Male/CHARACTERS" "learns/VERB" "father/NOUN" "died/VERB"
## [5] "military/ADJ" "expedition/NOUN" "consoled/VERB" "schoolmate/NOUN"
## [9] "friend/NOUN" "Male/CHARACTERS" "Male/CHARACTERS"
##
## text1.3 :
## [1] "Male/CHARACTERS" "adult/NOUN" "accomplished/ADJ"
## [4] "backwoodsman/NOUN" "sells/VERB" "family/NOUN"
## [7] "farm/NOUN" "order/NOUN" "settle/VERB"
##
## text1.4 :
## [1] "saying/VERB" "Male/CHARACTERS" "played/VERB" "adult/NOUN"
## [5] "Male/CHARACTERS" "Male/CHARACTERS" "tricked/VERB" "meeting/VERB"
## [9] "several/ADJ" "members/NOUN" "high/ADJ" "society/NOUN"
## [ ... and 10 more ]
##
## text1.5 :
## [1] "snub/VERB" "discover/VERB" "common/ADJ" "farmer/NOUN"
## [5] "landed/ADJ" "gentlemen/NOUN" "Male/CHARACTERS" "implied/VERB"
token.all <- tokens_tolower(token.all) #convert all tokens to lower
token.all = token.all %>% tokens_remove(c('ex/adj', 'ex/noun')) #drop leftover "ex" artifacts
#sample based on min in a decade
set.seed(42) #reproducibility for the (currently disabled) balanced sampling below
#token.all = tokens_sample(token.all, size = 22638, replace = FALSE, prob = NULL, by = decade)
Since the data obtained is unequal for each decade due to some inherent factors, I wanted to check how certain parameters are distributed across the decades.
#plot number of movies across decades
load('n_movies.Rdata') #loads `n_movies`: per-decade plot counts (year, n)
head(n_movies)
ggplot(n_movies, aes(x = as.factor(year), y = n)) +
geom_bar(stat = 'identity', width = 0.5, color = 'black',
position = position_dodge(width = 0.4)) +
ylab('no. of plots') + xlab('decade') +
geom_text(aes(label = n, vjust=-0.3), size = 3) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line('black'), axis.line.y = element_blank(), axis.text.y = element_blank(), axis.ticks.y = element_blank())
#average number of sentences per plot across decades
sents_df = data.frame(decade = as.character(),
n_sents = as.numeric())
# NOTE(review): rbind inside the loop grows the frame; acceptable for 8 decades
for(i in seq(1940, 2010, 10)){
n_sents = ndoc(tokens_subset(token.all, decade == i)) #sentence-docs in decade i
sents_t = data.frame(decade = as.character(i),
n_sents = as.numeric(n_sents))
sents_df = rbind(sents_df, sents_t)
}
# assumes sents_df rows align with n_movies rows (same decade order) -- TODO confirm
n_movies$sents_per_plot <- sents_df$n_sents/n_movies$n
ggplot(n_movies, aes(x = as.factor(year), y = sents_per_plot)) +
geom_bar(stat = 'identity', width = 0.5, color = 'black',
position = position_dodge(width = 0.4)) +
ylab('sentences/plot') + xlab('decade') +
geom_text(aes(label = round(sents_per_plot, 2), vjust=-0.3), size = 3) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line('black'), axis.line.y = element_blank(), axis.text.y = element_blank(), axis.ticks.y = element_blank())
#plot the number of sentences in each decade
ggplot(sents_df, aes(x = decade, n_sents)) +
geom_bar(stat = 'identity', width = 0.5, color = 'black',
position = position_dodge(width = 0.4)) +
ylab('no. of sentences') +
geom_text(aes(label = round(n_sents, 2), vjust=-0.3), size = 3) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line('black'), axis.line.y = element_blank(), axis.text.y = element_blank(), axis.ticks.y = element_blank())
#plot number of words per sentence across decades
words_df = data.frame(decade = as.character(),
n_words = as.numeric())
for(i in seq(1940, 2010, 10)){
n_words = sum(ntoken(tokens_subset(token.all, decade == i))) #total tokens in decade i
words_t = data.frame(decade = as.character(i),
n_words = as.numeric(n_words))
words_df = rbind(words_df, words_t)
}
words_df$wordspsents = words_df$n_words/sents_df$n_sents #words per sentence
ggplot(words_df, aes(x = decade, y = wordspsents)) +
geom_bar(stat = 'identity', width = 0.5, color = 'black',
position = position_dodge(width = 0.4)) +
ylab('words/sentence') +
geom_text(aes(label = round(wordspsents, 2), vjust=-0.3), size = 3) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line('black'), axis.line.y = element_blank(), axis.text.y = element_blank(), axis.ticks.y = element_blank())
# Build an igraph co-occurrence network around 'male/characters' and
# 'female/characters': primary co-occurrences of each gender node plus the
# secondary co-occurrences of each primary term. Returns a list:
# [[1]] styled igraph object, [[2]] female primary co-occs (named LLR vector),
# [[3]] male primary co-occs (named LLR vector).
# Depends on `calculateCoocStatistics()` from calculatecoocstats.R.
grapherdemo <- function(numberOfCoocs, toks, measure = "LOGLIK"){
#oppositeg = ifelse(coocTerm == 'male/characters', 'female/characters', 'male/characters')
#coocTerm = 'male/characters'
#### graph df function
# graph_df: for one seed term, build an edge data frame (from, to, sig) of
# its top co-occurrences and their own top co-occurrences.
graph_df <- function(coocTerm){
minimumFrequency = 10
# binary (boolean-weighted) document-term matrix, rare terms trimmed
binDTM <- toks %>%
dfm() %>%
dfm_trim(min_docfreq = minimumFrequency) %>%
dfm_weight("boolean")
coocs <- calculateCoocStatistics(coocTerm, binDTM, measure)
# Display the numberOfCoocs main terms
imm.coocs <- names(coocs[1:numberOfCoocs])
logs <- coocs[1:numberOfCoocs]
resultGraph <- data.frame(from = character(), to = character(), sig = numeric(0))
# The structure of the temporary graph object is equal to that of the resultGraph
tmpGraph <- data.frame(from = character(), to = character(), sig = numeric(0))
# Fill the data.frame to produce the correct number of lines
tmpGraph[1:numberOfCoocs, 3] <- coocs[1:numberOfCoocs]
# Entry of the search word into the first column in all lines
tmpGraph[, 1] <- coocTerm
# Entry of the co-occurrences into the second column of the respective line
tmpGraph[, 2] <- names(coocs)[1:numberOfCoocs]
# Set the significances
tmpGraph[, 3] <- coocs[1:numberOfCoocs]
# Attach the triples to resultGraph
resultGraph <- rbind(resultGraph, tmpGraph)
# Iteration over the most significant numberOfCoocs co-occurrences of the search term
for (i in 1:numberOfCoocs){
# Calling up the co-occurrence calculation for term i from the search words co-occurrences
newCoocTerm <- names(coocs)[i]
coocs2 <- calculateCoocStatistics(newCoocTerm, binDTM, measure="LOGLIK")
#print the co-occurrences
coocs2[1:10]
# Structure of the temporary graph object
tmpGraph <- data.frame(from = character(), to = character(), sig = numeric(0))
tmpGraph[1:numberOfCoocs, 3] <- coocs2[1:numberOfCoocs]
tmpGraph[, 1] <- newCoocTerm
tmpGraph[, 2] <- names(coocs2)[1:numberOfCoocs]
tmpGraph[, 3] <- coocs2[1:numberOfCoocs]
# Append the result to the result graph, skipping the first (self) edge
resultGraph <- rbind(resultGraph, tmpGraph[2:length(tmpGraph[, 1]), ])
}
# Sample of some examples from resultGraph
#resultGraph[sample(nrow(resultGraph), 6), ]
# Return: [[1]] primary co-occ names, [[2]] edge frame, [[3]] primary LLRs
list_graph = list()
list_graph[[1]] = imm.coocs
list_graph[[2]] = resultGraph
list_graph[[3]] = logs
return(list_graph)
}
# Build and merge the male and female sub-networks
male_list = graph_df('male/characters')
female_list = graph_df('female/characters')
male = male_list[[2]]
male_coocs = male_list[[1]]
male_logs = male_list[[3]]
female = female_list[[2]]
female_coocs = female_list[[1]]
female_logs = female_list[[3]]
complete = rbind(male, female)
tail(complete)
complete <- distinct(complete) #drop duplicate edges
nrow(complete)
# set seed for graph plot
set.seed(42)
# NOTE(review): resultGraph is copied BEFORE the rename below, so its third
# column keeps the name 'sig' (used for E(...)$sig further down); `complete`
# gets 'weight'. R's copy-on-modify semantics make these independent.
resultGraph = complete
names(complete)[3] = 'weight'
head(complete)
# Create the graph object as undirected graph
graphNetwork <- graph.data.frame(resultGraph, directed = F)
E(graphNetwork)$weight = complete$weight
is_weighted(graphNetwork)
# Identification of all nodes with less than 2 edges
# NOTE(review): computed but never used -- the deletion below is commented out
verticesToRemove <- V(graphNetwork)[degree(graphNetwork) < 2]
# These edges are removed from the graph
#graphNetwork <- delete.vertices(graphNetwork, verticesToRemove)
#imm.coocs
#for vertices #####
#male to female - not needed
#ftm = rowSums(ends(graphNetwork, es = E(graphNetwork), names = T) == c('female/characters', 'male/characters'))
#female primary nodes: edges where 'female/characters' is either endpoint
fto = ends(graphNetwork, es = E(graphNetwork), names = T)[,1] == 'female/characters'
fto2 = ends(graphNetwork, es = E(graphNetwork), names = T)[,2] == 'female/characters'
#male primary nodes: edges where 'male/characters' is either endpoint
mto = ends(graphNetwork, es = E(graphNetwork), names = T)[,1] == 'male/characters'
mto2 = ends(graphNetwork, es = E(graphNetwork), names = T)[,2] == 'male/characters'
#female connections (opposite endpoint of each female edge)
fc = ends(graphNetwork, es = E(graphNetwork), names = T)[,2][as.logical(fto)]
fc2 = ends(graphNetwork, es = E(graphNetwork), names = T)[,1][as.logical(fto2)]
#male connections (opposite endpoint of each male edge)
mc = ends(graphNetwork, es = E(graphNetwork), names = T)[,2][as.logical(mto)]
mc2 = ends(graphNetwork, es = E(graphNetwork), names = T)[,1][as.logical(mto2)]
main_cm = c(mc, mc2)
main_cf = c(fc, fc2)
maf = intersect(main_cm, main_cf) #nodes connected to both genders (unused below)
intersect = intersect(male_coocs, female_coocs) #shared primary co-occurrences
# Assign colors to nodes (male blue, female orange, shared purple, others grey)
# NOTE(review): first branch uses == with a length-1 c(); works here but %in%
# would be the conventional form -- left unchanged.
V(graphNetwork)$color <- ifelse(V(graphNetwork)$name == c('male/characters'), adjustcolor('cornflowerblue', alpha = 0.9),
ifelse(V(graphNetwork)$name %in% c('female/characters'), adjustcolor('orange', alpha = 0.9),
ifelse(V(graphNetwork)$name %in% c(intersect), adjustcolor('purple', alpha = 0.8),
ifelse(V(graphNetwork)$name %in% male_coocs, adjustcolor('cornflowerblue', alpha = 0.8),
ifelse(V(graphNetwork)$name %in% female_coocs, adjustcolor('orange', alpha = 0.9), adjustcolor('grey', alpha = 0.4))))))
#V(graphNetwork)$color <- ifelse(V(graphNetwork)$name %in% fc, 'orange', V(graphNetwork)$color)
# Set edge colors
#E(graphNetwork)$color <- adjustcolor("DarkGray", alpha.f = .5)
# scale significance between 1 and 10 for edge width
# (edge attribute 'sig' comes from resultGraph's third column via graph.data.frame)
E(graphNetwork)$width <- scales::rescale(E(graphNetwork)$sig, to = c(1, 10))
E(graphNetwork)$color <- adjustcolor('DarkGray', alpha.f = 0.4)
# Set edges with radius
E(graphNetwork)$curved <- 0.15
# Size the nodes by their degree of networking (scaled between 10 and 25)
V(graphNetwork)$size <- scales::rescale(degree(graphNetwork), to = c(10, 25))
# Define the frame and spacing for the plot
par(mai=c(0,0,1,0))
graph_list <- list()
graph_list[[1]] <- graphNetwork #network object
graph_list[[2]] <- female_logs #female primary co-occs and significance
graph_list[[3]] <- male_logs #male primary co-occs and significance
return(graph_list)
}
#add shiny toggle secondary, shiny toggle nodes
# Small demo network (5 co-occurrences per seed) with saved layout coordinates
graph_demo = grapherdemo(5, token_filter3('all', 1940, 2020, token.all))
g_demo = graph_demo[[1]]
#tkplot(g_demo) #GUI to adjust coordinates of vertices
#coords_demo <- tkplot.getcoords(1) #save coordinates to an object
#save(coords_demo, file = 'coords_demo.RData') #save object
load(file= 'coords_demo.RData')
plot(g_demo, vertex.label.cex = 0.6, vertex.label.dist = 0,
edge.curved=FALSE, layout = coords_demo)
# Full network: 21 co-occurrences per seed term
graph = grapherdemo(21, token_filter3('all', 1940, 2020, token.all)) #create graph
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
female_primary = graph[[2]] #female primary nodes (named LLR vector)
male_primary = graph[[3]] #male primary nodes (named LLR vector)
g = graph[[1]] #save graph as g
#visIgraph(g) #%>% visNodes(font = list(size = 26)) #display
# Outline only primary nodes and the female seed in black
plot(g, vertex.size = 3, vertex.label = NA,
vertex.label.dist = 0, vertex.frame.color = ifelse(V(g)$name %in% c(names(male_primary), names(female_primary), 'female/characters'), adjustcolor('black', alpha.f = 1), adjustcolor('darkgray', alpha.f = 0.2)),
edge.curved=FALSE)
#Top secondary co-occurences
#male
# NOTE(review): distances() on this weighted graph treats LLR as path cost;
# sorting decreasing keeps the HEAVIEST paths -- confirm this is intended.
dmat = distances(graph[[1]], v=V(graph[[1]]), to='male/characters') #compute path weights
male_c = dmat[, 'male/characters'] #secondary to male
male_c = sort(male_c, decreasing = T)[1:21] #keep top 21 entries
#female
fmat = distances(graph[[1]], v=V(graph[[1]]), to='female/characters') #compute path weights
female_c = fmat[, 'female/characters'] #secondary to female
female_c = sort(female_c, decreasing = T)[1:21] #keep top 21 entries
#store all secondary
allc = c(male_c, female_c)
allc = sort(allc, decreasing = T) #sort decreasing
#edge colors
all_edges = ends(g, es = E(g), names = T) #store all edges (endpoint name matrix)
all_edges = as.data.frame(all_edges) #convert to dataframe
#check
all_edges$V2[all_edges$V1 == 'female/characters']
## [1] "daughter/noun" "sister/noun" "love/noun"
## [4] "mother/noun" "husband/noun" "relationship/noun"
## [7] "affair/noun" "house/noun" "marriage/noun"
## [10] "girl/noun" "marry/verb" "woman/noun"
## [13] "pregnant/adj" "wedding/noun" "married/verb"
## [16] "love/noun" "mother/noun" "relationship/noun"
## [19] "affair/noun" "marriage/noun" "girl/noun"
## [22] "woman/noun" "pregnant/adj" "wedding/noun"
#male_c = male_c[names(male_c != 'beach/noun')]
male.sec_bool <- all_edges$V2 %in% names(male_c) #create bool of all male secondary co-oocs
female.sec_bool <- all_edges$V2 %in% names(female_c) #create bool of all female secondary co-oocs
edge.start <- ends(g, es = E(g), names = F)[,1] #numeric start vertex of each edge
# E(g)$color <- ifelse(male.sec_bool == TRUE, V(g)$color[edge.start],
# ifelse(female.sec_bool == TRUE, V(g)$color[edge.start],
# adjustcolor('grey', alpha=0.4)))
#primary nodes that actually carry a secondary path, per gender
male_ps = intersect(all_edges$V1[male.sec_bool], names(male_primary))
female_ps = intersect(all_edges$V1[female.sec_bool], names(female_primary))
all_edges$V1[female.sec_bool]
## [1] "friend/noun" "friend/noun" "is/verb"
## [4] "is/verb" "is/verb" "kill/verb"
## [7] "takes/verb" "love/noun" "love/noun"
## [10] "love/noun" "love/noun" "love/noun"
## [13] "love/noun" "love/noun" "love/noun"
## [16] "relationship/noun" "relationship/noun" "relationship/noun"
## [19] "wedding/noun" "wedding/noun" "wedding/noun"
#color only primary tropes that have a path
#mprimary_tropes = c('is/verb', 'friend/noun', 'takes/verb', 'tells/verb',
# 'kill/verb', 'agent/noun', 'help/noun',
# 'brother/noun', 'former/adj')
mprimary_tropes = male_ps
mprimary_tropes = mprimary_tropes[mprimary_tropes != 'female/characters']
m_pcolor = paste('male/characters', mprimary_tropes) #edge keys "seed primary"
all_edges$V3 = paste(all_edges$V1, all_edges$V2)
mp_bool = all_edges$V3 %in% m_pcolor
#fprimary_tropes = c('love/noun', 'marriage/noun', 'relationship/noun',
# 'tells/verb')
fprimary_tropes = female_ps
f_pcolor = paste('female/characters', fprimary_tropes)
all_edges$V3 = paste(all_edges$V1, all_edges$V2)
all_edges$V1[all_edges$V2 == 'tells/verb']
## [1] "male/characters" "male/characters"
fp_bool = all_edges$V3 %in% f_pcolor
#color edges: highlighted paths inherit their start vertex color, rest grey
E(g)$color <- adjustcolor('grey', alpha=0.9)
E(g)$color <- ifelse(mp_bool == TRUE, V(g)$color[edge.start],
ifelse(fp_bool == TRUE, V(g)$color[edge.start],
ifelse(male.sec_bool == TRUE, V(g)$color[edge.start],
ifelse(female.sec_bool == TRUE, V(g)$color[edge.start],
adjustcolor('grey', alpha=0.2)))))
#visIgraph(g)
#all_edges$V3[malet_bool]
#recolor vertices: seeds, gendered tropes, shared tropes, secondary, background
V(g)$color <- ifelse(V(g)$name == c('male/characters'), adjustcolor('cornflowerblue', alpha = 0.9),
ifelse(V(g)$name %in% c('female/characters'), adjustcolor('orange', alpha = 0.9),
ifelse(V(g)$name %in% c(intersect(mprimary_tropes, fprimary_tropes)), adjustcolor('purple', alpha = 0.9),
ifelse(V(g)$name %in% mprimary_tropes, adjustcolor('cornflowerblue', alpha = 0.9),
ifelse(V(g)$name %in% fprimary_tropes, adjustcolor('orange', alpha = 0.9),
ifelse(V(g)$name %in% c(names(male_c), names(female_c)), adjustcolor('darkgrey', alpha = 0.9),
adjustcolor('grey', alpha = 0.2)))))))
#V(g)$color <- when(V(g)$name %in% 'male/character', adjustcolor('red', alpha = 0.8))
#visIgraph(g)
plot(g, vertex.size = 3, vertex.label = NA,
vertex.frame.color = ifelse(V(g)$name %in% c(names(male_primary),
names(female_primary),
'female/characters'),
adjustcolor('black', alpha.f = 1),
adjustcolor('darkgray', alpha.f = 0.2)),
vertex.label.dist = 0,
edge.curved=FALSE)
#trim the network to seeds, primary and top-secondary nodes only
keep_nodes = names(c(allc, male_primary, female_primary))
keep_nodes = c(keep_nodes, 'male/characters', 'female/characters')
remove_nodes = names(V(g))[!names(V(g)) %in% keep_nodes]
g_trim <- g - remove_nodes
# visIgraph(g_trim) %>% visNodes(font = list(size = 26))
plot(g_trim, vertex.size = 3, vertex.label.cex = 0.3,
vertex.label.dist = 0,
edge.curved=FALSE)
# Path weights of most significant trope associated with each primary association
source('grapherdemo.R')
#function to return top most significant tropes
#find shortest paths in unweighted graph to all grey nodes
#find grey nodes
# For one gender seed, find the heaviest two-hop paths (seed -> primary ->
# secondary) in the global graph `g`, excluding paths routed through the
# opposite gender node. Returns a data frame (start, mid, end, V4 = path weight).
# Relies on globals g, male_primary, female_primary.
top_trope <- function(gender = 'male/characters'){
all_secondary = V(g)$name[!V(g)$name %in% c(names(male_primary), names(female_primary))]
all_secondary = all_secondary[!all_secondary %in% c('male/characters', 'female/characters')]
filter = ifelse(gender == 'male/characters', 'female/characters', 'male/characters')
#find all shortest paths (unweighted: hop count, weights = NA)
a = shortest_paths(
g,
from = gender,
to = all_secondary,
weights = NA
)
l = data.frame()
#filter paths that go through females
# NOTE(review): assumes every shortest path has exactly 3 vertices
# (start-mid-end); shorter/longer paths yield NA columns -- confirm.
for(i in 1:length(a$vpath)){
l_temp = data.frame(start = (a$vpath[[i]]$name)[1], mid = (a$vpath[[i]]$name)[2], end = (a$vpath[[i]]$name)[3])
l = rbind(l, l_temp)
}
l = l %>% filter(mid != filter) #drop paths through the opposite gender (and NA mids)
#find weights of all these paths (sum of LLR edge weights along the path)
for(i in 1:nrow(l)){
l[i, 4] = sum(E(g, path = c(l[i,1], l[i,2], l[i,3]))$weight)
}
#head(l)
l = arrange(l, desc(V4)) #heaviest paths first
l <- distinct(l)
return(l)
}
## clean up text
# Heaviest seed->primary->secondary path per primary node, for each gender
top_male = top_trope('male/characters')
top_female = top_trope('female/characters')
top_male <- top_male %>% filter(mid %in% names(male_primary))
top_female <- top_female %>% filter(mid %in% names(female_primary))
top_male <- top_male %>% group_by(mid) %>% slice(which.max(V4)) #best path per primary
top_female <- top_female %>% group_by(mid) %>% slice(which.max(V4))
top_male <- arrange(top_male, desc(V4))
top_female <- arrange(top_female, desc(V4))[1:20,] #cap female list at 20
####### stacked bar plot of paths ######################
top_male$path = paste(top_male$start, top_male$mid, top_male$end, sep = '--')
names(top_male)[4] = 'llr'
top_female$path = paste(top_female$start, top_female$mid, top_female$end, sep = '--')
names(top_female)[4] = 'llr'
##MALE######
#text cleaning: strip the '/pos' suffix from each path component
top_male = top_male %>% separate(start, c('A', NA), sep='/') %>%
separate(mid, c('B', NA), sep='/') %>%
separate(end, c('C', NA), sep='/')
top_male$path = paste(top_male$A, top_male$B, top_male$C, sep = '--')
mtt = ggplot(top_male, aes(y = reorder(path, llr), x = llr)) +
geom_point(color = 'deepskyblue') +
geom_segment(aes(x = 0, y = path, xend = llr, yend = path)) +
xlab('loglikelihood ratio') + ylab('word') +
ggtitle('Male') + geom_text(aes(label = round(llr, 2)), hjust=-0.3, size = 3) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line('black'))
#mtt
#geom_bar(stat = 'identity', fill = 'deepskyblue4',
# alpha = 0.7, color = 'black', width = 0.8)
#####FEMALE ######
#text cleaning: strip the '/pos' suffix from each path component
top_female = top_female %>% separate(start, c('A', NA), sep='/') %>%
separate(mid, c('B', NA), sep='/') %>%
separate(end, c('C', NA), sep='/')
top_female$path = paste(top_female$A, top_female$B, top_female$C, sep = '--')
ftt = ggplot(top_female, aes(y = reorder(path, llr), x = llr)) +
geom_point(color = 'darkorange') +
geom_segment(aes(x = 0, y = path, xend = llr, yend = path)) +
xlab('loglikelihood ratio') + ylab('word') +
ggtitle('Female') + geom_text(aes(label = round(llr, 2)), hjust=-0.3, size = 3) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line.x = element_blank(), axis.line.y = element_line('black'))
#ftt
#render both needle plots with a shared x limit
mtt + xlim(0, 2400) + ylab('network path') + theme(axis.line.x = element_blank(), axis.ticks.x = element_blank(), axis.text.x = element_blank())
#ggsave('male_tropes.png', width = 10, height = 5)
ftt + xlim(0, 2400) + ylab('network path') + theme(axis.ticks.x = element_blank(), axis.text.x = element_blank())
#ggsave('female_tropes.png', width = 10, height = 5)
plot_doubleword <- function(term1, term2, gender){
# Decade-by-decade trend of a two-step association strength: the LLR of
# gender -> term1 plus the LLR of term1 -> term2, with a fitted linear trend.
# term1/term2: tokens like 'love/noun'; gender: 'male' or 'female'
# (expanded to '<gender>/characters').
# Prints R^2, p and the slope estimate; returns the ggplot object.
# Relies on globals `token.all`, `grapher()` and `token_filter()`.
male = data.frame()
for(i in 0 : 7){ #decades 1940..2010
male_temp <- data.frame()
j = 1940 + 10*i
#LLR co-occurrences of the gender seed for decade j
male_temp = grapher(paste(gender,'/characters', sep=''), 10 , token_filter('all', j, token.all), "LOGLIK")[[3]][]
#male_ind$rank = 1 : nrow(male_ind) #rank words - redundant
male_temp <- male_temp %>% filter(names == term1) #keep only term1
male_temp$year = j #attach year info
male_temp$gender = "male" #NOTE(review): constant label regardless of `gender` arg; unused downstream
names(male_temp)[2] = 'll1'
male_ind = male_temp
#LLR co-occurrences of term1 itself, to score term1 -> term2
male_temp = grapher(term1, 10, token_filter('all', j, token.all), "LOGLIK")[[3]][]
male_temp <- male_temp %>% filter(names == term2) #keep only term2
names(male_temp)[2] = 'll2'
if(dim(male_temp)[1] == 0){
#term2 absent this decade: score 0 (coerces the row to character; the
#as.numeric() conversion below restores the type)
male_temp[1, ] <- c(term2, as.numeric(0))
}
#NOTE(review): cbind duplicates the 'names' column; select() keeps year/ll only
male_ind = cbind(male_temp, male_ind)
male = rbind(male, male_ind)
}
male$ll2 = as.numeric(male$ll2)
male$ll = male$ll1 + male$ll2 #combined two-step strength
male = male %>% select(year, ll)
#(a duplicated, side-effect-free significance block that stood here in the
#original has been removed; the regression is fitted once below)
#strip the '/pos' suffix for the plot title
term1 = str_split(term1, '/')[[1]][1]
term2 = str_split(term2, '/')[[1]][1]
#plot trend with linear fit
trend_plot = ggplot(male, aes(x = year, y = ll)) +
geom_point(color = "black") +
geom_line(size = 1) +
geom_smooth(method = "lm", se = TRUE, size = 1, alpha = 0.1) +
ylab("Loglikelihood Ratio") + ggtitle(paste(gender, term1, term2, sep = '--')) +
theme(axis.text = element_text(color = "black", size = 12), axis.title = element_text(color = "black", size = 14),
legend.text = element_text(color = "black", size = 12), legend.title = element_text(color = "black", size = 14),
) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line('black'))
#facet_wrap(~ gender)
#significance of the linear trend
ancova.word <- lm(ll~year, data = male)
est = summary(ancova.word)[[4]][2] #slope estimate
R2 = round(summary(ancova.word)[[8]], 2)
p = round(anova(ancova.word)[[5]][1], 2)
label1 = paste('R^2 == ', R2, sep = '')
label2 = paste('p == ', p, sep = '')
print(label1)
print(label2)
print(round(est, 2))
trend_plot
}
plot_doubleword('love/noun', 'falls/verb', 'female') #example: female--love--falls trend
## [1] "R^2 == 0.37"
## [1] "p == 0.11"
## [1] -2.58
## `geom_smooth()` using formula 'y ~ x'
ll_bar_m <- function(pos = 'noun', n = 21, xlimit = 1800, yax = 'noun'){
# Needle plot of the top male primary co-occurrences for a POS filter, plus
# side-by-side wordclouds for both genders. Saves 'male_adj_needle.png'.
# pos: POS filter passed to token_filter3; n: number of terms (minus the
# gender node) to plot; xlimit: x-axis cap; yax: y-axis label.
# Relies on globals `token.all`, `grapherdemo()` and `token_filter3()`.
graph = grapherdemo(21, token_filter3(pos, 1940, 2020, token.all))
female_primary = graph[[2]] #female primary nodes (named LLR vector)
male_primary = graph[[3]] #male primary nodes (named LLR vector)
g = graph[[1]] #save graph as g
male_primary = male_primary[names(male_primary) != 'female/characters'] #drop the opposite-gender node
male_primary = male_primary[1:(n-1)]
male_np = data.frame(word = names(male_primary), llr = male_primary)
male_np$gender = 'male'
male_np <- male_np %>% separate(word, c('word', NA), sep = '/') #strip '/pos' suffix
mn = ggplot(male_np, aes(y = reorder(word, llr), x = llr)) +
geom_point(color = 'deepskyblue') +
geom_segment(aes(x = 0, y = word, xend = llr, yend = word)) +
xlab('loglikelihood ratio') + ylab(yax) +
ggtitle('Male') + geom_text(aes(label = round(llr, 2)), hjust=-0.3, size = 3) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line.y = element_line('black')) + theme(axis.line.x = element_blank(), axis.ticks.x = element_blank(), axis.text.x = element_blank())
ggsave('male_adj_needle.png', width = 10, height = 5)
female_primary = female_primary[names(female_primary) != 'male/characters']
female_primary = female_primary[1:20]
female_np = data.frame(word = names(female_primary), llr = female_primary)
female_np$gender = 'female'
female_np <- female_np %>% separate(word, c('word', NA), sep = '/')
fn = ggplot(female_np, aes(y = reorder(word, llr), x = llr)) +
geom_point(color = 'darkorange') +
geom_segment(aes(x = 0, y = word, xend = llr, yend = word)) +
xlab('loglikelihood ratio') + ylab(yax) +
ggtitle('Female') + geom_text(aes(label = round(llr, 2)), hjust=-0.3, size = 3) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line.y = element_line('black')) + theme(axis.ticks.x = element_blank(), axis.text.x = element_blank())
par(mfrow=c(1,2))
wordcloud(words = male_np$word, freq = male_np$llr, colors = 'deepskyblue3')
# BUG FIX: the female wordcloud previously used male_np$llr as frequencies
wordcloud(words = female_np$word, freq = female_np$llr, colors = 'darkorange3')
mn + xlim(0, xlimit)
#fn + xlim(0, xlimit)
}
# Female counterpart of ll_bar_m: needle plot of the top female primary
# co-occurrences for a POS filter. Saves 'female_adj_needle.png' and returns
# the female plot. NOTE(review): this largely duplicates ll_bar_m (the male
# plot `mn` is built but never rendered here) -- candidate for consolidation.
# Relies on globals `token.all`, `grapherdemo()` and `token_filter3()`.
ll_bar_f <- function(pos = 'noun', n = 21, xlimit = 1800, yax = 'noun'){
graph = grapherdemo(21, token_filter3(pos, 1940, 2020, token.all))
female_primary = graph[[2]] #female primary nodes (named LLR vector)
male_primary = graph[[3]] #male primary nodes (named LLR vector)
g = graph[[1]] #save graph as g
male_primary = male_primary[names(male_primary) != 'female/characters'] #drop opposite-gender node
male_primary = male_primary[1:(n-1)]
male_np = data.frame(word = names(male_primary), llr = male_primary)
male_np$gender = 'male'
male_np <- male_np %>% separate(word, c('word', NA), sep = '/') #strip '/pos' suffix
mn = ggplot(male_np, aes(y = reorder(word, llr), x = llr)) +
geom_point(color = 'deepskyblue') +
geom_segment(aes(x = 0, y = word, xend = llr, yend = word)) +
xlab('loglikelihood ratio') + ylab(yax) +
ggtitle('Male') + geom_text(aes(label = round(llr, 2)), hjust=-0.3, size = 3) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line.y = element_line('black')) + theme(axis.line.x = element_blank(), axis.ticks.x = element_blank(), axis.text.x = element_blank())
female_primary = female_primary[names(female_primary) != 'male/characters']
female_primary = female_primary[1:20]
female_np = data.frame(word = names(female_primary), llr = female_primary)
female_np$gender = 'female'
female_np <- female_np %>% separate(word, c('word', NA), sep = '/')
fn = ggplot(female_np, aes(y = reorder(word, llr), x = llr)) +
geom_point(color = 'darkorange') +
geom_segment(aes(x = 0, y = word, xend = llr, yend = word)) +
xlab('loglikelihood ratio') + ylab(yax) +
ggtitle('Female') + geom_text(aes(label = round(llr, 2)), hjust=-0.3, size = 3) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line.y = element_line('black')) + theme(axis.ticks.x = element_blank(), axis.text.x = element_blank())
ggsave('female_adj_needle.png', width = 10, height = 5)
# par(mfrow=c(1,2))
# wordcloud(words = male_np$word, freq = male_np$llr, colors = 'deepskyblue3')
# wordcloud(words = female_np$word, freq = male_np$llr, colors = 'darkorange3')
#mn + xlim(0, xlimit)
fn + xlim(0, xlimit)
}
# Decade-by-decade trend of the LLR between one term and a gender seed,
# with a fitted linear trend line. term: token like 'kill/verb'; gender:
# 'male' or 'female'. Returns a ggplot object.
# Relies on globals `token.all`, `grapher()` and `token_filter()`.
plot_word_single <- function(term, gender){
male = data.frame()
for(i in 0 : 7){ #decades 1940..2010
male_temp <- data.frame()
j = 1940 + 10*i
#LLR co-occurrences of the gender seed for decade j
male_temp = grapher(paste(gender,'/characters', sep=''), 10 , token_filter('all', j, token.all), "LOGLIK")[[3]][] #get PPMI data for given decade
#male_ind$rank = 1 : nrow(male_ind) #rank words - redundant
male_temp <- male_temp %>% filter(names == term) #filter term given
male_temp$year = j #attach year info
male_temp$gender = "male" #NOTE(review): constant label regardless of `gender` arg
names(male_temp)[2] = 'll'
male_ind = male_temp
male = rbind(male, male_ind)
}
male = male %>% select(year, ll)
#plot trend with linear fit
ggplot(male, aes(x = year, y = ll)) +
geom_point(color = "black") +
geom_line(size = 1) +
geom_smooth(method = "lm", se = TRUE, size = 1, alpha = 0.1) + theme_minimal() +
ylab("Loglikelihood") + ggtitle(paste(gender, term, sep = '-')) +
theme(axis.text = element_text(color = "black", size = 12), axis.title = element_text(color = "black", size = 14),
legend.text = element_text(color = "black", size = 12), legend.title = element_text(color = "black", size = 14),
panel.grid.major = element_line(colour = "grey50", size = 0.3), panel.grid.minor = element_line(colour = "grey50", size = 0.3))
#facet_wrap(~ gender)
}
plot_word_single('kill/verb', 'male') #example: trend of 'kill' with male characters
## `geom_smooth()` using formula 'y ~ x'