# Path

In [None]:
library('igraph')
# Load the undirected movie network from an NCOL edge list (the file name
# suggests the third column holds edge weights). read_graph() replaces the
# deprecated read.graph() alias and takes the same arguments.
g <- read_graph('./data/movie_edge_weight.txt', format='ncol', directed=FALSE)

# Path

In [None]:
# Generate the genre of every vertex of g.
# Each line of the genre file is split on "\t\t": field 2 is the movie id and
# field 3 is the genre. The file is read in one go and ids are mapped to
# vertices with a single match() call, instead of comparing every vertex name
# against every line. readLines() on a path opens and closes the file itself,
# so no connection is left open if something fails.
genre_list <- local({
    genre_lines <- readLines("./data/movieID_genre.txt", encoding = "latin1")
    fields <- strsplit(genre_lines, "\t\t")
    movie_ids <- vapply(fields, function(f) f[2], character(1))
    genres <- vapply(fields, function(f) f[3], character(1))

    result <- rep("", vcount(g))
    node_idx <- match(movie_ids, V(g)$name)
    # Skip malformed lines and movies that are not vertices of g.
    known <- !is.na(node_idx) & !is.na(movie_ids)
    # For duplicated ids the last line in the file wins, as before.
    result[node_idx[known]] <- genres[known]
    result
})

In [None]:
# Store the per-vertex genre as the "genre" vertex attribute of the network.
g <- set_vertex_attr(g, "genre", value = genre_list)

In [None]:
# Keep only the vertices whose genre entry is not the empty string and build
# the induced subgraph on them (the original author notes this step is
# redundant for this data set).
vertices <- V(g)[genre_list != ""]
g.genre <- induced_subgraph(g, vertices, impl = "copy_and_delete")

# Fast Greedy community detection on the genre-annotated subgraph
g.genre.fgc <- cluster_fast_greedy(g.genre)

### Question 7

### Sample random communities

In [None]:
# Draw up to n distinct community indices at random.
# seq_len() avoids the 1:0 trap, and min() keeps sample() from failing when
# fewer than n communities were detected.
n <- 10
communities <- sample(seq_len(length(g.genre.fgc)),
                      min(n, length(g.genre.fgc)), replace = FALSE)

### print_community_genre_info

In [None]:
# Print, for each selected community, a table of genre counts and relative
# frequencies. Vertices whose genre is "None" are left out.
#   community_picked: indices of the communities to report
#   community_set:    community object indexed with [[ ]] to get the members
# Reads the global graph g.genre.
print_community_genre_info <- function(community_picked, community_set) {
    for (community_id in community_picked) {
        members <- community_set[[community_id]]
        member_genres <- V(g.genre)[members]$genre
        member_genres <- member_genres[member_genres != "None"]
        counts <- table(member_genres)
        shares <- prop.table(counts)

        cat("======= Community", community_id, "=======\n")
        cat("============================\n")

        df <- data.frame(Genre = names(counts),
                         Count = as.vector(counts),
                         Freq = as.vector(shares))
        print(df)
        cat("\n\n")
    }
}

In [None]:
# Print the genre breakdown of the randomly sampled communities.
print_community_genre_info(communities, g.genre.fgc)

### plot_community_genre_info

In [None]:
# Save one bar plot of genre counts per selected community as
# "<save_path>/D_Com_<i>.png". Vertices whose genre is "None" are left out.
#   community_picked: indices of the communities to plot
#   community_Set:    community object indexed with [[ ]] to get the members
#   save_path:        output directory; created if it does not exist
# Reads the global graph g.genre.
plot_community_genre_info <- function(community_picked, community_Set, save_path) {
    # png() cannot write into a directory that does not exist.
    if (!dir.exists(save_path)) {
        dir.create(save_path, recursive = TRUE)
    }

    for (i in community_picked) {
        genre_list <- V(g.genre)[community_Set[[i]]]$genre
        genre_list <- genre_list[genre_list != "None"]
        genre.table <- table(genre_list)

        Genre <- names(genre.table)
        Count <- as.vector(genre.table)

        title <- sprintf("Genre Distribution in Community %d", i)
        f_name <- paste0(save_path, sprintf("/D_Com_%d.png", i))
        png(filename = f_name)
        # Always close the device, even when barplot() fails, so a failed
        # plot does not leave a graphics device open.
        tryCatch(
            barplot(Count, main = title, ylab = "Count", names.arg = Genre, las = 2),
            finally = dev.off()
        )
    }
}

# Path

In [None]:
# Write the genre-count bar plots of the sampled communities to the Q7 folder.
save_path <- "./plots/Q7"
plot_community_genre_info(communities, g.genre.fgc, save_path)

### Question 8(a)

In [None]:
# Print the genre breakdown of every detected community.
# seq_len() yields an empty sequence (not c(1, 0)) when there are none.
print_community_genre_info(seq_len(length(g.genre.fgc)), g.genre.fgc)

# Path

In [None]:
# Write the genre-count bar plot of every detected community to the Q8a folder.
# seq_len() yields an empty sequence (not c(1, 0)) when there are none.
save_path <- "./plots/Q8a"
plot_community_genre_info(seq_len(length(g.genre.fgc)), g.genre.fgc, save_path)

### Question 8(b)

In [None]:
# Genre score: natural log of the count c, scaled by the ratio p / q.
# All three arguments may be vectors; the arithmetic is element-wise.
genre_score <- function(c, p, q) {
    log_count <- log(c, base = exp(1))
    log_count * p / q
}

In [None]:
# Count and relative frequency of every genre over all vertices of g.genre,
# ignoring vertices whose genre is "None". These globals are read by the
# genre-score functions below.
genre_all <- V(g.genre)$genre
genre_all <- genre_all[genre_all != "None"]
genre.table.all <- table(genre_all)
genre.freq.all <- prop.table(genre.table.all)

### print_community_genre_score

In [None]:
# Print, for each selected community, the genre_score() of every genre that
# occurs in it. Vertices whose genre is "None" are left out.
#   community_picked: indices of the communities to report
#   community_set:    community object indexed with [[ ]] to get the members
# Reads the globals g.genre, genre.table.all and genre.freq.all.
print_community_genre_score <- function(community_picked, community_set) {
    for (community_id in community_picked) {
        members <- community_set[[community_id]]
        member_genres <- V(g.genre)[members]$genre
        member_genres <- member_genres[member_genres != "None"]
        counts <- table(member_genres)

        Genre <- names(counts)
        local_count <- as.vector(counts)
        local_freq <- as.vector(prop.table(counts))

        # Pick the overall frequencies of exactly the genres present here.
        present <- names(genre.table.all) %in% Genre
        overall_freq <- as.vector(genre.freq.all)[present]
        score <- genre_score(local_count, local_freq, overall_freq)

        cat("======= Community", community_id, "=======\n")
        cat("============================\n")
        print(data.frame(Genre = Genre, score = score))
        cat("\n\n")
    }
}

In [None]:
# Print the genre scores of every detected community.
# seq_len() yields an empty sequence (not c(1, 0)) when there are none.
print_community_genre_score(seq_len(length(g.genre.fgc)), g.genre.fgc)

### plot_community_genre_score

In [None]:
# Save one bar plot of genre scores per selected community as
# "<save_path>/D_Com_<i>_score.png". Vertices whose genre is "None" are left
# out.
#   community_picked: indices of the communities to plot
#   community_set:    community object indexed with [[ ]] to get the members
#   save_path:        output directory; created if it does not exist
# Reads the globals g.genre, genre.table.all and genre.freq.all.
plot_community_genre_score <- function(community_picked, community_set, save_path) {
    # png() cannot write into a directory that does not exist.
    if (!dir.exists(save_path)) {
        dir.create(save_path, recursive = TRUE)
    }

    for (i in community_picked) {
        genre_list <- V(g.genre)[community_set[[i]]]$genre
        genre_list <- genre_list[genre_list != "None"]
        genre.table <- table(genre_list)
        genre.freq <- prop.table(genre.table)

        Genre <- names(genre.table)
        Count <- as.vector(genre.table)
        Freq <- as.vector(genre.freq)

        # Overall frequencies of exactly the genres present in this community.
        index <- names(genre.table.all) %in% Genre
        q <- as.vector(genre.freq.all)[index]
        score <- genre_score(Count, Freq, q)

        title <- sprintf("Genre Score in Community %d", i)
        f_name <- paste0(save_path, sprintf("/D_Com_%d_score.png", i))
        png(filename = f_name)
        # Always close the device, even when barplot() fails, so a failed
        # plot does not leave a graphics device open.
        tryCatch(
            barplot(score, main = title, ylab = "Score", names.arg = Genre, las = 2),
            finally = dev.off()
        )
    }
}

# Path

In [None]:
# Write the genre-score bar plot of every detected community to the Q8b folder.
# seq_len() yields an empty sequence (not c(1, 0)) when there are none.
save_path <- "./plots/Q8b"
plot_community_genre_score(seq_len(length(g.genre.fgc)), g.genre.fgc, save_path)

### Question 8(c)

### creat_edgelist

In [None]:
# Build the actor-movie edge list for a set of movies.
#
# Each line of the file is split on "\t\t": field 2 is the movie id and the
# fields from the third one on are the actors of that movie.
#   file_path:     path of the movie/actors file
#   movie_id_list: movie ids to extract (numeric, assumed integer-valued, or
#                  character)
# Returns list(actor_id_edge, movie_id_edge): two parallel character vectors
# with one entry per (actor, movie) pair, in file order. Movie entries are
# labelled "m_id <id>".
creat_edgelist <- function(file_path, movie_id_list) {
    # Compare ids as text. Numeric ids are formatted without scientific
    # notation, because implicit coercion turns 100000 into "1e+05", which
    # never equals the "100000" read from the file.
    movie_id_list <- movie_id_list[!is.na(movie_id_list)]
    if (is.numeric(movie_id_list)) {
        wanted <- format(movie_id_list, scientific = FALSE, trim = TRUE)
    } else {
        wanted <- as.character(movie_id_list)
    }
    wanted <- unique(wanted)
    n_wanted <- length(wanted)
    if (n_wanted == 0) {
        return(list(character(0), character(0)))
    }

    movie_actors_file <- file(file_path, open = "r")
    # Close the connection on every exit path, including errors.
    on.exit(close(movie_actors_file), add = TRUE)

    found <- logical(n_wanted)
    n_found <- 0
    # One slot per wanted movie, filled in file order; combined once at the end
    # instead of growing vectors inside the loop.
    movie_chunks <- vector("list", n_wanted)
    actor_chunks <- vector("list", n_wanted)
    chunk_size <- 1000

    # Stop reading as soon as every wanted movie has been seen.
    while (n_found < n_wanted) {
        lines <- readLines(movie_actors_file, chunk_size, encoding = "latin1")
        if (length(lines) == 0) {
            break
        }
        for (fields in strsplit(lines, "\t\t")) {
            # Membership test: does not depend on the file and the id list
            # being sorted the same way, and a missing id cannot stall it.
            pos <- match(fields[2], wanted)
            if (is.na(pos) || found[pos]) {
                next
            }
            found[pos] <- TRUE
            n_found <- n_found + 1
            # Negative indexing yields an empty vector for a line without
            # actors (3:length() would count backwards there).
            actor_id <- fields[-(1:2)]
            actor_chunks[[n_found]] <- actor_id
            movie_chunks[[n_found]] <- rep(paste("m_id", fields[2]),
                                           length(actor_id))
            if (n_found == n_wanted) {
                break
            }
        }
    }

    list(as.character(unlist(actor_chunks)),
         as.character(unlist(movie_chunks)))
}

### build_bipartite_graph

In [None]:
# Build the actor-movie graph from two parallel edge vectors.
# Each row (actor, movie) becomes one edge; the logical vertex attribute
# "type" is TRUE for vertices whose name occurs in actor_id_edge.
build_bipartite_graph <- function(actor_id_edge, movie_id_edge) {
    edges <- data.frame(actor_id_edge, movie_id_edge)
    bipartite <- graph_from_data_frame(edges)
    is_actor <- V(bipartite)$name %in% actor_id_edge
    V(bipartite)$type <- is_actor
    bipartite
}

### plot_bipartite_graph

In [None]:
# Draw a bipartite graph on the active device with unlabeled vertices.
#   bipartite_graph: graph carrying the "type" vertex attribute
#   vsize:           vertex size passed to plot()
#   ncom:            community label inserted into the plot title
plot_bipartite_graph <- function(bipartite_graph, vsize=6, ncom) {
    coords <- layout_as_bipartite(bipartite_graph)
    main_title <- sprintf("Actors - Movies(Community %s)", ncom)
    # Exchange the two layout columns before plotting.
    swapped <- coords[, c(2, 1)]
    plot(bipartite_graph,
         layout = swapped,
         vertex.size = vsize,
         asp = 0,
         vertex.label = NA,
         main = main_title)
}

### get_actor_movie_df
actor id - movie number

In [None]:
# Data frame with one row per actor: Actor_id and Movie_number (out-degree).
# Takes the first k vertices of the graph as the actors, where k is the
# number of distinct values in actor_id_edge, and converts their names to
# numbers.
get_actor_movie_df <- function(bipartite_graph, actor_id_edge) {
    out_degree <- as.vector(degree(bipartite_graph, mode = "out"))
    n_actors <- length(names(table(actor_id_edge)))
    actor_vertices <- V(bipartite_graph)[1:n_actors]
    data.frame(
        Actor_id = as.numeric(names(actor_vertices)),
        # Only vertices with outgoing edges are kept.
        Movie_number = out_degree[out_degree > 0]
    )
}

### get_movie_actor_df
movie id - actor number

In [None]:
# Data frame with one row per movie: Movie_id and Actor_number (in-degree).
# Takes the vertices after the first k as the movies, where k is the number
# of distinct values in actor_id_edge, and strips their label prefix with
# get_movie_id().
get_movie_actor_df <- function(bipartite_graph, actor_id_edge) {
    in_degree <- as.vector(degree(bipartite_graph, mode = "in"))
    first_movie <- length(names(table(actor_id_edge))) + 1
    movie_vertices <- V(bipartite_graph)[first_movie:vcount(bipartite_graph)]
    data.frame(
        Movie_id = get_movie_id(names(movie_vertices)),
        # Only vertices with incoming edges are kept.
        Actor_number = in_degree[in_degree > 0]
    )
}

### get_top_three

In [None]:
# Ids of the (up to) three actors with the most movies.
#   actor_movie_df: data frame with columns Actor_id and Movie_number
# Returns the ids sorted ascending, as character strings.
get_top_three <- function(actor_movie_df) {
    # rev(order()) is kept on purpose: it decides which rows win on ties.
    ranking <- rev(order(actor_movie_df[["Movie_number"]]))
    # Exact column name; `$Actor` only worked through partial matching and
    # returns NULL as soon as another column starts with "Actor".
    ranked_ids <- actor_movie_df[["Actor_id"]][ranking]
    top_three <- ranked_ids[seq_len(min(3, length(ranked_ids)))]
    top_three <- sort(top_three)
    if (is.numeric(top_three)) {
        # as.character(100000) is "1e+05", which no longer matches the id
        # used as a vertex name or read from a file.
        top_three <- format(top_three, scientific = FALSE, trim = TRUE)
    } else {
        top_three <- as.character(top_three)
    }
    top_three
}


### get_actor_name

In [None]:
# Look up actor names by id.
#
# Each line of the file is split on "\t\t": field 1 is the name and field 2
# is the id.
#   file_path: path of the actor file
#   top_three: actor ids to look up (numeric, assumed integer-valued, or
#              character); any length
# Returns a character vector aligned with top_three; an id that is not in the
# file yields NA. The lookup does not depend on the order of the file or of
# the ids.
get_actor_name <- function(file_path, top_three) {
    # Compare ids as text; format numeric ids without scientific notation
    # (implicit coercion turns 100000 into "1e+05").
    if (is.numeric(top_three)) {
        wanted <- format(top_three, scientific = FALSE, trim = TRUE)
    } else {
        wanted <- as.character(top_three)
    }
    actor_name <- rep(NA_character_, length(wanted))
    if (length(wanted) == 0) {
        return(actor_name)
    }

    actors_file <- file(file_path, open = "r")
    # Close the connection on every exit path, including errors.
    on.exit(close(actors_file), add = TRUE)

    found <- logical(length(wanted))
    chunk_size <- 10000
    # Read in chunks and stop as soon as every id has been resolved.
    while (!all(found)) {
        lines <- readLines(actors_file, chunk_size, encoding = "latin1")
        if (length(lines) == 0) {
            break
        }
        fields <- strsplit(lines, "\t\t")
        ids <- vapply(fields, function(f) f[2], character(1))
        labels <- vapply(fields, function(f) f[1], character(1))

        hit <- match(wanted, ids)
        # Keep the first occurrence in the file for each id.
        fill <- !found & !is.na(hit) & !is.na(wanted)
        actor_name[fill] <- labels[hit[fill]]
        found <- found | fill
    }
    actor_name
}


### get_movie_id

In [None]:
# Extract the movie id from vertex labels of the form "m_id <id>".
#   movies: character vector of labels
# Returns the second space-separated token of every label as a character
# vector of the same length as movies (empty input gives an empty result).
get_movie_id <- function(movies) {
    parts <- strsplit(as.character(movies), " ", fixed = TRUE)
    # Vectorised; the 1:length() loop returned NA for empty input.
    vapply(parts, function(p) p[2], character(1), USE.NAMES = FALSE)
}

### get_movie_genre

In [None]:
# Genre of each requested movie, taken from the "genre" vertex attribute.
#   genre_graph: graph whose vertex names are movie ids
#   movie_id:    movie ids to look up (numeric, assumed integer-valued, or
#                character)
# Returns the genres aligned with movie_id (NA for ids that are not vertices).
# Filtering vertices with %in% returned them in graph order, which does not
# line up with the order of movie_id when the caller pairs the two.
get_movie_genre <- function(genre_graph, movie_id) {
    if (is.numeric(movie_id)) {
        # Avoid "1e+05"-style coercion when matching against vertex names.
        movie_id <- format(movie_id, scientific = FALSE, trim = TRUE)
    }
    vertex_pos <- match(as.character(movie_id), V(genre_graph)$name)
    genre <- V(genre_graph)$genre[vertex_pos]
    genre
}

### get_movie_name

In [None]:
# Look up movie names by id.
#
# Each line of the file is split on "\t\t": field 1 is the name and field 2
# is the id.
#   file_path: path of the movie/genre file
#   movie_id:  movie ids to look up (numeric, assumed integer-valued, or
#              character)
# Returns a character vector aligned with movie_id; an id that is not in the
# file yields NA. The lookup does not depend on the order of the file or of
# the ids (callers pass ids sorted as text, which need not be file order).
get_movie_name <- function(file_path, movie_id) {
    # Compare ids as text; format numeric ids without scientific notation
    # (implicit coercion turns 100000 into "1e+05").
    if (is.numeric(movie_id)) {
        wanted <- format(movie_id, scientific = FALSE, trim = TRUE)
    } else {
        wanted <- as.character(movie_id)
    }
    movie_name <- rep(NA_character_, length(wanted))
    if (length(wanted) == 0) {
        return(movie_name)
    }

    movie_genre_file <- file(file_path, open = "r")
    # Close the connection on every exit path, including errors.
    on.exit(close(movie_genre_file), add = TRUE)

    found <- logical(length(wanted))
    chunk_size <- 10000
    # Read in chunks and stop as soon as every id has been resolved.
    while (!all(found)) {
        lines <- readLines(movie_genre_file, chunk_size, encoding = "latin1")
        if (length(lines) == 0) {
            break
        }
        fields <- strsplit(lines, "\t\t")
        ids <- vapply(fields, function(f) f[2], character(1))
        titles <- vapply(fields, function(f) f[1], character(1))

        hit <- match(wanted, ids)
        # Keep the first occurrence in the file for each id.
        fill <- !found & !is.na(hit) & !is.na(wanted)
        movie_name[fill] <- titles[hit[fill]]
        found <- found | fill
    }
    movie_name
}

### print_actor_info

In [None]:
# Print, for each actor in top_three, a table of the movies they appear in
# (id, name, genre).
#   genre_graph:     graph carrying the "genre" vertex attribute
#   bipartite_graph: actor -> movie graph; actor ids are vertex names
#   top_three:       actor ids as character strings
#   actor_name:      actor names, parallel to top_three
#   file_path:       movie file passed on to get_movie_name()
print_actor_info <- function(genre_graph, bipartite_graph, top_three, actor_name,
                             file_path) {
    # seq_along() makes the loop a no-op for an empty top_three; 1:length()
    # would iterate over c(1, 0).
    for (i in seq_along(top_three)) {
        movies <- neighbors(bipartite_graph, top_three[i], mode='out')$name
        Movie_ids <- sort(get_movie_id(movies))

        Names <- get_movie_name(file_path, Movie_ids)

        Genres <- get_movie_genre(genre_graph, Movie_ids)

        info.df <- data.frame(Movie_ids, Names, Genres)

        cat(actor_name[i], ":\n")
        cat("===================================\n")
        print(info.df)
        cat("\n\n")
    }
}



### Start below

In [None]:
# Communities with between 10 and 20 members (inclusive).
community_sizes <- sizes(g.genre.fgc)
small_communities <- g.genre.fgc[community_sizes >= 10 & community_sizes <= 20]

# plot bipartite graph
file_path <- "./data/movie_actors.txt"

# seq_along() skips the loop when no community falls in the size range;
# 1:length() would iterate over c(1, 0).
for (i in seq_along(small_communities)) {
    movie_id_list <- sort(as.numeric(V(g.genre)[small_communities[[i]]]$name))

    edgelist <- creat_edgelist(file_path, movie_id_list)
    actor_id_edge <- edgelist[[1]]
    movie_id_edge <- edgelist[[2]]

    g.bi <- build_bipartite_graph(actor_id_edge, movie_id_edge)
    plot_bipartite_graph(g.bi, ncom=names(small_communities)[i])
}


In [None]:
# Generate information about each small community and its top three actors.
# Uses file_path (the movie/actors file) defined in the plotting cell.
actor_id_path <- "./data/actor_id.txt"
movie_genre_path <- "./data/movieID_genre.txt"

# The leftover debugging override `i = 1` was removed from the loop body: it
# made every iteration report community 1. seq_along() also skips the loop
# when there are no small communities.
for (i in seq_along(small_communities)) {
    # find all movies in the current community
    movie_id_list <- sort(as.numeric(V(g.genre)[small_communities[[i]]]$name))

    edgelist <- creat_edgelist(file_path, movie_id_list)
    actor_id_edge <- edgelist[[1]]
    movie_id_edge <- edgelist[[2]]

    g.bi <- build_bipartite_graph(actor_id_edge, movie_id_edge)

    # get actor - movie number data frame
    actor_movie_df <- get_actor_movie_df(g.bi, actor_id_edge)

    # get actor name - id - movie data frame
    actor_movie_df <- actor_movie_df[order(actor_movie_df$Actor_id),]
    actor_id <- actor_movie_df$Actor_id
    Actor_name <- get_actor_name(actor_id_path, actor_id)
    name_id_movie_df <- data.frame(Actor_name, actor_movie_df)

    # get movie - actor number data frame
    movie_actor_df <- get_movie_actor_df(g.bi, actor_id_edge)

    # get movie name - id - actor data frame
    movie_actor_df <- movie_actor_df[order(movie_actor_df$Movie_id),]
    movie_id <- movie_actor_df$Movie_id
    Movie_name <- get_movie_name(movie_genre_path, movie_id)
    name_id_actor_df <- data.frame(Movie_name, movie_actor_df)

    cat("************************** Community",
        names(small_communities)[i], "**************************\n")
    cat("******************************************************************\n")
    print(name_id_movie_df)
    cat("\n")
    print(name_id_actor_df)
    cat("\n")

    # get three most important actor id and name
    top_three <- get_top_three(actor_movie_df)
    actor_name <- get_actor_name(actor_id_path, top_three)

    # print three most important actor information
    # movie id, movie name, genre
    print_actor_info(g.genre, g.bi, top_three, actor_name, movie_genre_path)
    cat("\n\n")
}

# Path

In [None]:
# Write the genre-count bar plots of the small communities to the Q8c folder.
# The list names of small_communities are the community indices.
save_path <- "./plots/Q8c"
small_ids <- as.numeric(names(small_communities))
plot_community_genre_info(small_ids, g.genre.fgc, save_path)