# Making networks with words

**The idea, in brief**

Words are the nodes in the network.

If two words appear together in some context that we define – the same sentence, paragraph, document, etc. – then we draw an edge between the two words.

## Read in the seasons corpus

**Load the corpus**

Here I'm loading it with each utterance separate

In [None]:
# Load the corpus through the project's helper module. The next cell calls
# .values() on the result and reads element [0] of each value, so this is
# presumably a dict-like whose values carry their utterances first —
# TODO confirm against seasons_module.
from seasons_module import load_seasons_corpus_as_utterances
seasons_corpus_with_utterances = load_seasons_corpus_as_utterances()

In [None]:
# Flatten the corpus into a single list: element [0] of every entry is
# iterated and its items are collected in corpus order.
all_utterances = [
    utterance
    for entry in seasons_corpus_with_utterances.values()
    for utterance in entry[0]
]

## Compile the vocabulary in the usual way.

In [None]:
# Read the stop list (one word per line) into a set for fast membership tests.
# The context manager closes the file handle as soon as the contents are read;
# a bare open() would leave it open for the life of the kernel.
with open("lists/seasons_stop_list.txt") as f:
    stop_list = set(f.read().split("\n"))

In [None]:
import nltk

# Count every token that is not on the stop list, across all utterances.
word_fdist = nltk.FreqDist(
    w
    for utterance in all_utterances
    for w in utterance
    if w not in stop_list
)

In [None]:
# Number of most-frequent words to keep as the vocabulary.
vocab_size = 25
# Pass vocab_size through so that changing it actually changes the vocabulary
# (a literal 25 here would silently ignore the setting above).
most_common = word_fdist.most_common(vocab_size)
# most_common yields (word, count) pairs; keep only the words.
vocab = [word for word, _count in most_common]
print(vocab)

## Create an adjacency matrix

Create a vector for each utterance, then put them together into a term × document matrix.

In [None]:
import numpy as np
def compute_vector(words, vocab):
    """Return the term-frequency vector of ``words`` over ``vocab``.

    Args:
        words: Sequence supporting ``.count()`` (e.g. a list of tokens).
        vocab: Iterable of terms; its order fixes the order of the output.

    Returns:
        A NumPy array with one entry per term in ``vocab``, each entry being
        ``words.count(term)``.
    """
    return np.array([words.count(term) for term in vocab])

In [None]:
# One term-frequency vector per utterance, in corpus order.
utterance_vector_list = [
    compute_vector(utterance, vocab) for utterance in all_utterances
]

# Stacking the vectors gives one row per utterance; swapping the axes puts the
# terms on the rows and the utterances on the columns.
td_matrix = np.array(utterance_vector_list).T

Here's the trick: a term × term matrix **is** an adjacency matrix for the terms.

In [None]:
# Multiply the term-by-utterance matrix by its own transpose: entry (i, j)
# sums, over all utterances, the product of the counts of term i and term j.
tt_matrix = td_matrix @ td_matrix.T

In [None]:
import matplotlib
import matplotlib.pyplot as plt

def matrix_heatmap(mtx, name_list):
    """Draw ``mtx`` as a log-scaled heatmap labelled with ``name_list``.

    Args:
        mtx: Array-like handed to ``plt.imshow``.
        name_list: Labels used for both the x and y tick marks; label i is
            placed at position i on each axis.

    Opens a new 10x10-inch figure and draws into it; returns nothing.
    """
    plt.figure(figsize=(10, 10), dpi=80, facecolor='w', edgecolor='k')

    # The same 0..n-1 positions serve both axes.
    tick_positions = np.arange(len(name_list))
    plt.xticks(tick_positions, name_list, fontsize=8, rotation=90)
    plt.yticks(tick_positions, name_list, fontsize=8)

    # Show the column labels along the top edge instead of the bottom.
    plt.tick_params("x", top=True, labeltop=True, bottom=False, labelbottom=False)

    # Logarithmic colour scaling for the cell values.
    log_scale = matplotlib.colors.LogNorm()
    plt.imshow(mtx, norm=log_scale, interpolation='nearest', cmap='YlOrBr')

In [None]:
# Plot the term-by-term matrix, labelling both axes with the vocabulary words.
matrix_heatmap(tt_matrix, vocab)

## Build a graph from the matrix

The function in the next cell creates a node for each word in the vocabulary, then uses the adjacency matrix to draw a weighted edge between each pair of nodes whose matrix entry is nonzero.

In [None]:
%matplotlib inline
import networkx as nx
import matplotlib.pyplot as plt

def build_weighted_graph_from_Aij(am, vocab):
    """Build an undirected weighted graph from an adjacency matrix.

    Args:
        am: Matrix indexable as ``am[r][c]``; ``len(am)`` sets how many rows
            and columns are scanned.
        vocab: Node labels; ``vocab[i]`` names row and column i of ``am``.

    Returns:
        An ``nx.Graph`` with one node per entry of ``vocab`` and an edge for
        every off-diagonal nonzero cell, weighted by that cell's value.
    """
    graph = nx.Graph()
    for label in vocab:
        graph.add_node(label)

    size = len(am)
    # The diagonal (self-loops) and zero cells are skipped. Each unordered
    # pair is visited twice, as (r, c) and (c, r); the later cell's value is
    # the weight that remains on the edge.
    graph.add_weighted_edges_from(
        (vocab[row], vocab[col], am[row][col])
        for row in range(size)
        for col in range(size)
        if row != col and am[row][col] != 0
    )
    return graph

In [None]:
# Turn the term-by-term matrix into a weighted graph over the vocabulary words.
G = build_weighted_graph_from_Aij(tt_matrix, vocab)

In [None]:
# Divide each edge weight by 5 to get the line width used when drawing.
weights = [weight / 5 for _u, _v, weight in G.edges(data='weight')]

In [None]:
# Compute node positions with the spring layout, then draw the whole graph on
# a large axis-free canvas with edge widths taken from the scaled weights.
pos = nx.spring_layout(G, k=0.6)
plt.figure(figsize=(30, 30))
plt.axis('off')
nx.draw_networkx(
    G,
    pos=pos,
    width=weights,
    alpha=1,
    font_size=20,
    node_size=500,
)

## Create a subgraph that only contains edges above a certain weight

In [None]:
# Keep only the edges whose weight is strictly greater than 20.
edges_to_keep = [
    (u, v) for u, v, weight in G.edges(data="weight") if weight > 20
]
gs = G.edge_subgraph(edges_to_keep)
# Line widths for the filtered graph: each remaining weight divided by 5.
weights = [weight / 5 for _u, _v, weight in gs.edges(data='weight')]

In [None]:
# Lay out and draw only the filtered subgraph, semi-transparent and with
# larger labels.
pos = nx.spring_layout(gs, k=0.6)
plt.figure(figsize=(30, 30))
plt.axis('off')
nx.draw_networkx(
    gs,
    pos=pos,
    width=weights,
    alpha=0.5,
    font_size=30,
    node_size=500,
)