In [13]:
#Special module written for this class
#This provides access to data and to helper functions from previous weeks
#Make sure you update it before starting this notebook
import lucem_illud_2020 #pip install -U git+git://github.com/Computational-Content-Analysis-2020/lucem_illud_2020.git

#All these packages need to be installed from pip

#This will be doing most of the work
import networkx as nx

import sklearn #For generating some matrices
import pandas as pd #For DataFrames
import numpy as np #For arrays
import matplotlib.pyplot as plt #For plotting
import seaborn #Makes the plots look nice
import scipy #Some stats
import nltk #a little language code
from IPython.display import Image #for pics

import pickle #if you want to save layouts
import os

from os import listdir
from os.path import isfile, join

# comp-linguistics
import spacy
import nltk
from spacy import displacy
#Using NLP in spanish
nlp = spacy.load("es_core_news_sm")

%matplotlib inline

In [20]:
def word_tokenize(word_list):
    tokenized = []
    # pass word list through language model.
    doc = nlp(word_list)
    for token in doc:
        if not token.is_punct and len(token.text.strip()) > 0:
            tokenized.append(token.text)
    return tokenized

def wordCooccurrence(sentences, makeMatrix = False):
    words = set()
    for sent in sentences:
        words |= set(sent)
    wordLst = list(words)
    wordIndices = {w: i for i, w in enumerate(wordLst)}
    wordCoCounts = {}
    #consider a sparse matrix if memory becomes an issue
    coOcMat = np.zeros((len(wordIndices), len(wordIndices)))
    for sent in sentences:
        for i, word1 in enumerate(sent):
            word1Index = wordIndices[word1]
            for word2 in sent[i + 1:]:
                coOcMat[word1Index][wordIndices[word2]] += 1
    if makeMatrix:
        return coOcMat, wordLst
    else:
        coOcMat = coOcMat.T + coOcMat
        g = nx.convert_matrix.from_numpy_matrix(coOcMat)
        g = nx.relabel_nodes(g, {i : w for i, w in enumerate(wordLst)})
        return g

In [10]:
text_folder = 'txt'
text_dic = {}
onlyfiles = [f for f in listdir(text_folder) if isfile(join(text_folder, f))]

for file in onlyfiles:
    f = open(text_folder + '/' + file, "r")
    text_dic[file] = {}
    text_dic[file]['name'] = file
    text_dic[file]['text'] = f.read()
    
#Using the modified version of clean_raw_text function, incorporating it into my own code, we build 
#the lists of tokenized texts for every speech
for index, data in text_dic.items():
    text_dic[index]['clean_text'] = text_dic[index]['text'].replace(" \'m", "'m") \
        .replace(" \'ll", "'ll").replace(" \'re", "'re") \
        .replace(" \'s", "'s").replace(" \'re", "'re")\
        .replace("\n", "").replace("\x0c", "")
    text_dic[index]['token_word_list'] = word_tokenize(text_dic[index]['clean_text'])

In [15]:
df_text = pd.DataFrame(text_dic)
df_text = df_text.T 


In [19]:
df_text['tokenized_sents'] = df_text['clean_text'].apply(lambda x: [lucem_illud_2020.word_tokenize(s) for s in lucem_illud_2020.sent_tokenize(x)])
df_text['normalized_sents'] = df_text['tokenized_sents'].apply(lambda x: [lucem_illud_2020.normalizeTokens(s) for s in x])

In [22]:
df_text['normalized_sents'][:5].sum()

[['doctor',
  'alejandro',
  'toledo',
  'manrique',
  'ante',
  'el',
  'congreso',
  'nacional',
  'el'],
 ['de',
  'julio',
  'de',
  'mensaje',
  'del',
  'presidente',
  'constitucional',
  'del',
  'perú',
  'señor',
  'ex',
  'presidente',
  'de',
  'la',
  'república',
  'doctor',
  'valentín',
  'paniagua'],
 ['señor',
  'presidente',
  'del',
  'congreso',
  'de',
  'la',
  'república',
  'señoras',
  'y',
  'señores'],
 ['congresistas',
  'de',
  'la',
  'nación',
  'peruanas',
  'y',
  'peruanos',
  'quiero',
  'en',
  'primer',
  'lugar',
  'reiterar'],
 ['mi', 'felicitación'],
 ['al', 'nuevo'],
 ['presidente', 'del', 'congreso', 'y', 'los', 'vicepresidentes', 'por'],
 ['su', 'elección'],
 ['democrática'],
 ['en'],
 ['un', 'proceso', 'que', 'todos', 'respaldamos'],
 ['una'],
 ['elección'],
 ['cuyo'],
 ['resultado', 'fue'],
 ['adverso',
  'para',
  'la',
  'alianza',
  'de',
  'gobierno',
  'aunque',
  'es',
  'evidente',
  'que',
  'al',
  'margen',
  'de',
  'ello',
  'la

In [24]:
g = wordCooccurrence(df_text['normalized_sents'][:100].sum())

In [25]:
len(g.nodes)

14106

In [26]:
len(g.edges)

274292

In [27]:
nx.to_numpy_matrix(g)[:10, :10]

matrix([[  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   2.,   0.,   0.,   0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   3.,   0.,   0.,   0.,   0.,   0.,   0.],
        [  0.,   2.,   3., 762.,   1.,   1.,   0.,   1.,   0.,   4.],
        [  0.,   0.,   0.,   1.,   0.,   0.,   0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   1.,   0.,   0.,   0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   1.,   0.,   0.,   0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   4.,   0.,   0.,   0.,   0.,   0.,   0.]])

In [29]:
def tokenize(text):
    tokenlist = lucem_illud_2020.word_tokenize(text)
    normalized = lucem_illud_2020.normalizeTokens(tokenlist)
    return normalized
import sklearn.feature_extraction



In [30]:
senVectorizer = sklearn.feature_extraction.text.CountVectorizer(tokenizer = tokenize)
senVects_incidence = senVectorizer.fit_transform(df_text['text'][:100])

In [32]:
senVects_incidence.shape

(19, 12932)

In [31]:
g_2mode = nx.Graph()

#define all the nodes
g_2mode.add_nodes_from((senVectorizer.get_feature_names()[i] for i in range(senVects_incidence.shape[1])), bipartite = 'word')
g_2mode.add_nodes_from(range(senVects_incidence.shape[0]), bipartite = 'doc')

#add all the edges
g_2mode.add_edges_from(((d, senVectorizer.get_feature_names()[w], {'weight' : senVects_incidence[d, w]}) for d, w in zip(*senVects_incidence.nonzero())))

In [33]:
print(nx.info(g_2mode))

Name: 
Type: Graph
Number of nodes: 12951
Number of edges: 41655
Average degree:   6.4327


In [None]:
nx.draw_networkx(g_2mode)

In [None]:
def contractNetwork(g, targetType):
    g_mono = nx.Graph()
    g_mono.add_nodes_from(((n, d) for n, d in g_2mode.nodes(data = True) if d['bipartite'] == targetType))
    
    for n_outside in (n for n, d in g_2mode.nodes(data = True) if d['bipartite'] != targetType):
        neighbors = list((n for n in g.neighbors(n_outside) if g.nodes[n]['bipartite'] == targetType))
        for i, n1 in enumerate(neighbors):
            for n2 in neighbors[i+1:]:
                try:
                    g_mono.edges[n1, n2]['weight'] += 1
                except KeyError:
                    g_mono.add_edge(n1, n2, weight = 1)
    return g_mono