In [1]:
##import the libraries
import pandas as pd 
import numpy as np
import os
import pickle
import networkx as nx
import itertools
from itertools import combinations

import statsmodels
import statsmodels.api as sm
import scipy.stats as stats

import matplotlib.pyplot as plt


**Load Clean Comments and create a data frame**

In [2]:
##load the cleaned comments and convert them to a data frame
#df['text_cleaned'] = pd.read_pickle(os.path.join(your_dir, 'comments_cleaned'))

clean_comments = pd.read_pickle('comments_cleaned')
df = pd.DataFrame(data = clean_comments)

##insert a column to indicate unique id
df['ID'] = range(0, 0+len(df))
cols = list(df.columns) ##column list
cols = [cols[-1]] + cols[:-1]  #make last column first
df=df[cols]
df.head()

Unnamed: 0,ID,text_clean
0,0,gente guerra portato casa rimpatriare
1,1,marcello perfavore
2,2,patria patriota difesa radice soccombere roma ...
3,3,musulmano comandare casa
4,4,odio dipendere odio comandare libro


In [3]:
##lets construct a matrix of the unique words
##first lets save to csv
df.to_csv("clean_comments.csv",header=True,index=False)

In [4]:
len(df.index) ##count of rows

78174

In [5]:
#for index, row in df.iterrows():
 # print(len(row["text_clean"])) ##eg lrn of row 1 = 158

 ## print(row["text_clean"]) ##we obtain the rows of column text_clean

  ##DataFrame.iterrows is a generator which yields both the index and row (as a Series):
##DataFrame.to_string() - to print the data frame

In [6]:
###create a copy of the original data frame
df_clean = df.copy()
df_clean.head()

Unnamed: 0,ID,text_clean
0,0,gente guerra portato casa rimpatriare
1,1,marcello perfavore
2,2,patria patriota difesa radice soccombere roma ...
3,3,musulmano comandare casa
4,4,odio dipendere odio comandare libro


In [7]:
words_per_comment = {}
for index, row in df_clean.iterrows():
  words_per_comment[row["ID"]]= [clean_txt for clean_txt in row["text_clean"]]

In [8]:
print(len(words_per_comment))

78174


**This tokenizes the cleaned text to words using NLTK**




In [9]:
##tokenize the text ie split the text to words 
##df_words['text_clean'].apply(lambda x: ' '.join(x)) - to join the words

import nltk
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
df_words = df.copy() ##create a copy of the original data frame
df_words['text_clean'] = df['text_clean'].apply(lambda s: tokenizer.tokenize(s))
df_words.head(20)

Unnamed: 0,ID,text_clean
0,0,"[gente, guerra, portato, casa, rimpatriare]"
1,1,"[marcello, perfavore]"
2,2,"[patria, patriota, difesa, radice, soccombere,..."
3,3,"[musulmano, comandare, casa]"
4,4,"[odio, dipendere, odio, comandare, libro]"
5,5,[razzista]
6,6,"[meditato, sinistronzi, meditato]"
7,7,"[tolleranza, rispettare, cultura, legge, barcone]"
8,8,[sottoscrivere]
9,9,"[papa, francesco, giudicare, sicuro, baciare, ..."


In [10]:
##how to get the unique words in a data frame
##this is an array of all the unique words in the clean comments

df_unique = df_words["text_clean"].apply(pd.Series).stack().unique()
df_unique.shape ## we have 43041 unique words

(42628,)

**Create Edges lists and Node list**

In [11]:
##try this way to visualize the unique words
## create a dictionary of unique words

unique_words = {}
words_per_comment = {}

for index, row in df_words.iterrows():  ##iterating over the df
  if(len(row["text_clean"])>0):  ##this gives the length of each row
    words_per_comment[row["ID"]]= [clean_txt for clean_txt in row["text_clean"]] ###iterating the column text_clean. There are 78174 rows
    for clean_txt in row["text_clean"]:
      unique_words.setdefault(clean_txt, 0) ## setdefault() method returns the value of a key (if the key is in dictionary)
      unique_words[clean_txt] += 1

#setdefault()
#Returns: 
#Value of the key if it is in the dictionary. 
#None if key is not in the dictionary and default_value is not specified. 
#default_value if key is not in the dictionary and default_value is specified.

In [12]:
print(len(unique_words))

42628


In [13]:
print(unique_words)

{'gente': 1323, 'guerra': 568, 'portato': 117, 'casa': 1545, 'rimpatriare': 65, 'marcello': 146, 'perfavore': 12, 'patria': 203, 'patriota': 25, 'difesa': 237, 'radice': 68, 'soccombere': 2, 'roma': 592, 'islamico': 189, 'accoltellare': 11, 'crocifisso': 44, 'collo': 25, 'accadere': 141, 'stazione': 37, 'termine': 149, 'aggressore': 6, 'fermare': 205, 'poliziotto': 69, 'notare': 153, 'vittima': 136, 'bus': 15, 'tentare': 88, 'sgozzarlo': 1, 'aumentare': 522, 'controllo': 51, 'attenzione': 158, 'prevenire': 13, 'violenza': 235, 'cittadino': 711, 'innocente': 85, 'imperativo': 4, 'corrado': 23, 'armeri': 2, 'fdi': 75, 'musulmano': 144, 'comandare': 181, 'odio': 381, 'dipendere': 155, 'libro': 290, 'razzista': 348, 'meditato': 26, 'sinistronzi': 21, 'tolleranza': 59, 'rispettare': 785, 'cultura': 279, 'legge': 1058, 'barcone': 76, 'sottoscrivere': 45, 'papa': 310, 'francesco': 250, 'giudicare': 107, 'sicuro': 417, 'baciare': 96, 'piede': 185, 'aguzzino': 6, 'occhio': 344, 'inghilterra': 1

In [14]:
##We can use this way to visualize matrix of unique words
##The words dont appear twice hence the value = 0/1

clean_df = pd.DataFrame(0, index=unique_words, columns=unique_words)

for key in words_per_comment:
  for pair in itertools.product(words_per_comment[key],words_per_comment[key]):
    if pair[0]!=pair[1] and not(clean_df.at[pair[0],pair[1]]):
      clean_df.at[pair[0],pair[1]] += 1
      clean_df.at[pair[1],pair[0]] += 1

In [15]:
display(clean_df) ##this is the full matrix of

Unnamed: 0,gente,guerra,portato,casa,rimpatriare,marcello,perfavore,patria,patriota,difesa,...,quellivche,ndrine,earth,xab,x88,x91,satanasso,parlamentari,consigliatissima,fdo
gente,0,1,1,1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
guerra,1,0,1,1,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
portato,1,1,0,1,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
casa,1,1,1,0,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
rimpatriare,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
x91,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,0
satanasso,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
parlamentari,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
consigliatissima,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
##use this as a basis to get edges and their weights
words_projection = {} ##create dictionary 
##itertools.product() which computes the cartesian product of input iterables.

for key in words_per_comment:
    for pair in itertools.product(words_per_comment[key],words_per_comment[key]):
        if pair[0]!=pair[1] and not(pair[::-1] in words_projection):
            words_projection.setdefault(pair,0)
            words_projection[pair] += 1

words_projection

{('gente', 'guerra'): 11,
 ('gente', 'portato'): 7,
 ('gente', 'casa'): 75,
 ('gente', 'rimpatriare'): 4,
 ('guerra', 'portato'): 9,
 ('guerra', 'casa'): 38,
 ('guerra', 'rimpatriare'): 3,
 ('portato', 'casa'): 16,
 ('portato', 'rimpatriare'): 2,
 ('casa', 'rimpatriare'): 4,
 ('marcello', 'perfavore'): 1,
 ('patria', 'patriota'): 2,
 ('patria', 'difesa'): 6,
 ('patria', 'radice'): 2,
 ('patria', 'soccombere'): 1,
 ('patria', 'roma'): 5,
 ('patria', 'islamico'): 8,
 ('patria', 'accoltellare'): 1,
 ('patria', 'crocifisso'): 1,
 ('patria', 'collo'): 1,
 ('patria', 'accadere'): 5,
 ('patria', 'stazione'): 1,
 ('patria', 'termine'): 8,
 ('patria', 'aggressore'): 1,
 ('patria', 'fermare'): 3,
 ('patria', 'poliziotto'): 1,
 ('patria', 'notare'): 3,
 ('patria', 'vittima'): 4,
 ('patria', 'bus'): 1,
 ('patria', 'tentare'): 1,
 ('patria', 'sgozzarlo'): 1,
 ('patria', 'aumentare'): 5,
 ('patria', 'controllo'): 1,
 ('patria', 'attenzione'): 1,
 ('patria', 'prevenire'): 1,
 ('patria', 'violenza'): 

In [17]:
##lets obtain the nodes list and edge lists
#UNWEIGHTED
G = nx.Graph()
G.add_edges_from(words_projection)
print(G.edges)
print(G.nodes)

nx.write_edgelist(G,"commentsunweighted_edgelist.csv",delimiter=",")

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [18]:
!sed -i.bak 1i"Source,Target,Weight" commentsunweighted_edgelist.csv

sed: 1: "1iSource,Target,Weight": command i expects \ followed by text


**Create an Edges List**

In [19]:
##edge lists
#WEIGHTED
#to get weighted graph we need a list of 3-element tuplels (u,v,w) where u and v are nodes 
## and w is a number representing weight

words_weighted = []
for edge in words_projection:
    words_weighted.append((edge[0],edge[1],words_projection[edge]))

G = nx.Graph()
G.add_weighted_edges_from(words_weighted)

print(G.edges)
print(G.nodes)

nx.write_weighted_edgelist(G, "words_edgelist_weighted.csv",delimiter=",") ##save the edges list as csv

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [20]:
!sed -i.bak 1i"Source,Target,Weight" words_edgelist_weighted.csv ##insert columns source, target, weight to the node list

sed: 1: "1iSource,Target,Weight": command i expects \ followed by text


In [21]:
##Create edges from Pandas data frame

G_df = nx.from_pandas_adjacency(clean_df)

print(G_df.edges)
print(G_df.nodes)
nx.write_weighted_edgelist(G, "words_edgelist_2.csv",delimiter=",")

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [22]:
!sed -i.bak 1i"Source,Target,Weight" words_edgelist_2.csv

sed: 1: "1iSource,Target,Weight": command i expects \ followed by text


In [23]:
##lets create a node list
nl_df = pd.DataFrame.from_dict(unique_words,orient="index")
nl_df.reset_index(inplace=True)
nl_df[0] = nl_df['index']
nl_df.rename(columns={"index":"Id", 0:"Label"},inplace=True)


nl_df.to_csv("words_nodelist.csv",index=False)

In [None]:
###We now have nodes list and edges list
##We can now use the weighted edge list and node list to create graphs in Gephi