In [None]:
import os
os.environ.setdefault('JAVA_HOME', '/usr/lib/jvm/java-1.8.0-openjdk-amd64')
import networkx as nx
import community
import matplotlib.pyplot as plt
import statistics
import wikipediaapi
import pickle
import pyspark.ml

from pyspark.sql import SparkSession
from neo4j import GraphDatabase, Driver

Importing graph

In [None]:
# Read the GEXF peaks graph and convert it to an undirected graph for the analysis below.
# relabel=True is passed to read_gexf — presumably so nodes are keyed by their GEXF label
# (page title) rather than their id; verify against the networkx read_gexf docs.
path = "peaks_graph_20190901_20190915.gexf"
g = nx.read_gexf(path, node_type=None, relabel=True).to_undirected()

Verification

In [None]:
# Sanity check: report the node count and draw the whole graph with labels.
print(f'number of nodes : {len(g)}')
fig, ax = plt.subplots(figsize=(30, 20))  # large canvas so labels stay legible
nx.draw(g, ax=ax, with_labels=True)
plt.show()

Degree computation + mean + std + threshold

In [None]:
# (node, degree) pairs for the current graph.
nodes_with_degrees = g.degree

# Mean and sample standard deviation of the node degrees.
mean_deg = statistics.mean(degree for _node, degree in nodes_with_degrees)
std_deg = statistics.stdev(degree for _node, degree in nodes_with_degrees)

Filtering extremely highly connected nodes

In [None]:
# Degree cut-off above which a node is treated as "extremely highly connected".
# NOTE(review): std_deg*std_deg/2 is half the variance, not a multiple of the
# standard deviation — confirm this is the intended threshold.
threshold = mean_deg + std_deg*std_deg/2

# Collect the (node, degree) pairs into a list before mutating the graph.
nodes_to_remove = [(node, degree) for node, degree in nodes_with_degrees if degree > threshold]

# zip(*[]) yields nothing, so the bare "n,d = zip(...)" unpacking raised ValueError
# whenever no node exceeded the threshold; fall back to empty tuples instead.
n, d = zip(*nodes_to_remove) if nodes_to_remove else ((), ())

g.remove_nodes_from(n)
print('remaining nodes : ' + str(len(g)))

Degree computation + filtering isolated nodes

In [None]:
# Recompute degrees after the hub removal above.
nodes_with_degrees = g.degree

# Nodes left without any edge; collected into a list before the graph is mutated.
nodes_to_remove = [(node, degree) for node, degree in nodes_with_degrees if degree == 0]

# Guard the unpacking: with no isolated nodes zip(*[]) is empty and the original
# "n,d = zip(...)" raised ValueError.
n, d = zip(*nodes_to_remove) if nodes_to_remove else ((), ())

g.remove_nodes_from(n)

print('remaining nodes : ' + str(len(g)))

Keeping only largest connected component

In [None]:
# nx.connected_component_subgraphs was removed in NetworkX 2.4, so select the
# largest node set with connected_components and build the subgraph explicitly.
largest_component_nodes = max(nx.connected_components(g), key=len)
# .copy() yields an independent graph instead of a view onto g.
gcc = g.subgraph(largest_component_nodes).copy()

print('remaining nodes : ' + str(len(gcc)))

Verification

In [None]:
# Draw the largest connected component on an oversized canvas so labels stay readable.
fig, ax = plt.subplots(figsize=(100, 70))
nx.draw(gcc, ax=ax, with_labels=True)
plt.show()

Partitioning using Louvain

In [None]:
# Louvain partition of the largest component: maps node -> community id.
louvain_communities = community.best_partition(gcc, resolution=1)

# Invert the mapping into community id -> list of member nodes (nodes visited in sorted order).
louvain_communities_dict = {}
for node in sorted(louvain_communities):
    community_id = louvain_communities[node]
    if community_id not in louvain_communities_dict:
        louvain_communities_dict[community_id] = []
    louvain_communities_dict[community_id].append(node)

# Quick look: number of communities, members of community 0, and all community ids.
print(len(louvain_communities_dict))
print(louvain_communities_dict[0])
print(sorted(louvain_communities_dict))

Partitioning using Leiden


https://pypi.org/project/leidenalg/
https://www.nature.com/articles/s41598-019-41695-z

Category of each partition

In [None]:
# Neo4j driver for the local Bolt endpoint, created with empty username/password.
# NOTE(review): assumes authentication is disabled on that server — confirm; read
# credentials from the environment if it is not.
driver = GraphDatabase.driver("bolt://localhost:7687", auth=('',''))

helpers

In [None]:
def get_categories(page_name):
    """Return the category titles a page belongs to, excluding hidden categories.

    Runs a Cypher query through the module-level ``driver`` that matches
    ``(:Page)-[:BELONGS_TO]->(:Category)`` for the page whose ``title`` equals
    ``page_name`` and drops categories that themselves belong to the
    ``Hidden_categories`` category.

    Args:
        page_name: Title of the page to look up.

    Returns:
        A list of category titles (``c.title``), in the order the query
        returns them; empty if nothing matches.
    """
    # `$page_name` is the supported parameter syntax; the legacy `{page_name}`
    # form is deprecated and rejected by Neo4j 4+.
    query = (
        "MATCH (p:Page)-[:BELONGS_TO]->(c:Category) "
        "WHERE p.title = $page_name "
        "AND NOT exists((c)-[:BELONGS_TO]->(:Category {title: 'Hidden_categories'})) "
        "RETURN c.title"
    )
    with driver.session() as session:
        with session.begin_transaction() as tx:
            # Build the list while the transaction is still open.
            return [record["c.title"] for record in tx.run(query, page_name=page_name)]

def count_frequency(my_list):
    """Map each distinct element of ``my_list`` to its number of occurrences.

    Args:
        my_list: Iterable of hashable items.

    Returns:
        A dict ``{item: count}`` whose keys appear in first-occurrence order.
        An empty input yields an empty dict.
    """
    freq = {}
    # Single pass: the previous version called my_list.count() for every
    # element, which was quadratic in the length of the list.
    for item in my_list:
        freq[item] = freq.get(item, 0) + 1
    return freq

def part_category_fetch(key, dic):
    """Collect the categories of every page title stored under ``dic[key]``.

    Each title is looked up with ``get_categories`` and the results are
    concatenated into one flat list (duplicates are kept).
    """
    collected = []
    for page_title in dic[key]:
        collected.extend(get_categories(page_title))
    print('done fetching')
    return collected

def fetcher(bpd):
    """Fetch the category list of every partition in ``bpd``.

    Partitions are processed in descending key order; each key and its
    fetched categories are printed as they are processed.

    Returns:
        A dict mapping each partition key to its list of categories.
    """
    part_cat = {}

    descending_parts = sorted(bpd)[::-1]
    for part in descending_parts:
        print(part)
        categories = part_category_fetch(part, bpd)
        print(categories)
        if part not in part_cat:
            part_cat[part] = categories

    return part_cat

Fetch categories for each cluster

In [None]:
def fetch_categories(bpd):
    """Return the partition -> categories mapping built by ``fetcher`` for ``bpd``."""
    return fetcher(bpd)

In [None]:
# Fetch the categories of every Louvain community; each page title triggers a
# database query via get_categories.
part_cat_dict = fetch_categories(louvain_communities_dict)

# Alternative kept for reference: load a previously pickled result instead of refetching.
#with open('clusters to categoties.pickle', 'rb') as handle:
#    d = pickle.load(handle)

In [None]:
# Show the categories stored under key len(part_cat_dict)-1.
# assumes partition keys are the integers 0..n-1 — TODO confirm
print(part_cat_dict[len(part_cat_dict)-1])

Compute frequency of category in each part and find max freq cat for each part

In [None]:
# Per-partition category frequencies: {partition: {category: count}}.
part_cat_dict_freq = {}
for e in part_cat_dict:
    cat_map_freq = count_frequency(part_cat_dict[e])
    part_cat_dict_freq.setdefault(e, cat_map_freq)


# Most frequent category of each partition.
max_part_cat = {}
for e, cat_map_freq in part_cat_dict_freq.items():
    # A partition with no categories has an empty frequency map; the previous
    # ls[0] lookup raised IndexError there, so record None for it instead.
    # max() returns the first key reaching the highest count, matching the
    # tie-breaking of the original strict "<" comparison.
    cat = max(cat_map_freq, key=cat_map_freq.get) if cat_map_freq else None
    max_part_cat.setdefault(e, cat)

In [None]:
# One line per partition: its key followed by its most frequent category.
for part, top_category in max_part_cat.items():
    print(part, top_category)

Maybe use TF-IDF across all possible categories for each cluster?

In [None]:
# Local Spark session (all cores) for the LDA steps below.
# NOTE(review): the warehouse directory is a hard-coded absolute path.
spark = (
    SparkSession.builder
    .appName('LDA')
    .config("spark.master", "local[*]")
    .config("spark.sql.warehouse.dir", "/home/ayman/warehouse")
    .getOrCreate()
)

In [None]:
# Preview: categories under key len(part_cat_dict) - 1 joined into one space-separated string.
# assumes partition keys are the integers 0..n-1 — TODO confirm
' '.join(part_cat_dict[len(part_cat_dict) - 1])

In [None]:
# One (cluster, text) row per partition. The text is the partition's categories
# joined by spaces, with underscores turned into spaces, commas removed,
# backslash-apostrophe sequences replaced by a space, and everything lower-cased.
df_ = [
    (
        cluster,
        ' '.join(categories).replace('_', ' ').replace(',','').replace('\\\'', ' ').lower(),
    )
    for cluster, categories in part_cat_dict.items()
]

for row in df_:
    print(row)

In [None]:
# Spark DataFrame with one row per cluster: columns 'cluster' and 'categories'.
partitionsData = spark.createDataFrame(df_, ['cluster', 'categories'])

In [None]:
# NOTE(review): print() of a Spark DataFrame presumably shows only its column
# names/types; use .show() to inspect rows.
print(partitionsData)

In [None]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover, CountVectorizer

In [None]:
# Tokenize each cluster's category text into the "raw" column, then filter it
# through StopWordsRemover into the "words" column.
tokenizer = Tokenizer(inputCol="categories", outputCol="raw")
wordsDirtyData = tokenizer.transform(partitionsData)
remover = StopWordsRemover(inputCol="raw", outputCol="words")
wordsData = remover.transform(wordsDirtyData)
print(wordsData)

In [None]:
# Compare the first row before ('raw') and after ('words') stop-word removal.
print(wordsData.select('raw').first())
print(wordsData.select('words').first())

In [None]:
#hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
#featurizedData = hashingTF.transform(wordsData)
#print(featurizedData)

In [None]:
# Fit a CountVectorizer on the "words" column and write its output to "rawFeatures".
# cvmodel is kept because its vocabulary is used later to print topic terms.
countVectorizer = CountVectorizer(inputCol="words", outputCol="rawFeatures")
cvmodel = countVectorizer.fit(wordsData)
featurizedData = cvmodel.transform(wordsData)
print(featurizedData)

In [None]:
# Fit IDF on "rawFeatures" and write the rescaled vectors to the "features" column.
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
print(rescaledData)

In [None]:
# Keep only the cluster id and its feature vector as the LDA input.
dataset = rescaledData.select('cluster', 'features')
print(dataset)
# truncate=False prints the full cell contents instead of shortened ones.
dataset.show(truncate=False)

In [None]:
from pyspark.ml.clustering import LDA

In [None]:
# Trains a LDA model.
# k equals the number of partitions in part_cat_dict, i.e. one topic per cluster.
# NOTE(review): no seed is passed, so results may vary between runs — confirm
# whether a fixed seed is wanted for reproducibility.
lda = LDA(k=len(part_cat_dict), maxIter=100)
ldaModel = lda.fit(dataset)

In [None]:
# Model quality figures computed on the training dataset itself.
ll = ldaModel.logLikelihood(dataset)
lp = ldaModel.logPerplexity(dataset)
print(f"The lower bound on the log likelihood of the entire corpus: {ll}")
print(f"The upper bound on perplexity: {lp}")

# Top five terms of every topic.
topics = ldaModel.describeTopics(5)
print("The topics described by their top-weighted terms:")
topics.show()

# Per-cluster topic distribution produced by the fitted model.
transformed = ldaModel.transform(dataset)
transformed.show()

Verification for one cluster:
getting the full row, the topic index and the corresponding topic information

In [None]:
# Topic distribution of the first row of the transformed dataset.
l = transformed.select('topicDistribution').first()[0]
print(transformed.first())
# Index of the largest value in that distribution.
m = list(l).index(max(l))
print('\ntopic index is :',m)
# take(m+1)[m] picks the (m+1)-th row of `topics`;
# assumes rows are ordered by topic index — TODO confirm.
print(topics.take(m+1)[m])

Printing the 5 most relevant words and their weights for each topic. 

In [None]:
# For every topic, print its five top terms (looked up in the CountVectorizer
# vocabulary) next to their weights.
topicIndices = ldaModel.describeTopics(maxTermsPerTopic = 5)
vocabList = cvmodel.vocabulary

for topic_row in topicIndices.collect():
    i, t, w = topic_row
    print('Topic %d:' % i)
    for term_index, term_weight in zip(t, w):
        print('\t', vocabList[term_index], term_weight)

https://stackoverflow.com/questions/44233862/visualizing-topics-with-spark-lda

adding weights to lda input