# Complex Networks

Student: Hossein Ebrahimpour

ID: 98723249

Homework 1a - **Question 6**

In [172]:
import networkx as nx
from hazm import *
import matplotlib as plt
import string

def replaceLetters(text):
    """Apply a fixed list of character substitutions to ``text``.

    Alef-with-madda ("آ") becomes plain alef ("ا"); the three-dot sequence and
    the single-character ellipsis are both removed. Substitutions are applied
    in that order and the resulting string is returned.
    """
    substitutions = (("آ", "ا"), ("...", ""), ("…", ""))
    for old, new in substitutions:
        text = text.replace(old, new)
    return text

def removeSymbols(items):
    """Return a new list with every single-character symbol token dropped.

    An item is removed only when it is equal to one of the characters listed
    below; longer tokens that merely contain a symbol are kept.
    """
    symbols = tuple("«».?!؟+-()!*&:–-،,؛\"'^%$#@/<>_")
    return [item for item in items if item not in symbols]

def removeStopwords(items):
    """Return a new list containing the items that are not stop-words.

    The stop-word collection is whatever ``stopwords_list()`` returns
    (presumably hazm's list, via the star-import at the top of the notebook)
    plus a few extra tokens: "http", "https", "twitter", "این" and "ان".

    Args:
        items: Iterable of (hashable) tokens.

    Returns:
        A list with the surviving tokens in their original order.
    """
    # A set makes each membership test O(1); the original scanned a list of
    # stop-words once per token.
    stops = set(stopwords_list()) | {"http", "https", "twitter", "این", "ان"}
    return [item for item in items if item not in stops]

def makeGraph(text, clean = False, drawGraph = False):
    """Build a directed, weighted word-adjacency graph from ``text``.

    The text is normalised, passed through ``replaceLetters`` and tokenised.
    Symbol tokens are always removed; stop-words are removed only when
    ``clean`` is true. Every pair of consecutive remaining tokens ``(w1, w2)``
    becomes an edge ``w1 -> w2`` whose ``weight`` attribute counts how many
    times that pair occurs.

    Args:
        text: Raw input text.
        clean: If true, drop stop-words (``removeStopwords``) before building
            the graph.
        drawGraph: If true, draw the graph with ``nx.draw`` (with labels).

    Returns:
        The resulting ``networkx.DiGraph``. It has no nodes when fewer than
        two tokens survive the filtering.
    """
    text = replaceLetters(Normalizer().normalize(text))
    tokenizer = WordTokenizer(join_verb_parts=False, replace_links=True)
    words = tokenizer.tokenize(text)
    if clean:
        words = removeStopwords(words)
    # Filtering happens before pairing, so tokens that were separated only by
    # removed stop-words or symbols end up adjacent in the graph.
    words = removeSymbols(words)

    graph = nx.DiGraph()
    for word, nextWord in zip(words, words[1:]):
        if graph.has_edge(word, nextWord):
            graph[word][nextWord]['weight'] += 1
        else:
            graph.add_edge(word, nextWord, weight=1)

    if drawGraph:
        nx.draw(graph, with_labels=True)

    return graph

def keyPhraseList(wordsGraph):
    """Return all edges of ``wordsGraph`` as ``(source, target, weight)`` triples.

    The list is ordered by descending weight; edges with equal weight keep the
    order in which the graph reports them.
    """
    ranked = list(wordsGraph.edges.data('weight'))
    ranked.sort(key=lambda edge: edge[2], reverse=True)
    return ranked

def debugKeyPhraseList(wordsGraph, limit=15):
    """Print the heaviest word pairs of ``wordsGraph``, one per line.

    Args:
        wordsGraph: Graph accepted by ``keyPhraseList``.
        limit: Maximum number of pairs to print, taken from the start of the
            weight-sorted list. Defaults to 15, the count that was previously
            hard-coded.
    """
    for (w1, w2, wi) in keyPhraseList(wordsGraph)[:limit]:
        print(f"# Phrase '{w1} {w2}' with weight {wi}")

def addToStartMatches(parts, term, toAdd):
    """Prepend ``toAdd`` to every phrase in ``parts`` whose first word is ``term``.

    ``parts`` is a list of word lists; matching phrases are modified in place.
    Returns True if at least one phrase was extended, otherwise False.
    """
    extended = 0
    for phrase in parts:
        if phrase[0] != term:
            continue
        phrase.insert(0, toAdd)
        extended += 1
    return extended > 0

def addToEndMatches(parts, term, toAdd):
    """Append ``toAdd`` to every phrase in ``parts`` whose last word is ``term``.

    ``parts`` is a list of word lists; matching phrases are modified in place.
    Returns True if at least one phrase was extended, otherwise False.
    """
    extended = 0
    for phrase in parts:
        if phrase[-1] != term:
            continue
        phrase.append(toAdd)
        extended += 1
    return extended > 0

def keyPhrase(wordsGraph, input_threshould=6, output_threshould=3):
    """Assemble a key phrase from the heaviest edges of ``wordsGraph``.

    The ``input_threshould`` heaviest ``(w1, w2)`` edges are visited in order.
    Each edge is first appended to every chain ending in ``w1``; if there is
    none, it is prepended to every chain starting with ``w2``; if that fails
    too, it starts a new two-word chain.

    Only chains longer than ``output_threshould`` words are returned; when no
    chain is that long, all chains are returned. Chains are joined with single
    spaces into one string.
    """
    chains = []  # list of word lists
    for head, tail, _weight in keyPhraseList(wordsGraph)[:input_threshould]:
        # `or` short-circuits: prepending is attempted only if appending failed.
        extended = (addToEndMatches(chains, head, tail)
                    or addToStartMatches(chains, tail, head))
        if not extended:
            chains.append([head, tail])

    longChains = [chain for chain in chains if len(chain) > output_threshould]
    selected = longChains or chains
    return " ".join(" ".join(chain) for chain in selected)

In [5]:
import csv

data1_path = "t012010.csv"
data2_path = "t022020.csv"


def loadTweetsText(path):
    """Read a tweets CSV file and return all tweet texts as one string.

    The tweet text is taken from the second column (index 1) of every row.
    Rows with fewer than two columns are skipped. Tweets are joined with a
    newline so that the last word of one tweet is not fused with the first
    word of the next one.

    Args:
        path: Path of the UTF-8 encoded CSV file.

    Returns:
        The newline-joined tweet texts ("" if the file has no usable rows).
    """
    tweets = []
    with open(path, newline='', encoding='utf-8') as csvfile:
        for row in csv.reader(csvfile):
            # Explicit length check instead of a bare `except: pass`.
            if len(row) > 1:
                tweets.append(row[1])
    return "\n".join(tweets)


# load data of first csv
tweets1_text = loadTweetsText(data1_path)
print(f"Tweets of data {data1_path} loaded.")

# load data of second csv
tweets2_text = loadTweetsText(data2_path)
print(f"Tweets of data {data2_path} loaded.")

Tweets of data t012010.csv loaded.
Tweets of data t022020.csv loaded.


In [53]:
# Word graphs with stop-words kept (clean=False is makeGraph's default).
rawGraph1 = makeGraph(tweets1_text, clean=False)
print("Created graph of tweets of data1 without cleaning stopwords.")
rawGraph2 = makeGraph(tweets2_text, clean=False)
print("Created graph of tweets of data2 without cleaning stopwords.")

Created graph of tweets of data1 without cleaning stopwords.
Created graph of tweets of data2 without cleaning stopwords.


In [54]:
# Word graphs of the same texts, with stop-words removed before pairing.
cleansedGraph1 = makeGraph(text=tweets1_text, clean=True)
print("Created graph of tweets of data1, cleaned stopwords.")
cleansedGraph2 = makeGraph(text=tweets2_text, clean=True)
print("Created graph of tweets of data2, cleaned stopwords.")

Created graph of tweets of data1, cleaned stopwords.
Created graph of tweets of data2, cleaned stopwords.


## Key phrases without stop-word removal

### Data 1

In [183]:
# Key phrase of data set 1, computed on the graph that still has stop-words.
key = keyPhrase(wordsGraph=rawGraph1)
print(f"Key phrase: {key}")

Key phrase: محاسباتی و اینده نگری و


### Data 2

In [182]:
# Key phrase of data set 2, computed on the graph that still has stop-words.
key = keyPhrase(wordsGraph=rawGraph2)
print(f"عبارت: {key}")

عبارت: اصل عدم قطعیت هایزنبرگ


## Key phrases with stop-word removal

### Data 1

In [181]:
# Key phrase of data set 1, computed on the stop-word-free graph.
key = keyPhrase(wordsGraph=cleansedGraph1)
print(f"عبارت: {key}")

عبارت: اینده نگری پز قدرت محاسباتی محاسباتی اینده‌نگری امریکایی‌ها امریکا ضعف


In [180]:
# Key phrase of data set 2, computed on the stop-word-free graph.
# Note: "عدم" does not appear in the result because it is in Hazm's
# stop-word list and is therefore removed.
key = keyPhrase(wordsGraph=cleansedGraph2)
print(f"عبارت: {key}")

عبارت: اصل قطعیت هایزنبرگ براورد قطعیت استاندارد شرایط قطعیت فیزیک کوانتوم
