In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from apyori import apriori
import networkx as nx 

import seaborn as sns
import nltk
import string
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

# Wikipedia

In [16]:
# Show the cleaned text of the first document.
# NOTE(review): in the visible notebook `df` is only assigned by the pd.read_csv cell
# further down, so this cell presumably relied on earlier kernel state — confirm it
# still runs after Restart Kernel -> Run All.
df['text'][0]

'employment right woman include non discriminatory access woman job equal pay right woman men equal pay equal benefit equal work openly denied british hong kong government early 1970s leslie wah leung chung 1917 2009 president hong kong chinese civil servant association 1965 68 contributed establishment equal pay men woman including right married woman permanent employee job status woman changed permanent employee temporary employee married thus losing pension benefit even lost job since nurse mostly woman improvement right married woman meant much nursing profession european country married woman could work without consent husband decade ago example france 1965 spain 1975 addition marriage bar practice adopted late 19th century 1970s across many country including austria australia ireland canada switzerland restricted married woman employment many profession key issue towards insuring gender equality workplace respecting maternity right reproductive right woman maternity leave paterni

In [None]:
# Reload the crawl results from disk and treat them as the starting point.
df = pd.read_csv('./data/00-raw-data/wiki-crawl-results.csv')
corpus = df["text"].to_list()
print(df.shape)

from nltk.corpus import stopwords

# --- User parameters ---
input_path = 'DRACULA.txt'    # not referenced by the cells visible here
compute_sentiment = True      # when True, read_and_clean scores every sentence
sentiment = []                # populated by read_and_clean with per-sentence scores
ave_window_size = 250         # size of scanning window for moving average

# --- Output file: delete any copy left over from a previous run ---
output = 'transactions.txt'
if os.path.exists(output):
    os.remove(output)

# --- NLP helpers ---
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()
sia = SentimentIntensityAnalyzer()

# --- Stop words: NLTK's English list plus a few corpus-specific extras ---
# Note: this rebinds the imported `stopwords` name to a plain list of words.
add = ['mr','mrs','wa','dr','said','back','could','one','looked','like','know','around','dont']
stopwords = stopwords.words('english') + add

def read_and_clean(corpus,START=0,STOP=-1):
    """Convert a list of documents into per-sentence token lists ("transactions").

    Steps: join the documents, strip apostrophes, keep ``lines[START:STOP]`` of the
    joined text, drop non-printable characters, split into sentences with NLTK,
    then lowercase / clean / lemmatize every word and discard stop words.

    Parameters
    ----------
    corpus : list of str
        Documents to process; they are joined with single spaces.
    START, STOP : int
        Slice bounds applied to the list of lines of the joined text.
        Note that the default ``STOP=-1`` excludes the final line.

    Returns
    -------
    list of list of str
        One non-empty token list per sentence that still has tokens after cleaning.

    Side effects
    ------------
    Rebinds the module-level ``sentiment`` to a NumPy array with one
    ``[neg, neu, pos, compound]`` row per returned sentence (empty when
    ``compute_sentiment`` is falsy), and prints summary statistics.
    Relies on the module-level ``stopwords``, ``lemmatizer``, ``sia`` and
    ``compute_sentiment``.
    """
    global sentiment

    # Join documents, drop apostrophes (wasn't --> wasnt), keep the requested
    # line window and flatten it back to one space-separated string.
    text = ' '.join(corpus).replace("'", '')
    text = ' '.join(text.splitlines()[START:STOP])

    # Only keep characters in string.printable.
    printable = set(string.printable)
    text = ''.join(char for char in text if char in printable)

    # Break into chunks (sentences).
    sentences = nltk.tokenize.sent_tokenize(text)
    print("NUMBER OF SENTENCES FOUND:",len(sentences))

    # Digits plus the full lowercase alphabet (the previous literal omitted 'z').
    keep = set(string.digits + string.ascii_lowercase)
    stop_set = set(stopwords)   # set membership instead of scanning a list per word

    new_sentences = []
    vocabulary = set()
    for sentence in sentences:
        tokens = []
        for word in sentence.split():
            # Lowercase first so capitals are folded instead of being discarded,
            # and strip unwanted characters BEFORE lemmatizing so that trailing
            # punctuation does not prevent the lemmatizer from matching the word.
            word = ''.join(char for char in word.lower() if char in keep)
            if not word:
                continue    # the token was pure punctuation
            new_word = lemmatizer.lemmatize(word)
            if new_word in stop_set:
                continue
            tokens.append(new_word)
            vocabulary.add(new_word)

        # Skip sentences that ended up with no tokens: they would otherwise be
        # stored as [''] and inject an empty-string item into the transactions.
        if tokens:
            new_sentences.append(tokens)

    # Sentiment: collect into a fresh local list so repeated calls neither
    # accumulate old rows nor fail on an ndarray left behind by a previous call.
    scores = []
    if compute_sentiment:
        for tokens in new_sentences:
            score = sia.polarity_scores(" ".join(tokens))
            scores.append([score['neg'],score['neu'],score['pos'],score['compound']])

    sentiment = np.array(scores)
    if scores:
        # Guarded: averaging an empty array only yields nan plus a warning.
        print("TOTAL AVERAGE SENTEMENT:",np.mean(sentiment,axis=0))
    print("VOCAB LENGTH",len(vocabulary))
    return new_sentences

# Build the transactions from lines[400:-400] of the joined corpus and preview a few.
# NOTE(review): if the joined corpus has 800 lines or fewer this slice is empty —
# confirm these bounds suit this dataset.
transactions = read_and_clean(corpus, START=400, STOP=-400)
print(transactions[:5])

In [None]:
# Preview only: the first few documents, truncated, instead of dumping the whole
# corpus into the cell output.
[str(doc)[:200] for doc in corpus[:5]]

In [None]:
# INSERT CODE TO RE-FORMAT THE APRIORI OUTPUT INTO A PANDAS DATA-FRAME WITH COLUMNS "rhs","lhs","supp","conf","supp x conf","lift"
def reformat_results(results):
    """Flatten apriori output into a DataFrame with one row per rule.

    Parameters
    ----------
    results : iterable
        Records indexable as ``(items, support, ordered_statistics)``, where each
        ordered statistic is indexable as ``(set_a, set_b, confidence, lift)``.
        Any iterable works, including a generator such as the one
        ``apyori.apriori`` is documented to return; it is consumed once.

    Returns
    -------
    pandas.DataFrame
        Columns ``"rhs", "lhs", "supp", "conf", "supp x conf", "lift"``.
        Statistics whose first set is empty are skipped.

    Notes
    -----
    The ``"rhs"`` column holds the FIRST set of each statistic and ``"lhs"`` the
    SECOND. NOTE(review): in apyori's record layout the first set is presumably
    the rule's base (antecedent), so the labels look swapped — confirm. The
    column order is kept because ``convert_to_network`` reads columns by position.
    """
    columns = ["rhs","lhs","supp","conf","supp x conf","lift"]

    keep = []
    for record in results:
        record = tuple(record)
        if len(record) < 3:
            continue    # no statistics attached, so no rows to emit
        supp = record[1]
        # Every element after the support is treated as a list of statistics.
        for stats in record[2:]:
            for stat in stats:
                if len(stat[0]) == 0:
                    continue
                conf = float(stat[2])
                lift = float(stat[3])
                keep.append([list(stat[0]), list(stat[1]), supp, conf, supp*conf, lift])

    return pd.DataFrame(keep, columns=columns)

In [None]:
def convert_to_network(df):
    """Build a directed graph with one edge per row of a rules DataFrame.

    Columns are read by POSITION: column 0 and column 1 must hold iterables of
    strings (each joined with "_" to form the source and target node names) and
    column 3 is stored as the edge ``weight``. When the same (source, target)
    pair occurs more than once, the first weight seen is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Rule table, e.g. the output of ``reformat_results``.

    Returns
    -------
    networkx.DiGraph
    """
    print(df)

    # Build graph (directed).
    G = nx.DiGraph()
    # Plain tuples give true positional access; integer lookups on a
    # string-labelled row Series (row[1][0]) are deprecated in pandas.
    for row in df.itertuples(index=False, name=None):
        lhs = "_".join(row[0])
        rhs = "_".join(row[1])
        conf = row[3]

        # add_edge inserts any missing endpoint nodes itself, so they do not
        # need to be added beforehand.
        if (lhs, rhs) not in G.edges:
            G.add_edge(lhs, rhs, weight=conf)

    return G

In [None]:
def plot_network(G, title='Dracula', seed=None):
    """Draw a weighted directed graph on a 15x15 inch figure and show it.

    Parameters
    ----------
    G : networkx graph
        Every edge must carry a ``'weight'`` attribute; it sets the edge width
        and, multiplied by 10, the position sampled from the 'Blues' colormap.
    title : str, optional
        Axes title. Defaults to 'Dracula', the previously hard-coded value.
    seed : int or None, optional
        Forwarded to ``nx.random_layout``; pass an int for a reproducible
        layout. ``None`` (default) keeps the layout random on each call.
    """
    # Specify x-y positions for plotting.
    pos = nx.random_layout(G, seed=seed)

    # Generate plot.
    fig, ax = plt.subplots()
    fig.set_size_inches(15, 15)

    # Edge widths come straight from the stored weights.
    weights_e = [data['weight'] for _, _, data in G.edges(data=True)]

    # Sample the colormap for edge colours. pyplot.get_cmap replaces
    # matplotlib.cm.get_cmap, which recent matplotlib releases deprecated and removed.
    # NOTE(review): the x10 factor saturates the colormap for float weights
    # above 0.1 — confirm that is intended.
    cmap = plt.get_cmap('Blues')
    colors_e = [cmap(weight*10) for weight in weights_e]

    # Plot.
    nx.draw(
        G,
        edgecolors="black",
        edge_color=colors_e,
        node_size=2000,
        linewidths=2,
        font_size=8,
        font_color="white",
        font_weight="bold",
        width=weights_e,
        with_labels=True,
        pos=pos,
        ax=ax
    )
    ax.set(title=title)
    plt.show()

# raise

# Tweets