In [1]:
# import basic libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
import os 
# Shared Porter stemmer instance; review_to_words reads this module-level name.
stemmer = PorterStemmer()

# function to clean text
def _english_stopwords():
    """Return NLTK's English stop words as a frozen set, loading them only once.

    The set is stored on the function object after the first call, so cleaning
    many documents does not rebuild it from the corpus for every document.
    """
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words("english"))
        _english_stopwords._cache = cached
    return cached


def review_to_words(raw_review):
    """Normalise one raw text into a space-separated string of stemmed tokens.

    Processing steps:
      1. replace every character that is not an ASCII letter with a space,
      2. lower-case the text and split it on whitespace,
      3. drop English stop words,
      4. stem each remaining word with the module-level ``stemmer``,
      5. join the stems back together with single spaces.

    Parameters
    ----------
    raw_review : str or None or float
        The text to clean. ``None`` and float NaN are treated as missing text.

    Returns
    -------
    str
        The cleaned text; an empty string when the input is missing or when
        no token survives the filtering.
    """
    # Missing values would make re.sub raise TypeError; map them to an empty
    # document instead. `x != x` is true only for NaN.
    if raw_review is None or (isinstance(raw_review, float) and raw_review != raw_review):
        return ""

    # 1. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", raw_review)

    # 2. Convert to lower case, split into individual words
    words = letters_only.lower().split()

    # 3. Remove stop words (set membership is much faster than list membership)
    stops = _english_stopwords()
    meaningful_words = [w for w in words if w not in stops]

    # 4. Stem words with the shared Porter stemmer
    singles = [stemmer.stem(word) for word in meaningful_words]

    # 5. Join the words back into one string separated by space
    return " ".join(singles)

In [2]:
# Switch the working directory to the folder holding the external exports,
# so the relative CSV names below resolve against it.
path="C:/Users/adsieg/Desktop/BNP/external_data"
os.chdir(path)

# Read both source files.
data_orx = pd.read_csv('ORX_news_clean.csv')
data_sas = pd.read_csv('SAS_clean.csv')

# Keep only the free-text column of each source.
text_orx = data_orx['Digest text']
text_sas = data_sas['Description of Event']

# Stack the two text columns and renumber the rows from zero.
text_total = pd.concat([text_orx, text_sas]).reset_index(drop=True)

In [3]:
# Display the combined Series of event texts built from the two CSV files.
text_total

0        On 1 July 2014, BNP pleaded guilty to falsifyi...
1        On 25 January 2016, JPMorgan agreed to a USD 1...
2        A DEEP DIVE IS NOW AVAILABLE FOR THIS LOSS EVE...
3        A DEEP DIVE IS NOW AVAILABLE FOR THIS LOSS EVE...
4        A DEEP DIVE IS NOW AVAILABLE FOR THIS LOSS EVE...
5        On 20 October 2015, La Tribune announced that ...
6        Western Union will pay USD 651 million (EUR) a...
7        ING Bank N.V. is to pay USD 619 million (EUR 4...
8        The Bank of Tokyo-Mitsubishi UFJ has been fine...
9        Credit Suisse has agreed to forfeit USD 536 mi...
10       On 26 January 2018, the Japanese cryptocurrenc...
11       The former ABN AMRO Bank, now the Royal Bank o...
12       On 28 February 2014 MT Gox filed for bankruptc...
13       In February 2018, the US Office of the Comptro...
14       Lloyds TSB Bank has agreed to forfeit USD 350 ...
15       Barclays has paid a total of USD 300.5 million...
16       Deutsche Bank is to pay a USD 258 million (EUR.

In [5]:
# NOTE(review): str() on a pandas Series returns its printed representation
# (index labels and, presumably, shortened values), not the concatenated
# texts — confirm this is what is wanted here.
total = str(text_total)

In [7]:
# Concatenate every text into one string. join() has to be called on the
# separator string; calling the unbound str.join with the Series as its first
# argument raised "descriptor 'join' requires a 'str' object".
# Missing entries are dropped and remaining values coerced to str so that
# join() only ever receives strings.
total2 = " ".join(text_total.dropna().astype(str))

In [4]:
# Clean every event text in text_total with review_to_words.
# NOTE(review): the "wmn" suffix looks like a leftover from another dataset —
# consider renaming once downstream cells are updated.
processed_wmn = list(map(review_to_words, text_total))

# Preview the first three cleaned documents.
processed_wmn[:3]

['juli bnp plead guilti falsifi busi record conspiraci believ bnp plead guilti feder court conspiraci juli june announc bnp pariba reach usd billion eur billion court settlement us author us econom sanction violat may us judg sentenc bnp five year probat settlement relat usd billion transact conduct switzerland process countri us sanction sanction countri includ iran sudan cuba accord reuter bnp abl avoid sanction relat transfer iran cuba strip inform wire transfer allow pass us system without rais flag sudan bnp admit set elabor payment structur rout transact satellit bank disguis origin reuter report prior reach settlement bnp implement new complianc control procedur mani place time settlement includ implement new depart known group financi secur us ensur bnp global compli us regul regard intern sanction embargo ensur usd flow within bnp process control via new york branch februari bnp pariba announc set asid usd billion provis cover expect legal cost provis prompt bnp report lowest 

In [None]:
################## Word2Vec

In [None]:
# build a corpus for the word2vec model
def build_corpus(data):
    """Creates a list of lists containing words from each sentence.

    Parameters
    ----------
    data : iterable of str
        Cleaned documents whose tokens are separated by whitespace.

    Returns
    -------
    list of list of str
        One token list per input document, in the same order. An empty
        document yields an empty list.
    """
    # split() without an argument ignores empty strings and repeated spaces;
    # split(" ") turned an empty document into [''] and so injected an
    # empty-string token into the vocabulary.
    return [sentence.split() for sentence in data]

# Tokenise every cleaned document; `corpus` feeds both the Word2Vec training
# and the flattened word list used for the graph further down.
corpus = build_corpus(processed_wmn) 

In [None]:
# Bring in gensim's word2vec module.
from gensim.models import word2vec

# Train the embedding model on the tokenised corpus.
# NOTE(review): min_count=1000 discards every token seen fewer than 1000
# times — confirm the corpus is large enough for any vocabulary to remain.
# NOTE(review): `size` is the keyword used by gensim releases before 4.0
# (later renamed `vector_size`) — confirm against the installed version.
model = word2vec.Word2Vec(
    corpus,
    size=100,
    window=5,
    min_count=1000,
    workers=4,
)

In [None]:
# Show the first five tokens of the trained vocabulary.
list(model.wv.vocab)[:5]

In [None]:
# Nearest neighbours of the token 'fraud', with similarities rounded to two
# decimals. The query goes through model.wv (as the vocabulary lookup does):
# most_similar on the model object itself is deprecated and was removed in
# gensim 4, while the KeyedVectors method exists in both old and new releases.
[(word, round(score, 2)) for word, score in model.wv.most_similar('fraud')]

In [None]:
############################### Node2Vec

In [None]:
import networkx as nx
import string
import pandas as pd
from sklearn.manifold import TSNE
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from node2vec import Node2Vec
import matplotlib.pyplot as plt
import networkx

import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

import gensim

In [None]:
from functools import reduce  # kept so the name stays available to other cells

# Flatten the per-document token lists into one sequence of words.
# A single comprehension does this in linear time; reduce(lambda x, y: x + y)
# copied the growing list at every step and raised TypeError on an empty corpus.
wordList2 = [word for sentence in corpus for word in sentence]

In [None]:
# Preview only the first tokens; echoing the whole flattened list floods the
# notebook output.
wordList2[:20]

In [None]:
# Directed word graph: one node per distinct token, one edge per observed
# "word -> next word" transition in wordList2.
dG = nx.DiGraph()

# Node attribute 'count' = number of occurrences of the token.
# dG.nodes[...] replaces the dG.node[...] accessor, which was removed in
# networkx 2.4.
for word in wordList2:
    if dG.has_node(word):
        dG.nodes[word]['count'] += 1
    else:
        dG.add_node(word, count=1)

# Edge attribute 'weight' = number of times the transition was observed.
# The previous initial value float("inf") - 1 is still infinity, so every
# edge carried an infinite weight and the per-occurrence decrement had no
# effect; a finite count gives the random walks a usable weight.
# NOTE(review): wordList2 is the flattened corpus, so the last word of one
# document is linked to the first word of the next — confirm this is intended.
for word, next_word in zip(wordList2, wordList2[1:]):
    if dG.has_edge(word, next_word):
        dG[word][next_word]['weight'] += 1
    else:
        dG.add_edge(word, next_word, weight=1)

In [None]:
# Print every token with its occurrence count. The count is read through the
# nodes view because the dG.node[...] accessor was removed in networkx 2.4.
for node, count in dG.nodes(data='count'):
    print('%s:%d\n' % (node, count))

In [None]:
# List every directed (source, target) pair stored in the graph.
for source, target in dG.edges():
    print((source, target))

In [None]:
# Prepare the random-walk generator over the word graph.
# NOTE(review): no random seed is set, so the walks presumably differ between
# runs — confirm whether reproducibility matters here.
node2vec = Node2Vec(
    dG,
    dimensions=20,
    walk_length=16,
    num_walks=100,
    workers=2,
)

In [None]:
# Learn node embeddings from the generated walks.
# NOTE(review): this rebinds `model`, replacing the Word2Vec model trained on
# the corpus in the earlier cell.
model = node2vec.fit(window=10, min_count=1)

# Print the nodes closest to a token from this corpus. The earlier 'tennis'
# query looked like a leftover from another example (assumption: it is not a
# node of this graph); 'fraud' matches the Word2Vec query used above.
# most_similar is called on model.wv because the model-level method is
# deprecated and was removed in gensim 4.
for node, _ in model.wv.most_similar('fraud'):
    print(node)