# Exploring Financial News using N-grams and Stop Words

In [1]:
#import libraries
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import spacy
import string

## Import Data

In [2]:
#load news data
#the source is a CSV file (single quote configured as the quote character),
#so it is read with read_csv; read_excel expects a spreadsheet workbook
data = pd.read_csv('C:/Users/beyku/CPT346 Assignment 1/data.csv', quotechar="'")

In [3]:
#preview the first five rows of the loaded table
data.head(5)

Unnamed: 0,Date,News,Source
0,2007-12-15,Credit market strains hang on banks results\n\...,https://www.reuters.com/article/us-investment-...
1,2007-12-15,Dollar likely to weather any U.S. recession\n\...,https://www.reuters.com/article/us-investment-...
2,2007-12-14,Central bank plan delays day of reckoning\n\nN...,https://www.reuters.com/article/us-investment-...
3,2007-12-14,Inflation seen hanging over 2008 world economy...,https://www.reuters.com/article/us-investment-...
4,2007-12-14,Investors see housing as top U.S. election the...,https://www.reuters.com/article/us-investment-...


In [4]:
#extract only news content
news = data['News']
#preview the first five entries of the selected column
news.head(5)

0    Credit market strains hang on banks results\n\...
1    Dollar likely to weather any U.S. recession\n\...
2    Central bank plan delays day of reckoning\n\nN...
3    Inflation seen hanging over 2008 world economy...
4    Investors see housing as top U.S. election the...
Name: News, dtype: object

## Data Preprocessing

### Remove Non-ASCII Characters 

In [5]:
#function to remove non-ascii character
def remove_non_ascii(string):
    """Return ``string`` with every non-ASCII character removed.

    A character is kept when its code point is below 128; kept characters
    stay in their original order.

    Note: the parameter name hides the imported ``string`` module inside this
    function only. It is left unchanged so existing callers keep working.
    """
    # join() assembles the result in one pass instead of growing a string
    # by repeated concatenation
    return "".join(char for char in string if ord(char) < 128)

In [6]:
#cast every entry to str before the character-level cleaning steps
news = news.astype(str)

In [7]:
#strip the non-ascii characters out of every article
news = news.map(remove_non_ascii)

In [8]:
#preview the leading entries after non-ascii removal
news.head()

0    Credit market strains hang on banks results\n\...
1    Dollar likely to weather any U.S. recession\n\...
2    Central bank plan delays day of reckoning\n\nN...
3    Inflation seen hanging over 2008 world economy...
4    Investors see housing as top U.S. election the...
Name: News, dtype: object

### Replace Punctuation, New Lines, Carriage Returns and Tabs with White Space

In [9]:
#replace every punctuation mark, carriage return, tab and newline with a space
regex = re.compile('[' + re.escape(string.punctuation) + '\\r\\t\\n]')
news = news.map(lambda text: regex.sub(" ", str(text)))

In [10]:
#preview the first five entries after punctuation replacement
news.head(5)

0    Credit market strains hang on banks results  N...
1    Dollar likely to weather any U S  recession  N...
2    Central bank plan delays day of reckoning  NEW...
3    Inflation seen hanging over 2008 world economy...
4    Investors see housing as top U S  election the...
Name: News, dtype: object

### Tokenization & Lemmatization

In [11]:
#load spacy english model
nlp = spacy.load('en_core_web_sm')

In [12]:
#tokenize every article with the loaded english model (parser and ner disabled)
#and store the lemma (base form) of each token in place of the raw text
news = news.map(
    lambda text: [token.lemma_ for token in nlp(text, disable=['parser','ner'])]
)

In [13]:
#preview the first five lemmatized token lists
news.head(5)

0    [credit, market, strain, hang, on, bank, resul...
1    [dollar, likely, to, weather, any, U, S,  , re...
2    [central, bank, plan, delay, day, of, reckon, ...
3    [inflation, see, hang, over, 2008, world, econ...
4    [investor, see, housing, as, top, U, S,  , ele...
Name: News, dtype: object

### Lowercase All Words

In [14]:
#convert every token of every article to lowercase
news = news.map(lambda tokens: [token.lower() for token in tokens])

In [15]:
#preview the leading entries after lowercasing
news.head()

0    [credit, market, strain, hang, on, bank, resul...
1    [dollar, likely, to, weather, any, u, s,  , re...
2    [central, bank, plan, delay, day, of, reckon, ...
3    [inflation, see, hang, over, 2008, world, econ...
4    [investor, see, housing, as, top, u, s,  , ele...
Name: News, dtype: object

In [16]:
#flatten the per-article token lists into one list, keeping article order
token_list = [word for new in news for word in new]

In [17]:
#inspect the first 100 tokens
token_list[:100]

['credit',
 'market',
 'strain',
 'hang',
 'on',
 'bank',
 'result',
 ' ',
 'new',
 'york',
 ' ',
 'reuters',
 '   ',
 'credit',
 'market',
 'strain',
 'may',
 'start',
 'to',
 'abate',
 'in',
 'early',
 '2008',
 'once',
 'bank',
 'get',
 'through',
 'the',
 'next',
 'round',
 'of',
 'reveal',
 'loss',
 'from',
 'risky',
 'asset',
 ' ',
 'fund',
 'manager',
 'at',
 'the',
 'reuters',
 'investment',
 'outlook',
 '2008',
 'summit',
 'say',
 'this',
 'week',
 '  ',
 'central',
 'bank',
 'plan',
 ' ',
 'unveil',
 'on',
 'wednesday',
 ' ',
 'to',
 'add',
 'temporary',
 'reserve',
 'may',
 'also',
 'help',
 'to',
 'cushion',
 'the',
 'global',
 'banking',
 'system',
 'from',
 'shock',
 'of',
 'more',
 'loss',
 'from',
 'the',
 'melt',
 'down',
 'in',
 'u',
 's',
 ' ',
 'subprime',
 'mortgage',
 ' ',
 'but',
 'for',
 'elevated',
 'short',
 'term',
 'borrowing',
 'cost',
 'to',
 'come',
 'down',
 'substantially',
 ' ',
 'market']

## Build N-grams

### Build Bigrams

In [18]:
#association measures object; its pmi measure is used to score bigrams later
bigrams = nltk.collocations.BigramAssocMeasures()
#collect bigram candidates from the flat token list
bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(token_list)

### Build Trigrams

In [19]:
#association measures object; its pmi measure is used to score trigrams later
trigrams = nltk.collocations.TrigramAssocMeasures()
#collect trigram candidates from the flat token list
trigramFinder = nltk.collocations.TrigramCollocationFinder.from_words(token_list)

## Text Analysis 

### Bigrams Frequency/Count Table

In [20]:
#tabulate every bigram with its frequency, most frequent first
bigram_count = bigramFinder.ngram_fd.items()
bigram_rows = list(bigram_count)
bigramCountTable = pd.DataFrame(bigram_rows, columns=['Bigram', 'Count'])
bigramCountTable = bigramCountTable.sort_values(by='Count', ascending=False)

In [21]:
#Top 10 count of bigrams
#slice length matches the "top 10" stated above
bigramCountTable[:10]

Unnamed: 0,Bigram,Count
192,"(of, the)",125
327,"(in, the)",122
119,"( , the)",109
484,"( , the)",89
346,"( , and)",79
1279,"(the, fed)",59
79,"(s, )",50
171,"( , say)",48
78,"(u, s)",46
432,"(have, be)",46


### Filter Bigrams Frequency/Count Table using whitespace, Stop Words and Part-of-Speech tags

In [22]:
#get english stop words
#the corpus is read through nltk.corpus so this cell can be re-run: the result
#is stored under the name `stopwords`, which replaces the imported corpus
#reader of the same name (the filter functions look up this name)
stopwords = set(nltk.corpus.stopwords.words('english'))

In [23]:
#function to filter for bigrams using whitespace, stop words and pos tags
def bigram_filter(bigram):
    """Return True when ``bigram`` should be kept, False otherwise.

    A bigram is rejected when either word is whitespace-only or when any word
    is found in the module-level ``stopwords`` collection. The remaining pairs
    are tagged with ``nltk.pos_tag`` and kept only if the first tag is in
    ``first_word_tag`` and the second tag is in ``second_word_tag``.
    """
    # whitespace tokens are checked first, before stop words or tagging
    if any(bigram[position].isspace() for position in (0, 1)):
        return False
    if any(word in stopwords for word in bigram):
        return False

    first_word_tag = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    second_word_tag = ('NN', 'NNS', 'NNP', 'NNPS')
    tagged = nltk.pos_tag(bigram)
    first_tag = tagged[0][1]
    second_tag = tagged[1][1]
    return first_tag in first_word_tag and second_tag in second_word_tag

In [24]:
#keep only the rows whose bigram passes bigram_filter
bigram_count_mask = bigramCountTable.Bigram.map(bigram_filter)
filtered_bigram_count = bigramCountTable[bigram_count_mask]

In [25]:
#top 10 filtered bigrams
#slice length matches the "top 10" stated above
filtered_bigram_count[:10]

Unnamed: 0,Bigram,Count
599,"(interest, rate)",29
8,"(new, york)",28
649,"(housing, market)",25
9355,"(sub, prime)",23
1671,"(wall, street)",23
81,"(subprime, mortgage)",22
48,"(central, bank)",22
0,"(credit, market)",19
1238,"(federal, reserve)",17
1954,"(rate, cut)",16


### Bigrams Pointwise Mutual Information Table 

In [26]:
#apply a minimum-frequency filter of 10 to the bigram finder before PMI scoring
bigramFinder.apply_freq_filter(10)

In [27]:
#score the bigrams left in the finder by pointwise mutual information,
#highest score first
bigram_scores = list(bigramFinder.score_ngrams(bigrams.pmi))
bigramPMITable = pd.DataFrame(bigram_scores, columns=['Bigram', 'PMI']).sort_values(by='PMI', ascending=False)
#show the top 10, consistent with the other top-10 listings in this notebook
bigramPMITable[:10]

Unnamed: 0,Bigram,PMI
0,"(fortune, 500)",9.970096
1,"(wall, street)",9.854619
2,"(per, cent)",9.809975
3,"(sub, prime)",9.798036
4,"(short, term)",9.720868
5,"(federal, reserve)",9.48003
6,"(percentage, point)",9.165767
7,"(new, york)",8.498475
8,"(back, security)",8.099926
9,"(last, week)",8.09682


### Filter Bigrams PMI Table using Stop Words and Part of Speech Tags 

In [28]:
#keep only the PMI rows whose bigram passes bigram_filter
bigram_pmi_mask = bigramPMITable.Bigram.map(bigram_filter)
filtered_bigram_PMI = bigramPMITable[bigram_pmi_mask]

In [29]:
#top 10 filtered bigrams pmi
#slice length matches the "top 10" stated above
filtered_bigram_PMI[:10]

Unnamed: 0,Bigram,PMI
1,"(wall, street)",9.854619
3,"(sub, prime)",9.798036
4,"(short, term)",9.720868
5,"(federal, reserve)",9.48003
6,"(percentage, point)",9.165767
7,"(new, york)",8.498475
9,"(last, week)",8.09682
13,"(central, bank)",7.325366
14,"(interest, rate)",7.245973
20,"(housing, market)",5.965379


### Trigrams Frequency/Count Table

In [30]:
#tabulate every trigram with its frequency, most frequent first
trigram_count = trigramFinder.ngram_fd.items()
trigram_rows = list(trigram_count)
trigramCountTable = pd.DataFrame(trigram_rows, columns=['Trigram', 'Count'])
trigramCountTable = trigramCountTable.sort_values(by='Count', ascending=False)

In [31]:
#top 10 count of trigrams, consistent with the bigram count listing
trigramCountTable[:10]

Unnamed: 0,Trigram,Count
80,"(u, s, )",46
437,"(the, u, s)",18
1194,"( , he, say)",15
16801,"( , charts, )",15
1195,"(he, say, )",14
1439,"(the, federal, reserve)",14
16864,"( , fortune, 500)",13
16863,"(charts, , fortune)",13
4779,"(mr, , greenspan)",12
730,"(the, housing, market)",12


### Filter Trigrams Frequency/Count Table using whitespace, Stop Words and Part-of-Speech tags 

In [32]:
#function to filter trigrams using whitespace, stop words and pos tags
def trigram_filter(trigram):
    """Return True when ``trigram`` should be kept, False otherwise.

    A trigram is rejected when any of its three words is whitespace-only or
    when any word is found in the module-level ``stopwords`` collection. The
    remaining triples are tagged with ``nltk.pos_tag`` and kept only if the
    first and the third tag are both adjective/noun tags; the middle word's
    tag is not checked.
    """
    # whitespace tokens are checked first, before stop words or tagging
    if any(trigram[position].isspace() for position in (0, 1, 2)):
        return False
    if any(word in stopwords for word in trigram):
        return False

    first_word_tag = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    third_word_tag = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    tagged = nltk.pos_tag(trigram)
    first_tag = tagged[0][1]
    third_tag = tagged[2][1]
    return first_tag in first_word_tag and third_tag in third_word_tag

In [33]:
#keep only the rows whose trigram passes trigram_filter
trigram_count_mask = trigramCountTable.Trigram.map(trigram_filter)
filtered_trigram_count = trigramCountTable[trigram_count_mask]

In [34]:
#top 10 filtered trigrams, consistent with the filtered bigram listing
filtered_trigram_count[:10]

Unnamed: 0,Trigram,Count
5850,"(mortgage, back, security)",10
13047,"(sub, prime, mortgage)",8
640,"(collateralized, debt, obligation)",6
167,"(chief, investment, officer)",6
353,"(adjustable, rate, mortgage)",6
14354,"(sub, prime, lending)",5
3041,"(subprime, mortgage, crisis)",5
382,"(money, market, fund)",5
1503,"(quarter, percentage, point)",4
40,"(reuters, investment, outlook)",4


### Trigrams Pointwise Mutual Information Table 

In [35]:
#apply a minimum-frequency filter of 5 to the trigram finder before PMI scoring
trigramFinder.apply_freq_filter(5)

In [36]:
#score the trigrams left in the finder by pointwise mutual information,
#highest score first
trigram_scores = list(trigramFinder.score_ngrams(trigrams.pmi))
trigramPMITable = pd.DataFrame(trigram_scores, columns=['Trigram', 'PMI']).sort_values(by='PMI', ascending=False)
#show the top 10, consistent with the other top-10 listings in this notebook
trigramPMITable[:10]

Unnamed: 0,Trigram,PMI
0,"(cnnmoney, com, )",23.353094
1,"(collateralized, debt, obligation)",21.382241
2,"(chief, investment, officer)",18.888426
3,"(nationally, charter, bank)",18.745231
4,"(sub, prime, lending)",17.050483
5,"(s, p, 500)",16.019671
6,"( , cnnmoney, com)",15.990274
7,"(sub, prime, mortgage)",15.854086
8,"(the, united, states)",15.570817
9,"(mortgage, back, security)",15.542034


### Filter Trigrams PMI Table using Stop Words and Part of Speech Tags 

In [37]:
#keep only the PMI rows whose trigram passes trigram_filter
trigram_pmi_mask = trigramPMITable.Trigram.map(trigram_filter)
filtered_trigram_PMI = trigramPMITable[trigram_pmi_mask]

In [38]:
#top 10 filtered trigrams pmi
#slice length matches the "top 10" stated above
filtered_trigram_PMI[:10]

Unnamed: 0,Trigram,PMI
1,"(collateralized, debt, obligation)",21.382241
2,"(chief, investment, officer)",18.888426
4,"(sub, prime, lending)",17.050483
7,"(sub, prime, mortgage)",15.854086
9,"(mortgage, back, security)",15.542034
10,"(adjustable, rate, mortgage)",14.596961
16,"(money, market, fund)",13.359318
21,"(subprime, mortgage, crisis)",12.363842


## Comparison of Bigrams and Trigrams 

In [39]:
#side-by-side comparison of the top filtered bigrams/trigrams by count and PMI
top_n = 10  #rows taken from each ranking; one place to change instead of eight slices

top_bigram_count = filtered_bigram_count[:top_n]
top_bigram_pmi = filtered_bigram_PMI[:top_n]
top_trigram_count = filtered_trigram_count[:top_n]
top_trigram_pmi = filtered_trigram_PMI[:top_n]

bi_count_pair = top_bigram_count.Bigram.values
bi_pmi_pair = top_bigram_pmi.Bigram.values
tri_count_pair = top_trigram_count.Trigram.values
tri_pmi_pair = top_trigram_pmi.Trigram.values
bi_count = top_bigram_count.Count.values
bi_pmi = top_bigram_pmi.PMI.values
tri_count = top_trigram_count.Count.values
tri_pmi = top_trigram_pmi.PMI.values

#each ranking becomes one row, then the frame is transposed so rankings are
#columns; a ranking with fewer than top_n entries leaves missing cells
table = pd.DataFrame([bi_count_pair, bi_count, bi_pmi_pair, bi_pmi, tri_count_pair, tri_count, tri_pmi_pair, tri_pmi]).T
#NOTE: the labels repeat on purpose (count ranking vs PMI ranking), so
#selecting a column by label returns every column carrying that label
table.columns = ['Bigrams', 'Count', 'Bigrams', 'PMI', 'Trigrams', 'Count', 'Trigrams', 'PMI']

In [40]:
#display the comparison table
table

Unnamed: 0,Bigrams,Count,Bigrams.1,PMI,Trigrams,Count.1,Trigrams.1,PMI.1
0,"(interest, rate)",29,"(wall, street)",9.85462,"(mortgage, back, security)",10,"(collateralized, debt, obligation)",21.3822
1,"(new, york)",28,"(sub, prime)",9.79804,"(sub, prime, mortgage)",8,"(chief, investment, officer)",18.8884
2,"(housing, market)",25,"(short, term)",9.72087,"(collateralized, debt, obligation)",6,"(sub, prime, lending)",17.0505
3,"(sub, prime)",23,"(federal, reserve)",9.48003,"(chief, investment, officer)",6,"(sub, prime, mortgage)",15.8541
4,"(wall, street)",23,"(percentage, point)",9.16577,"(adjustable, rate, mortgage)",6,"(mortgage, back, security)",15.542
5,"(subprime, mortgage)",22,"(new, york)",8.49848,"(sub, prime, lending)",5,"(adjustable, rate, mortgage)",14.597
6,"(central, bank)",22,"(last, week)",8.09682,"(subprime, mortgage, crisis)",5,"(money, market, fund)",13.3593
7,"(credit, market)",19,"(central, bank)",7.32537,"(money, market, fund)",5,"(subprime, mortgage, crisis)",12.3638
8,"(federal, reserve)",17,"(interest, rate)",7.24597,"(quarter, percentage, point)",4,,
9,"(rate, cut)",16,"(housing, market)",5.96538,"(reuters, investment, outlook)",4,,
