In [5]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Dealing with Texts

Suppose we have a text, like a random article from Wikipedia (see below). Can we figure out what the text is about without reading it?

In [596]:
url = 'https://en.wikipedia.org/wiki/Special:Random'
b = BeautifulSoup(requests.get(url).content, "html.parser")

In [597]:
l = b.find(class_='mw-content-ltr').find(class_='mw-parser-output').find_all(["p"])
text = ' '.join([x.text.strip() for x in l])
text

'Heilsbronn Abbey was a Cistercian monastery at Heilsbronn in the district of Ansbach in Middle Franconia, Bavaria, Germany. It was part of the Diocese of Eichstätt. It was founded in 1132–33 by Saint Otto of Bamberg and was settled by monks from Ebrach Abbey, under the first abbot Rapotho. It was one of the wealthiest monasteries of Germany, with possessions around Franconia as far as Regensburg and in Württemberg. These rich endowments were mostly made by the dukes of Abenberg and their heirs, the Hohenzollern Burgraves of Nuremberg. It was the hereditary burial-place of the Hohenzollern family and ten burgraves of Nuremberg, five margraves and three electors of Brandenburg, besides many other persons of note, were buried here. Heilsbronn was a flourishing monastery until the time of the Reformation. In 1530 Abbot John Schopper (1529–1540) founded a monastic school here, which later became a Protestant school for princes, and the doctrines of Luther gradually found favour in the mona

## A bag of words

The idea here is to split the text into words and count each word. 

In [598]:
# Split the text into words. This produces multiple 'artifacts'
words = text.split(' ')
words

['Heilsbronn',
 'Abbey',
 'was',
 'a',
 'Cistercian',
 'monastery',
 'at',
 'Heilsbronn',
 'in',
 'the',
 'district',
 'of',
 'Ansbach',
 'in',
 'Middle',
 'Franconia,',
 'Bavaria,',
 'Germany.',
 'It',
 'was',
 'part',
 'of',
 'the',
 'Diocese',
 'of',
 'Eichstätt.',
 'It',
 'was',
 'founded',
 'in',
 '1132–33',
 'by',
 'Saint',
 'Otto',
 'of',
 'Bamberg',
 'and',
 'was',
 'settled',
 'by',
 'monks',
 'from',
 'Ebrach',
 'Abbey,',
 'under',
 'the',
 'first',
 'abbot',
 'Rapotho.',
 'It',
 'was',
 'one',
 'of',
 'the',
 'wealthiest',
 'monasteries',
 'of',
 'Germany,',
 'with',
 'possessions',
 'around',
 'Franconia',
 'as',
 'far',
 'as',
 'Regensburg',
 'and',
 'in',
 'Württemberg.',
 'These',
 'rich',
 'endowments',
 'were',
 'mostly',
 'made',
 'by',
 'the',
 'dukes',
 'of',
 'Abenberg',
 'and',
 'their',
 'heirs,',
 'the',
 'Hohenzollern',
 'Burgraves',
 'of',
 'Nuremberg.',
 'It',
 'was',
 'the',
 'hereditary',
 'burial-place',
 'of',
 'the',
 'Hohenzollern',
 'family',
 'and',
 

In [599]:
# Let's clean the words
import re
words = [re.sub('[\W]+','',w).lower().strip() for w in words]
words

['heilsbronn',
 'abbey',
 'was',
 'a',
 'cistercian',
 'monastery',
 'at',
 'heilsbronn',
 'in',
 'the',
 'district',
 'of',
 'ansbach',
 'in',
 'middle',
 'franconia',
 'bavaria',
 'germany',
 'it',
 'was',
 'part',
 'of',
 'the',
 'diocese',
 'of',
 'eichstätt',
 'it',
 'was',
 'founded',
 'in',
 '113233',
 'by',
 'saint',
 'otto',
 'of',
 'bamberg',
 'and',
 'was',
 'settled',
 'by',
 'monks',
 'from',
 'ebrach',
 'abbey',
 'under',
 'the',
 'first',
 'abbot',
 'rapotho',
 'it',
 'was',
 'one',
 'of',
 'the',
 'wealthiest',
 'monasteries',
 'of',
 'germany',
 'with',
 'possessions',
 'around',
 'franconia',
 'as',
 'far',
 'as',
 'regensburg',
 'and',
 'in',
 'württemberg',
 'these',
 'rich',
 'endowments',
 'were',
 'mostly',
 'made',
 'by',
 'the',
 'dukes',
 'of',
 'abenberg',
 'and',
 'their',
 'heirs',
 'the',
 'hohenzollern',
 'burgraves',
 'of',
 'nuremberg',
 'it',
 'was',
 'the',
 'hereditary',
 'burialplace',
 'of',
 'the',
 'hohenzollern',
 'family',
 'and',
 'ten',
 'bur

In [607]:
# It's time to count them. let's find the top 5 words that occur most often in the text
from collections import Counter # this will produce a dictionary with word counts

word_counts = Counter(words)
word_counts = list(zip(word_counts.keys(), word_counts.values())) # tear up the dictionary and sort it
sorted(word_counts, key = lambda x:-x[1])[:10]

[('the', 24),
 ('of', 19),
 ('and', 12),
 ('in', 12),
 ('was', 11),
 ('a', 9),
 ('heilsbronn', 8),
 ('it', 5),
 ('citation', 5),
 ('to', 5)]

## Stop words

The most common English words are: 'the', 'be', 'to', 'of', 'and', ... Interestingly, they are distributed according to Zipf law:


| Rank        | # of occurrences  | word | # of occurrences Zipf |
| ----------- | ---------- | ---- | ---- | ----- |
| 1 | $N$ | the | 12 |
| 2 | $\frac{N}{2}$ | in | 6 |
| 3 | $\frac{N}{3}$ | a | 4 |
| 4 | $\frac{N}{4}$ | of | 3 |
| - | - | - | - |
| k | $\frac{N}{k}$ | - | - |

Anyways, the common English words are not helpful here. They must be excluded from our analysis. So, let's download a list of English stop words:

In [608]:
# let's get rid of the stop words
#%%bash
url_stop_words = "https://gist.githubusercontent.com/sebleier/554280/raw/7e0e4a1ce04c2bb7bd41089c9821dbcf6d0c786c"
stop_words = requests.get(url_stop_words).content.decode('utf-8').split('\n')

In [609]:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'her',
 'hers',
 'herself',
 'it',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'no',
 'nor',
 '

In [611]:
# 2nd attempt -- remove all counts of stop words from the words
word_counts2 = words[:] # first, make a copy of the list

for x in range(len(word_counts2)): # update the counters
    word_counts2 = Counter(word_counts2)-Counter(stop_words)

word_counts2 = list(zip(word_counts2.keys(), word_counts2.values())) # tear up the dictionary, and sort it
sorted(word_counts2, key = lambda x:-x[1])[:15] # the most frequent 20 words in the document

[('heilsbronn', 8),
 ('citation', 5),
 ('abbey', 4),
 ('monastery', 4),
 ('1em', 4),
 ('centermwparseroutput', 4),
 ('catholic', 3),
 ('ed', 3),
 ('abbot', 3),
 ('public', 2),
 ('nuremberg', 2),
 ('germany', 2),
 ('franconia', 2),
 ('text', 2),
 ('ceased', 2)]

Do we have a better understanding now of what the document is about? 

# Using sklearn library

Here we revisit the previous task, but this time we will use a class CountVectorizer to do the same thing as before. Next, we will look at TfidfVectorizer class that goes further by finding term frequency and multiplying it by inverse document frequency for each word (or ngram).

## CountVectorizer

This class nicely encapsulates everything that we have done before (and it does more).

In [618]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()

X = cv.fit_transform([text]) # Note the square brackets here. cv expects a collection of texts!
X.shape # Notice the shape of the array -- it is 2D!

(1, 292)

In [621]:
X.toarray()

array([[ 3,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  4,  1,  1,  1,  1,  2,  1,  1,  2,  1,  1,  1,  3,
         1,  1,  1,  2,  1,  3,  1,  4,  3,  1,  1,  1,  1,  4,  1,  1, 12,
         1,  1,  1,  1,  3,  3,  2,  1,  9,  1,  1,  1,  2,  1,  1,  1,  1,
         2,  1,  1,  1,  1,  2,  1,  1,  1,  4,  1,  3,  1,  2,  4,  1,  1,
         1,  1,  1,  6,  1,  2,  3,  4,  1, 20,  1,  1,  1,  1,  1,  1,  2,
         1,  1,  2,  1,  1,  1,  1,  3,  1,  1,  1,  1,  1,  1,  2,  1,  1,
         1,  1,  1,  1,  1,  2,  1,  4,  2,  1,  1,  2,  2,  1,  3,  2,  1,
         1,  2,  2,  2,  1,  1,  8,  1,  1,  1,  2,  1,  1,  1,  2,  1,  1,
         1, 12,  2,  5,  5,  1,  1,  4,  1,  1,  1,  1,  4,  1,  1, 10,  2,
         1,  1,  2,  1,  1,  1,  1,  1,  1,  1,  1,  4,  1,  1,  1,  2, 22,
         1,  1,  1,  1,  4,  2,  1,  2,  2, 19,  1,  1,  1,  4,  1,  1,  1,
        22,  3,  1, 22,  1,  1,  1,  4,  1,  4,  1,  1,  1,  1,  1,  1,  1,
         1, 

In [623]:
X = cv.fit_transform([text,'anton wants to walk'])
X.shape

(2, 295)

In [624]:
X.toarray()[1,:]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [626]:
len(cv.get_feature_names()) # this is the vocabulary, i.e. the words CountVectorizer found in the text

295

In [627]:
cv.get_feature_names()

['10',
 '100',
 '1132',
 '11th',
 '12px',
 '13',
 '14th',
 '1529',
 '1530',
 '1540',
 '1543',
 '1549',
 '1555',
 '1562',
 '1578',
 '16',
 '1631',
 '1851',
 '1866',
 '1911',
 '1913',
 '1em',
 '1px',
 '20',
 '212',
 '213',
 '2em',
 '31',
 '33',
 '33778',
 '33aa33',
 '3em',
 '47',
 '49',
 '4c',
 '555',
 '65',
 '79194',
 '95',
 '9px',
 'aa',
 'abbey',
 'abbot',
 'abbots',
 'abenberg',
 'albrecht',
 'alexius',
 'alt',
 'although',
 'an',
 'and',
 'ansbach',
 'anton',
 'any',
 'appleton',
 'around',
 'article',
 'as',
 'at',
 'author',
 'background',
 'bamberg',
 'basilica',
 'bavaria',
 'be',
 'became',
 'belief',
 'besides',
 'between',
 'border',
 'bottom',
 'brandenburg',
 'britannica',
 'buildings',
 'burgraves',
 'burial',
 'buried',
 'but',
 'by',
 'cambridge',
 'catholic',
 'catholicism',
 'ceased',
 'center',
 'century',
 'charles',
 'chisholm',
 'church',
 'cistercian',
 'citation',
 'cite',
 'code',
 'color',
 'commons',
 'coordinates',
 'cs1',
 'cursor',
 'd6',
 'des',
 'didactic

## Adding stop words

Let's take care of the stop words

In [631]:
cv = CountVectorizer(stop_words = 'english') # we can pass them as a parameter

X = cv.fit_transform([text])

In [632]:
X.toarray()

array([[ 3,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  4,  1,  1,  1,  1,  2,  1,  1,  2,  1,  1,  1,  3,
         1,  1,  1,  2,  1,  3,  1,  4,  3,  1,  1,  1,  1,  4,  1,  1,  3,
         1,  9,  1,  1,  1,  1,  2,  1,  1,  1,  2,  1,  1,  1,  3,  1,  2,
         4,  1,  1,  1,  1,  1,  6,  1,  2,  3,  4,  1, 20,  1,  1,  1,  1,
         1,  1,  2,  1,  1,  2,  1,  1,  1,  1,  3,  1,  1,  1,  1,  1,  1,
         2,  1,  1,  1,  1,  1,  1,  1,  4,  1,  2,  2,  1,  2,  1,  1,  2,
         2,  1,  8,  1,  1,  1,  1,  1,  2,  1,  1,  1,  2,  5,  1,  4,  1,
         1,  1,  4,  1,  1, 10,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  4,
         1,  1,  1, 22,  1,  1,  1,  1,  2,  1,  4,  1,  1, 22,  3,  1, 22,
         1,  1,  4,  1,  4,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  2,  1,
         1,  2,  1,  1,  4,  4,  1,  2,  1,  7,  1,  1,  1,  1,  2,  1,  1,
         1,  1,  3,  2,  1,  4,  1,  1,  1,  8,  1,  2,  4,  1,  1,  1,  1,
         4, 

In [633]:
len(cv.get_feature_names()) # note that the vocabulary is now smaller

237

## Dealing with multiple texts

Let's get serious and (attempt to) analyze some real texts. We will be dealing with excerpts from ISIS religious texts. The dataset is available at ../data/ISIS_Religious_Texts.csv.

In [636]:
df = pd.read_csv('../data/ISIS_Religious_Texts.csv', encoding='cp1252')
df.Quote.head(20)

0     The spark has been lit here in Iraq, and its h...
1     The Hour will not be established until the Rom...
2     The spark has been lit here in Iraq, and its h...
3     O Muslims everywhere, glad tidings to you and ...
4     O Ummah of Islam, indeed the world today has b...
5     Therefore, rush O Muslims to your state. Yes, ...
6     We make a special call to the scholars, fuqaha...
7     Abdullah Ibn Amr narrated that the Prophet (sa...
8     {And do not be like the one who undoes the thr...
9     In explaining this verse, the scholars of tafs...
10    Abdullah Ibn Umar (radiyallahu anhuma) narrate...
11    In a part of the hadith of Hudhayfah (radiyall...
12    Qays Ibn Hazim narrated that Abu Bakr entered ...
13    {He (Ibrahim) pleaded, And also (leaders) from...
14    {(Allah) said, My covenant does not include th...
15    A number of scholars used this verse as eviden...
16    {And (remember) when Ibrahim was tried by his ...
17    Ibn Kathir (rahimahullah) states, His (sub

In [638]:
df.head()

Unnamed: 0,Magazine,Issue,Date,Type,Source,Quote,Purpose,Article Name
0,Dabiq,1.0,Jun-14,Jihadist,Abu Mus'ab az-Zarqawi,"The spark has been lit here in Iraq, and its h...",Support,First Page
1,Dabiq,1.0,Jun-14,Hadith,Sahih Muslim,The Hour will not be established until the Rom...,Support,Introduction
2,Dabiq,1.0,Jun-14,Jihadist,Abu Mus'ab az-Zarqawi,"The spark has been lit here in Iraq, and its h...",Support,Introduction
3,Dabiq,1.0,Jun-14,Jihadist,Abu Bakr al-Baghdadi,"O Muslims everywhere, glad tidings to you and ...",Support,Khilafah Declared
4,Dabiq,1.0,Jun-14,Jihadist,Abu Bakr al-Baghdadi,"O Ummah of Islam, indeed the world today has b...",Support,The World has Divided into Two Camps


In [635]:
cv = CountVectorizer(stop_words = stop_words) # Use CountVectorizer

X = cv.fit_transform(list(df.dropna().Quote))
X

<826x5940 sparse matrix of type '<class 'numpy.int64'>'
	with 26320 stored elements in Compressed Sparse Row format>

In [223]:
X.toarray() # this is a sparse array, due to vast amount of zeros

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Term Frequency is not enough

There may be words like 'Allah' that appear in most texts in ISIS literature. Therefore, their informational value has to be lower. To accomplish this, we can compute the document frequency of each word and scale adjust it's score:
* Term Frequency ($TF_{word}$) -- frequency of *word* in the document
* Document Frequency ($DF_{word}$) -- how many documents in the corpus contain *word*
* Inverted Document Frequency ($IDF_{word} = log\left(\frac{N}{DF_{word}}\right)$) -- logarighm of 1/DF
* N -- number of documents in corpus

$tfidf_{word} = TF_{word}*log\left(\frac{N}{DF_{word}}\right)$, where TF -- term frequency and DF  

So, for instance, suppose that 'Allah' appears in almost every document in the corpus. It's DF will be close to N, hence $log\left(\frac{N}{DF_{allah}}\right) \approx 0$ and the score will be very low.

In [640]:
# TfidfVectorizer will do the transformation for us
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(stop_words = stop_words)

X = tf.fit_transform(list(df.dropna().Quote))
X

<826x5940 sparse matrix of type '<class 'numpy.float64'>'
	with 26320 stored elements in Compressed Sparse Row format>

In [641]:
tf.vocabulary_

{'bir': 886,
 'midst': 3495,
 'unlike': 5606,
 'plural': 4072,
 'heed': 2620,
 'surrender': 5228,
 'appoint': 540,
 'tanw': 5287,
 'occurred': 3803,
 'since': 4939,
 'pleasing': 4062,
 'marry': 3422,
 'rage': 4322,
 'waged': 5706,
 'harden': 2570,
 'iranian': 2968,
 'murder': 3625,
 'jihadi': 3048,
 'feeding': 2187,
 'realities': 4377,
 'months': 3566,
 'distance': 1746,
 'npr': 3755,
 'asia': 610,
 'agent': 341,
 'heedless': 2621,
 'affirmation': 328,
 'adherents': 286,
 'rapidly': 4357,
 'tawbah': 5304,
 'went': 5772,
 'slaughtering': 4972,
 'ahkamulqur': 357,
 'sat': 4736,
 'hakeem': 2543,
 'ut': 5642,
 'vow': 5699,
 'animals': 493,
 'program': 4188,
 'offered': 3816,
 'tested': 5339,
 'foundation': 2339,
 'qaida': 4266,
 'seem': 4799,
 'financial': 2231,
 'ago': 349,
 'diplomats': 1671,
 'abandoners': 189,
 'explicated': 2079,
 'easier': 1848,
 'money': 3562,
 'leadership': 3216,
 'deep': 1509,
 'attitude': 678,
 'embraced': 1890,
 'ism': 2991,
 'booty': 933,
 'rapid': 4356,
 'stro

In [642]:
tf.idf_ # This is the inverse document frequency of each word

array([ 6.61919241,  5.63836315,  6.33151033, ...,  7.02465751,
        7.02465751,  7.02465751])

In [645]:
idf_list = list(zip(tf.get_feature_names(),tf.idf_)) # list of features (words) with their IDF
idf_list[:15]

[('000', 6.619192406355582),
 ('10', 5.638363153343855),
 ('100', 6.3315103339038004),
 ('103', 6.1083667825895906),
 ('104', 7.0246575144637458),
 ('104ah', 6.619192406355582),
 ('105', 6.1083667825895906),
 ('107', 7.0246575144637458),
 ('108', 6.619192406355582),
 ('11', 5.3199094222253205),
 ('110', 6.619192406355582),
 ('110ah', 7.0246575144637458),
 ('111', 6.619192406355582),
 ('112', 6.3315103339038004),
 ('113', 6.619192406355582)]

In [650]:
sorted(idf_list, key = lambda x:x[1])[:15] # these are the most common words across the text corpus

[('allah', 1.546104097612776),
 ('said', 1.7188681330770084),
 ('al', 2.0618128842038388),
 ('hu', 2.3333096322346023),
 ('wa', 2.3471666668960287),
 ('people', 2.3754704430588802),
 ('alayhi', 2.3996847011794751),
 ('sallam', 2.404598715981904),
 ('sallall', 2.4499465359603629),
 ('ibn', 2.6118592211231109),
 ('messenger', 2.6679486877741541),
 ('indeed', 2.713858389078232),
 ('one', 2.7273721082449551),
 ('upon', 2.9642145039173267),
 ('would', 3.0449758605617849)]

In [651]:
def get_doc_scores(doc_no):
    return list(zip(tf.get_feature_names(),list(X[doc_no].toarray().reshape(-1))))

get_doc_scores(1)[:5]

[('000', 0.0), ('10', 0.0), ('100', 0.0), ('103', 0.0), ('104', 0.0)]

In [653]:
def get_top_words(doc_no, num_words = 5):
    scores = get_doc_scores(doc_no)
    return sorted(scores, key = lambda x:-x[1])[:num_words]

get_top_words(0, 10)

[('spark', 0.32609825919057345),
 ('burns', 0.32076466113201402),
 ('heat', 0.32076466113201402),
 ('intensify', 0.32076466113201402),
 ('lit', 0.31131557763170037),
 ('dabiq', 0.30313040919626949),
 ('armies', 0.29591057896208078),
 ('crusader', 0.289452220742691),
 ('continue', 0.28088513388229702),
 ('permission', 0.27827632702633631)]

In [655]:
get_top_words(1, 10)

[('leave', 0.27940078653209638),
 ('third', 0.2637816425316632),
 ('shaytan', 0.22376581903664558),
 ('melt', 0.20168607404529448),
 ('conquer', 0.18877025119986832),
 ('false', 0.17960632905394336),
 ('ranks', 0.16946219607346816),
 ('romans', 0.16946219607346816),
 ('families', 0.16669050620851722),
 ('never', 0.13645817285756195)]

In [657]:
get_top_words(50)

[('curse', 0.48858369093161591),
 ('hajar', 0.38779923529870036),
 ('ibn', 0.33163594941494356),
 ('arab', 0.32713709419136733),
 ('heretic', 0.29731425865572386)]

## CountVectorizer and TfidfVectorizer tweaks

In [658]:
# Supplying 'english' instead of stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(
                    stop_words = 'english', # supply internal stop words vocabulary for English
                    min_df = 2, # minimum document frequency, i.e. words that occur in at least two documents
                    #max_df = ?, #maximum document frequency
                    #max_features = 100, # restrict vocabulary to ??? most frequent words
                    )

X = tf.fit_transform(list(df.dropna().Quote))
X

<826x2934 sparse matrix of type '<class 'numpy.float64'>'
	with 20847 stored elements in Compressed Sparse Row format>

### NGrams and Word Vectors

![](https://cdn-images-1.medium.com/max/1600/0*XMW5mf81LSHodnTi.png)

In [663]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(
                    stop_words = 'english', # supply internal stop words vocabulary for English
                    ngram_range = (1,2), # search for word combinations from ? to ? words
                    min_df = 5 # let's only show word combinations appearing in at least ? documents
                    )

X = tf.fit_transform(list(df.dropna().Quote))
X

<826x1392 sparse matrix of type '<class 'numpy.float64'>'
	with 19770 stored elements in Compressed Sparse Row format>

In [665]:
get_top_words(2, 10)

[('burns crusader', 0.22846046649877758),
 ('continue intensify', 0.22846046649877758),
 ('heat continue', 0.22846046649877758),
 ('intensify allah', 0.22846046649877758),
 ('iraq heat', 0.22846046649877758),
 ('lit iraq', 0.22846046649877758),
 ('permission burns', 0.22846046649877758),
 ('spark', 0.22846046649877758),
 ('spark lit', 0.22846046649877758),
 ('burns', 0.22472381269510502)]

## Topics and Visualization

Below we will try to further understand the texts by allocating topics and visualizing them

In [549]:
cv = TfidfVectorizer(stop_words = 'english')
X = cv.fit_transform(list(df.dropna().Quote))
X

<826x5796 sparse matrix of type '<class 'numpy.float64'>'
	with 23709 stored elements in Compressed Sparse Row format>

### Latent Dirichlet Allocation

This is a non-supervised machine learning technique that converts an NxM matrix to a NxT matrix, where N -- number of documents, M -- vocabulary size, T -- number of topics. 

The latter matrix (NxT) `assigns` each document to a topic.

The number of topics can be chosen freely

In [550]:
from sklearn.decomposition import LatentDirichletAllocation

n_topics = 5 # let's find ? topics
max_iter = 2500

lda_model = LatentDirichletAllocation(n_components = n_topics, verbose=1, evaluate_every=5, max_iter=max_iter)
X_topics = lda_model.fit_transform(X)

iteration: 1 of max_iter: 2500
iteration: 2 of max_iter: 2500
iteration: 3 of max_iter: 2500
iteration: 4 of max_iter: 2500
iteration: 5 of max_iter: 2500, perplexity: 26304.9798
iteration: 6 of max_iter: 2500
iteration: 7 of max_iter: 2500
iteration: 8 of max_iter: 2500
iteration: 9 of max_iter: 2500
iteration: 10 of max_iter: 2500, perplexity: 23184.2006
iteration: 11 of max_iter: 2500
iteration: 12 of max_iter: 2500
iteration: 13 of max_iter: 2500
iteration: 14 of max_iter: 2500
iteration: 15 of max_iter: 2500, perplexity: 20339.8410
iteration: 16 of max_iter: 2500
iteration: 17 of max_iter: 2500
iteration: 18 of max_iter: 2500
iteration: 19 of max_iter: 2500
iteration: 20 of max_iter: 2500, perplexity: 19002.7451
iteration: 21 of max_iter: 2500
iteration: 22 of max_iter: 2500
iteration: 23 of max_iter: 2500
iteration: 24 of max_iter: 2500
iteration: 25 of max_iter: 2500, perplexity: 18223.3987
iteration: 26 of max_iter: 2500
iteration: 27 of max_iter: 2500
iteration: 28 of max_iter

In [666]:
X_topics.shape # this is the assignment of texts to topics

(826, 5)

In [667]:
X_topics[0,:] # the major topic of the first document is #2 (zero based)

array([ 0.04562792,  0.81754029,  0.04561923,  0.04560722,  0.04560535])

In [553]:
np.argmax(X_topics[0,:]) # this is a programmatic way of finding the topic of the first document

1

In [554]:
topics = np.argmax(X_topics, axis = 1) # the topics of each document
_lda_keys = list(topics)
_lda_keys[:5]

[1, 1, 1, 1, 1]

### Finding Key Words per Topics

We have allocated topics for each document. Can we interpret what each topic is about?

In [555]:
lda_model.components_.shape # this matrix allocates topics by words in vocabulary 

(5, 5796)

In [556]:
lda_model.components_[0,:] # this vector shows the 'importance' of each word in topic 0

array([ 0.20002382,  0.20012466,  0.53078406, ...,  0.20004456,
        0.20003042,  0.20003814])

In [557]:
n_top_words = 5
np.argsort(-lda_model.components_[0,:])[:n_top_words] # let's sort the words and take the first n_top_words

array([ 420,  383,  470, 4651,  468])

In [558]:
# cv.get_feature_names() -- this is vocabulary, which is an ndarray of words
# ... and we use the previous indices to access the vocabulary
np.array(cv.get_feature_names())[np.argsort(-lda_model.components_[0,:])[:n_top_words]]

array(['allah', 'al', 'angels', 'say', 'anfal'],
      dtype='<U16')

In [669]:
# let's wrap up the code above to make a function that produces topic summaries:

def get_topic_summaries(lda = lda_model, cv = cv, n_top_words = 3):
    n_topics = lda.components_.shape[0]
    topic_summaries = []
    for t in range(n_topics):
        topic_words = list(np.array(cv.get_feature_names())[np.argsort(-lda.components_[t,:])[:n_top_words]])
        topic_summaries.append(' '.join(topic_words))
    return topic_summaries

topic_summaries = get_topic_summaries(lda_model, cv, 3)
topic_summaries

['aljuwayn akin analyst',
 'aljuwayn ridiculed highest',
 'per aljuwayn akin',
 'sealed fatir 76',
 'hab syrian 14']

### Visualize the texts

In order to visualize, we need to compress the space further to make 2 dimensions. One algorithm to help us here is t-Distributed Stochastic Neighborhood Embedding (tSNE). Here is more information with examples: https://lvdmaaten.github.io/tsne/

Here is an illustration of algorithm work on MNIST (hand-written digits) dataset:

![](https://lvdmaaten.github.io/tsne/examples/mnist_tsne.jpg)

In [560]:
from sklearn.manifold import TSNE
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca', perplexity = 1.3)
tsne_lda = tsne_model.fit_transform(X_topics)

[t-SNE] Computing 4 nearest neighbors...
[t-SNE] Indexed 826 samples in 0.001s...
[t-SNE] Computed neighbors for 826 samples in 0.004s...
[t-SNE] Computed conditional probabilities for sample 826 / 826
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 84.571693
[t-SNE] KL divergence after 1000 iterations: 0.499274


### Visualization (finally)

Parts of the code below were taken from here: https://shuaiw.github.io/2016/12/22/topic-modeling-and-tsne-visualzation.html

In [671]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import bokeh.plotting as bp
from bokeh.plotting import save
from bokeh.models import HoverTool

In [672]:
colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
    "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
    "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
    "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5"
])

plot_lda = bp.figure(plot_width=800, plot_height=600,
                     title='ISIS Quotes',
                     tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

plot_lda.scatter(x=tsne_lda[:, 0], y=tsne_lda[:, 1],
                 color=colormap[_lda_keys],
                 source=bp.ColumnDataSource({
                   "content": df.dropna().Quote,
                   "topic_key": _lda_keys
                   }))

Supplying a user-defined data source AND iterable values to glyph methods is deprecated.

See https://github.com/bokeh/bokeh/issues/2056 for more information.

  warn(message)
Supplying a user-defined data source AND iterable values to glyph methods is deprecated.

See https://github.com/bokeh/bokeh/issues/2056 for more information.

  warn(message)
Supplying a user-defined data source AND iterable values to glyph methods is deprecated.

See https://github.com/bokeh/bokeh/issues/2056 for more information.

  warn(message)
Supplying a user-defined data source AND iterable values to glyph methods is deprecated.

See https://github.com/bokeh/bokeh/issues/2056 for more information.

  warn(message)


In [673]:
from IPython.display import HTML

# randomly choose a news (within a topic) coordinate as the crucial words coordinate
topic_coord = np.empty((X_topics.shape[1], 2)) * np.nan
for topic_num in _lda_keys:
    if not np.isnan(topic_coord).any():
        break
    topic_coord[topic_num] = tsne_lda[_lda_keys.index(topic_num)]

# plot crucial words
for i in range(X_topics.shape[1]):
    plot_lda.text(topic_coord[i, 0], topic_coord[i, 1], [topic_summaries[i]])

# hover tools
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "topic: @topic_key\n@content"}

save(plot_lda, './quotes.html')
HTML(filename="./quotes.html")

