### Import Dependencies

In [377]:
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [378]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\camro\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Import Data

In [379]:
from sklearn.datasets import fetch_20newsgroups
categories = ['talk.politics.mideast']
dataset = fetch_20newsgroups(subset='all', shuffle=True, random_state=42, categories=categories)
corpus = dataset.data

### Define Stopwords

In [380]:
stopset = set(stopwords.words('english'))
stopset.update(['From', 'Subject', 'Re', 'Lines', 'In-reply-to', 'Organization', 'NNTP-Posting-Host', '\n', 'GMT', 'writes', 'wrote', 'edu', 'com', 'ed', 'professor', '00', '150', 'bony', '000', 'also', 'said', 'university', 'history', 'cs', 'bony1', 'like', 'org', 'could', 'would', 'say', 'nntp', 'posting', 'host', 'might', 'subject', 'uucp', 'organization', '>', '000246', '0000', '11186', '000413', '25123', 'reply', '00081100', 'ysub', 'ysu', '000th', '0006', '003336', '10198', '002811', '22496', '002118', '24102'])

### TF-IDF Vectorizing

In [381]:
# Example document before metadata removal and vectorizing
corpus[0]

'From: amoss@shuldig.cs.huji.ac.il (Amos Shapira)\nSubject: Re: Final Solution in Palestine ?\nOrganization: Inst. of Comp. Sci., Hebrew University, Jerusalem, Israel\nLines: 30\nNNTP-Posting-Host: shuldig.cs.huji.ac.il\nIn-reply-to: ahmeda@McRCIM.McGill.EDU\'s message of Sun, 25 Apr 93 17:10:03 GMT\n\nahmeda@McRCIM.McGill.EDU (Ahmed Abu-Abed) writes:\n\n|What Hamas and Islamic Jihad believe in, as far as I can get from the Arab\n|media,\n|is an Islamic state that protects the rights of all its inhabitants under\n|Koranic\n|Law. This would be a reversal of the 1948 situation in which the Jews in\n|Palestine took control of the land and its (mostly Muslim) inhabitants.\n\nThe borders of the Jewish state as drawn by the U.N. included the areas which\ncontained mostly Jews,  that\'s what the surveys and the numerous commitees\nwhere after when they visited here.\n\n|However, whoever committed crimes against humanity (torture, blowing up their\n|homes, murders,...) must be treated and trie

In [382]:
# Remove metadata by locating the index of the last instance of 'writes:' and slicing from the beginning of the doc to this index
corpusWithoutMetadata = []
target = 'writes:'
for doc in corpus:
    targetIdx = doc.rfind(target)
    if targetIdx >= 0:
        corpusWithoutMetadata.append(doc[targetIdx + 7:])
    else:
        corpusWithoutMetadata.append(doc)

In [383]:
# Example document after metadata removal
corpusWithoutMetadata[0]

'\n\n|What Hamas and Islamic Jihad believe in, as far as I can get from the Arab\n|media,\n|is an Islamic state that protects the rights of all its inhabitants under\n|Koranic\n|Law. This would be a reversal of the 1948 situation in which the Jews in\n|Palestine took control of the land and its (mostly Muslim) inhabitants.\n\nThe borders of the Jewish state as drawn by the U.N. included the areas which\ncontained mostly Jews,  that\'s what the surveys and the numerous commitees\nwhere after when they visited here.\n\n|However, whoever committed crimes against humanity (torture, blowing up their\n|homes, murders,...) must be treated and tried as a war criminal. The political\n|thought of these movements shows that a freedom of choice will be given to the\n|Jews in living under the new law or leaving to the destintion of their choice.\n\nI never touched an Arab during my army service and never voted for anyone more\nright than the Green party.  Will I be spared by these "humanist standar

In [384]:
# Initialize vectorizer
vectorizer = TfidfVectorizer(stop_words=stopset, use_idf=True, ngram_range=(1,3))

In [385]:
# Populate maxtrix
X = vectorizer.fit_transform(corpus)

In [386]:
# Transformed document after TF-IDF
X[0]

<1x319861 sparse matrix of type '<class 'numpy.float64'>'
	with 430 stored elements in Compressed Sparse Row format>

In [387]:
# Display term index and associated TF-IDF score
# Only terms with a TF-IDF score > 0 are stored
print(X[0])

  (0, 142193)	0.0440712732703
  (0, 14116)	0.0440712732703
  (0, 23494)	0.0440712732703
  (0, 151018)	0.0440712732703
  (0, 12415)	0.0440712732703
  (0, 154736)	0.0440712732703
  (0, 47349)	0.0440712732703
  (0, 224150)	0.0440712732703
  (0, 13856)	0.0440712732703
  (0, 134580)	0.0440712732703
  (0, 128766)	0.0440712732703
  (0, 278513)	0.0440712732703
  (0, 75117)	0.0440712732703
  (0, 224168)	0.0440712732703
  (0, 293000)	0.0440712732703
  (0, 106080)	0.0440712732703
  (0, 158111)	0.0440712732703
  (0, 258153)	0.043171256467
  (0, 23487)	0.043171256467
  (0, 81516)	0.0546643795893
  (0, 60547)	0.0546643795893
  (0, 295827)	0.0546643795893
  (0, 261839)	0.0546643795893
  (0, 62669)	0.0546643795893
  (0, 313512)	0.0546643795893
  :	:
  (0, 27797)	0.0263815801851
  (0, 8065)	0.0262878205919
  (0, 276382)	0.039795075007
  (0, 185207)	0.0261036837065
  (0, 182718)	0.0592453951354
  (0, 182839)	0.0643300113931
  (0, 19621)	0.0847323280043
  (0, 8965)	0.0254085400591
  (0, 171836)	0.0076901

### Latent Semantic Analysis
Matrix Decomponsition:

$$X \approx USV^{T}$$

**X**: m x n matrix, where m = # documents, n = # terms, k = # concepts

**U**: m x k matrix, where row (m) = documents and col (k) = concepts

**S**: k x k diagonal matix, which shows amt of variation captured from each concept

**V**: n x k matrix, where row (n) = terms and col (k) = concepts

In [388]:
# Number of documents by number of terms
# Term count includes bigrams and trigrams in addition to singular terms
X.shape

(940, 319861)

In [389]:
# Instantiate an instance of TruncatedSVD
# TruncatedSVD is engine used to perform matrix decomposition
lsa = TruncatedSVD(n_components=12, n_iter=100, random_state=333)

In [390]:
# Decompose maxtrix X into matrices U, S, and V
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=12, n_iter=100,
       random_state=333, tol=0.0)

In [391]:
# First row of V
# V(row) = term, V(col) = concept
# Each value represents the importance of the term to the concept
lsa.components_[0]

array([ 0.00024147,  0.00024147,  0.00024147, ...,  0.00134017,
        0.00134017,  0.00134017])

In [392]:
# For each concept return the top 12 most significant terms sorted by significance in descending order
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_): 
    termsInComp = zip (terms,comp)
    sortedTerms = sorted(termsInComp, key=lambda x: x[1], reverse=True) [:12]
    print("Concept %d:" % i )
    for term in sortedTerms:
        print(term[0])
    print (" ")

Concept 0:
armenian
armenians
turkish
israel
people
jews
israeli
serdar
armenia
one
argic
serdar argic
 
Concept 1:
armenian
turkish
istanbul
005225
turkey
many
ankara
genocide
new
russian
soviet armenia
people
 
Concept 2:
istanbul
ankara
osmanli
ermeni
foreign office
new york
office
york
mecmuasi
nezareti
umumiye
1983
 
Concept 3:
russian
armenians
genocide
world
government
ottoman
van
israel
fo
war
jews
state
 
Concept 4:
government
hojali
serdar
azerbaijan
israel
dead
jews
turkish
genocide
soviet armenia
new
azeri
 
Concept 5:
turkish
tartar
first
government
paragraph
turkey
zuma
005019 10716 midway
greek
even
rights
sdpa
 
Concept 6:
israeli
world
005019 10716
tartars
know
armenians
home
one
tartar
today
think
israel
 
Concept 7:
005225
first
israeli
yalanci
paragraph
hojali
fact
dead
005019 10716 midway
even
turkey
armenia
 
Concept 8:
one
005225 8231
turkish
article
jews
war
armenia
ottoman
jews latvia
latvia
armenian
russian
 
Concept 9:
israel
jews
soviet
armenian
004917 3047 