### Import Dependencies

In [7]:
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\camro\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Import Data

In [9]:
from sklearn.datasets import fetch_20newsgroups
categories = ['talk.politics.mideast']
dataset = fetch_20newsgroups(subset='all', shuffle=True, random_state=42, categories=categories)
corpus = dataset.data

### Define Stopwords

In [13]:
stopset = set(stopwords.words('english'))
#stopset.update(['From:', 'Subject:', 'Re:', 'Lines:', 'In-reply-to:', 'Organization:', 'NNTP-Posting-Host:', '\n', 'GMT', 'writes:', 'wrote:'])

### TF-IDF Vectorizing

In [43]:
# Example document before vectorizing
corpus

['From: amoss@shuldig.cs.huji.ac.il (Amos Shapira)\nSubject: Re: Final Solution in Palestine ?\nOrganization: Inst. of Comp. Sci., Hebrew University, Jerusalem, Israel\nLines: 30\nNNTP-Posting-Host: shuldig.cs.huji.ac.il\nIn-reply-to: ahmeda@McRCIM.McGill.EDU\'s message of Sun, 25 Apr 93 17:10:03 GMT\n\nahmeda@McRCIM.McGill.EDU (Ahmed Abu-Abed) writes:\n\n|What Hamas and Islamic Jihad believe in, as far as I can get from the Arab\n|media,\n|is an Islamic state that protects the rights of all its inhabitants under\n|Koranic\n|Law. This would be a reversal of the 1948 situation in which the Jews in\n|Palestine took control of the land and its (mostly Muslim) inhabitants.\n\nThe borders of the Jewish state as drawn by the U.N. included the areas which\ncontained mostly Jews,  that\'s what the surveys and the numerous commitees\nwhere after when they visited here.\n\n|However, whoever committed crimes against humanity (torture, blowing up their\n|homes, murders,...) must be treated and tri

In [15]:
# Initialize vectorizer
vectorizer = TfidfVectorizer(stop_words=stopset, use_idf=True, ngram_range=(1,3),)

In [16]:
# Populate maxtrix
X = vectorizer.fit_transform(corpus)

In [17]:
# Transformed document after TF-IDF
X[0]

<1x327287 sparse matrix of type '<class 'numpy.float64'>'
	with 464 stored elements in Compressed Sparse Row format>

In [23]:
# Display term index and associated TF-IDF score
# Only terms with a TF-IDF score > 0 are stored
print(X[0])

  (0, 144231)	0.0425783728261
  (0, 14472)	0.0425783728261
  (0, 24471)	0.0425783728261
  (0, 152971)	0.0425783728261
  (0, 12799)	0.0425783728261
  (0, 156576)	0.0425783728261
  (0, 47915)	0.0425783728261
  (0, 227083)	0.0425783728261
  (0, 14223)	0.0425783728261
  (0, 304075)	0.0425783728261
  (0, 136286)	0.0425783728261
  (0, 130535)	0.0425783728261
  (0, 283726)	0.0425783728261
  (0, 75567)	0.0425783728261
  (0, 227101)	0.0425783728261
  (0, 298053)	0.0425783728261
  (0, 108032)	0.0425783728261
  (0, 159911)	0.0425783728261
  (0, 262917)	0.0417088438074
  (0, 24461)	0.0417088438074
  (0, 82501)	0.0528126410187
  (0, 60902)	0.0528126410187
  (0, 300869)	0.0528126410187
  (0, 266558)	0.0528126410187
  (0, 63008)	0.0528126410187
  :	:
  (0, 140614)	0.0144758240949
  (0, 226598)	0.0136611835371
  (0, 202168)	0.0145790770225
  (0, 9416)	0.0245478338003
  (0, 174416)	0.00742968672947
  (0, 152874)	0.0281145455242
  (0, 156568)	0.0511592426124
  (0, 304060)	0.0263281984064
  (0, 136246)	0

### Latent Semantic Analysis
Matrix Decomponsition:

$$X \approx USV^{T}$$

**X**: m x n matrix, where m = # documents, n = # terms, k = # concepts

**U**: m x k matrix, where row (m) = documents and col (k) = concepts

**S**: k x k diagonal matix, which shows amt of variation captured from each concept

**V**: n x k matrix, where row (n) = terms and col (k) = concepts

In [24]:
# Number of documents by number of terms
# Term count includes bigrams and trigrams in addition to singular terms
X.shape

(940, 327287)

In [26]:
# Instantiate an instance of TruncatedSVD
# TruncatedSVD is engine used to perform matrix decomposition
lsa = TruncatedSVD(n_components=940, n_iter=100, random_state=333)

In [29]:
# Decompose maxtrix X into matrices U, S, and V
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=940, n_iter=100,
       random_state=None, tol=0.0)

In [30]:
# First row of V
# V(row) = term, V(col) = concept
# Each value represents the importance of the term to the concept
lsa.components_[0]

array([ 0.00742081,  0.00014406,  0.00014406, ...,  0.00106203,
        0.00106203,  0.00106203])

In [31]:
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_): 
    termsInComp = zip (terms,comp)
    sortedTerms =  sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]
    print("Concept %d:" % i )
    for term in sortedTerms:
        print(term[0])
    print (" ")

Concept 0:
edu
armenian
israel
armenians
turkish
people
jews
israeli
jake
one
 
Concept 1:
jake
israel
edu
bony
bony com
bony1
bony1 bony
bony1 bony com
israeli
jake livni
 
Concept 2:
jake
bony
bony com
bony1
bony1 bony
bony1 bony com
jake livni
livni
jake bony1
jake bony1 bony
 
Concept 3:
uci
714
uci edu
tim
irvine
oac
oac uci
oac uci edu
orion
orion oac
 
Concept 4:
muslims
irvine
uci
muslim
serbs
prabhakar
satya
satya prabhakar
umn
714
 
Concept 5:
adam
istanbul
professor
ed
jews
ankara
professor history
university
osmanli
ermeni
 
Concept 6:
virginia
edu
virginia edu
pro
cosmo
columbia
angmar
pro angmar
columbia edu
andi
 
Concept 7:
israel
com
angmar
pro angmar
cosmo
hernlem
pro
lebanese
istanbul
lebanon
 
Concept 8:
istanbul
turkey
ankara
ed
osmanli
professor
ermeni
professor history
foreign office
history
 
Concept 9:
virginia
hernlem
lebanese
virginia edu
israeli
israel
lebanon
ncsu
brad
ncsu edu
 
Concept 10:
adam
lebanese
adam shostack
hernlem
shostack
harvard
ncsu edu
turk