In [26]:
from nltk import word_tokenize, sent_tokenize
from sklearn.datasets import fetch_20newsgroups

In [27]:
sport_categories = ['rec.sport.baseball', 'rec.sport.hockey']
data_train = fetch_20newsgroups(categories=sport_categories, subset='train', shuffle=True, random_state=42)
data_test = fetch_20newsgroups(categories=sport_categories, subset='test', shuffle=True, random_state=42)

This way we downloaded a sample from the 20 news groups dataset as a `sklearn.utils.Bunch` object.

In [28]:
type(data_train)

sklearn.utils.Bunch

Let's do a quick peek into the data, i.e., check out the contents of the first train document and its corresponding label.

In [29]:
print(data_train.data[0], data_train.target[0])

From: dougb@comm.mot.com (Doug Bank)
Subject: Re: Info needed for Cleveland tickets
Reply-To: dougb@ecs.comm.mot.com
Organization: Motorola Land Mobile Products Sector
Distribution: usa
Nntp-Posting-Host: 145.1.146.35
Lines: 17

In article <1993Apr1.234031.4950@leland.Stanford.EDU>, bohnert@leland.Stanford.EDU (matthew bohnert) writes:

|> I'm going to be in Cleveland Thursday, April 15 to Sunday, April 18.
|> Does anybody know if the Tribe will be in town on those dates, and
|> if so, who're they playing and if tickets are available?

The tribe will be in town from April 16 to the 19th.
There are ALWAYS tickets available! (Though they are playing Toronto,
and many Toronto fans make the trip to Cleveland as it is easier to
get tickets in Cleveland than in Toronto.  Either way, I seriously
doubt they will sell out until the end of the season.)

-- 
Doug Bank                       Private Systems Division
dougb@ecs.comm.mot.com          Motorola Communications Sector
dougb@nwu.edu       

In [30]:
sent_text = sent_tokenize(data_train.data[0])

In [31]:
print(sent_text[1])

|> Does anybody know if the Tribe will be in town on those dates, and
|> if so, who're they playing and if tickets are available?


In [32]:
print(word_tokenize(sent_text[1]))

['|', '>', 'Does', 'anybody', 'know', 'if', 'the', 'Tribe', 'will', 'be', 'in', 'town', 'on', 'those', 'dates', ',', 'and', '|', '>', 'if', 'so', ',', 'who', "'re", 'they', 'playing', 'and', 'if', 'tickets', 'are', 'available', '?']


## Exercise

Figure out what are the top-10 most frequent words for the two document classes.

## Creating the term-document matrix

We will use `sklearn.feature_extraction.text.{CountVectorizer,TfidfTransformer}` classes to create a term-document matrix representation of the text.

In [33]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

get_tokens = lambda docs: [' '.join([t for sent in sent_tokenize(d) for t in word_tokenize(sent)]) for d in docs]

In [47]:
vectorizer = CountVectorizer()

train_unigram = vectorizer.fit_transform(get_tokens(data_train.data))
# do not use fit_transform once more like that:
# test_unigram_bad = vectorizer.fit_transform(get_tokens(data_test.data))
# instead let's use just the transform method
test_unigram = vectorizer.transform(get_tokens(data_test.data))

In [48]:
print(type(train_unigram), train_unigram.shape, test_unigram.shape)

<class 'scipy.sparse.csr.csr_matrix'> (1197, 18569) (796, 18569)


In [49]:
vectorizer.get_feature_names()[0:10]

['00',
 '000',
 '000256',
 '000th',
 '0010',
 '001211',
 '001323',
 '002',
 '002251w',
 '0023']

In [50]:
train_unigram.toarray()[0:3,0:10]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 2, 0]], dtype=int64)

In [52]:
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=3)
bigram_vectorizer_memory_inefficient = CountVectorizer(ngram_range=(1, 2), min_df=1)

## Exercise

Let us repeat the same vectorization but this time use consecutive n-grams as well.
In order to compansate for the increased amount of features, only keep ngrams that are present in at least 3 documents.

1. How does the shape of the term-document matrix change?
1. What are the first 10 features this time?

In [53]:
# Place your code here
train_bigram_df_ge_3 = bigram_vectorizer.fit_transform(get_tokens(data_train.data))
train_bigram = bigram_vectorizer_memory_inefficient.fit_transform(get_tokens(data_train.data))
print('%i vs %i' % train_bigram_df_ge_3.shape, train_bigram.shape)

1197 vs 27960 (1197, 148870)


## Let's improve upon simple counts

In [54]:
transformer = TfidfTransformer()

In [59]:
tfidf_train = transformer.fit_transform(train_unigram.toarray())
tfidf_test = transformer.transform(test_unigram.toarray())

In [60]:
tfidf_train[0:3,0:10].todense()

matrix([[0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.23571696, 0.        ]])

In [61]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=10)

In [62]:
print("The tfidf matrix has shape %i x %i." % tfidf_train.shape)
print("There are %i many word forms distinguished." % len(vectorizer.get_feature_names()))

The tfidf matrix has shape 1197 x 18569.
There are 18569 many word forms distinguished.


In [63]:
tfidf_train_reduced = svd.fit_transform(tfidf_train.T)
tfidf_train_reduced.shape

word_to_id = {word:i for i, word in enumerate(vectorizer.get_feature_names())}

In [64]:
query_words = ['stick', 'puck', 'homerun']
query_word_ids = [word_to_id[w] for w in query_words]
query_vecs = tfidf_train_reduced[query_word_ids]

# Exercise

1. Write a method which calculates the cosine similarity between two vectors.
1. Write a method which given a matrix of vectors and a given vector returns the index of the most similar vector from the matrix according to the cosine similarity. (Do not feel bad about writing a non-optimized code for the moment.)

In [22]:
def calc_cosine(u, v):
    pass

In [23]:
def calc_most_similar(X, i):
    pass

In [24]:
for u_word, u_vec in zip(query_words, query_vecs):
    for v_word, v_vec in zip(query_words, query_vecs):
        print(u_word, v_word, calc_cosine(u_vec, v_vec))
        

stick stick None
stick puck None
stick homerun None
puck stick None
puck puck None
puck homerun None
homerun stick None
homerun puck None
homerun homerun None
