## Lab 4: Natural Language Processing


#### CSC 180  Intelligent Systems (Spring 2020)

#### Dr. Haiquan Chen, California State University, Sacramento

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

## How to vectorize natural language data?

scikit-learn offers a set of tools for extracting features from text: http://scikit-learn.org/stable/modules/feature_extraction.html

### CountVectorizer: transforms text into a "sparse matrix" where each row is a document, each column is a word, and each value is the number of times that word occurs in that document.

In [19]:
import sklearn.feature_extraction.text as sk_text

# Toy corpus: four short documents that share most of their vocabulary.
corpus = [
    'This is the first document.',
    'this is the second second document.',
    'And the third one.',
    'Is this the first first first document?',
]

# min_df: terms whose document frequency is below min_df are ignored,
# so min_df=1 keeps every term that occurs in the corpus.
# Variant that additionally drops English stop words:
#   vectorizer = sk_text.CountVectorizer(min_df=1, stop_words='english')
vectorizer = sk_text.CountVectorizer(min_df=1)

# Learn the vocabulary and build the document-term count matrix in one call.
matrix = vectorizer.fit_transform(corpus)

# The result is a Compressed Sparse Row matrix ...
print(type(matrix))
# ... which toarray() converts to a dense NumPy array for display.
print(matrix.toarray())

<class 'scipy.sparse.csr.csr_matrix'>
[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 3 1 0 0 1 0 1]]


In [20]:
# List the vocabulary terms in column order of the matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# .tolist() turns the returned NumPy array into a plain list so the printed
# output keeps the same format as before.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


### TfidfVectorizer: transforms text into a "sparse matrix" where each row is a document, each column is a word, and each value is that word's tf-idf weight in that document.

More here: http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer

In [21]:
# max_features: build a vocabulary that only keeps the top max_features terms,
#               ordered by term frequency across the corpus.
# min_df:       ignore terms that have a document frequency < min_df.
# Add stop_words='english' to also drop English stop words.
vectorizer = sk_text.TfidfVectorizer(max_features=1000, min_df=1)

# Learn vocabulary + idf weights and produce the tf-idf matrix in one call.
matrix = vectorizer.fit_transform(corpus)

# The result is a Compressed Sparse Row matrix ...
print(type(matrix))
# ... which toarray() converts to a dense NumPy array for display.
print(matrix.toarray())


<class 'scipy.sparse.csr.csr_matrix'>
[[0.         0.43877674 0.54197657 0.43877674 0.         0.
  0.35872874 0.         0.43877674]
 [0.         0.27230147 0.         0.27230147 0.         0.85322574
  0.22262429 0.         0.27230147]
 [0.55280532 0.         0.         0.         0.55280532 0.
  0.28847675 0.55280532 0.        ]
 [0.         0.23973261 0.88835239 0.23973261 0.         0.
  0.19599711 0.         0.23973261]]


In [5]:
# List the vocabulary terms in column order of the tf-idf matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# .tolist() turns the returned NumPy array into a plain list so the printed
# output keeps the same format as before.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [22]:
# TF-IDF with English stop words removed and document-frequency filtering:
#   min_df: ignore terms that have a document frequency < min_df
#   max_df: ignore terms that have a document frequency > max_df
# Optional vocabulary cap (disabled here): max_features=1000
vectorizer = sk_text.TfidfVectorizer(stop_words='english', min_df=2, max_df=500)

matrix = vectorizer.fit_transform(corpus)
tfidf_data = matrix.toarray()     # dense NumPy array, reused by later cells

print(type(matrix))               # Compressed Sparse Row matrix
print(tfidf_data)

<class 'scipy.sparse.csr.csr_matrix'>
[[1.]
 [1.]
 [0.]
 [1.]]


In [23]:
# List the terms that survived the stop-word and document-frequency filters.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# .tolist() turns the returned NumPy array into a plain list so the printed
# output keeps the same format as before.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

['document']


### Once you vectorize the text, you can send the data to models

An example of what we want to do:
http://scikit-learn.org/stable/auto_examples/text/document_clustering.html

In [8]:
# Display the dense tf-idf array (the value of matrix.toarray()) as the cell output.
# NOTE(review): the saved output below is a 4x5 array, while the preceding
# TfidfVectorizer cell printed a 4x1 array for this same variable -- the saved
# outputs look stale; Restart Kernel -> Run All to refresh them.
tfidf_data

array([[0.43877674, 0.54197657, 0.43877674, 0.35872874, 0.43877674],
       [0.52210862, 0.        , 0.52210862, 0.42685801, 0.52210862],
       [0.        , 0.        , 0.        , 1.        , 0.        ],
       [0.23973261, 0.88835239, 0.23973261, 0.19599711, 0.23973261]])

In [9]:
# Shape of the dense tf-idf array: (number of documents, number of vocabulary terms).
# NOTE(review): the saved output (4, 5) disagrees with the one-column array printed
# by the stop_words='english', min_df=2 cell above; re-run the notebook to refresh it.
tfidf_data.shape

(4, 5)

## Another example:

In [10]:
# Second example corpus: five short news-style snippets.
# NOTE: this rebinds `corpus`, replacing the four-sentence toy corpus used above.
corpus = [
    "Japan's prime minister, Shinzo Abe, is working towards healing the economic turmoil in his own country for his view on the future of his people.",
    "Vladimir Putin is working hard to fix the economy in Russia as the Ruble has tumbled.",
    "What's the future of Abenomics? We asked Shinzo Abe for his views",
    "Obama has eased sanctions on Cuba while accelerating those against the Russian Economy, even as the Ruble's value falls almost daily.",
    "Vladimir Putin is riding a horse while hunting deer. Vladimir Putin always seems so serious about things - even riding horses. Is he crazy?",
]

# Keep the individual documents addressable by name as well.
document_0, document_1, document_2, document_3, document_4 = corpus

In [27]:
# TF-IDF over the current `corpus`, with English stop words removed.
#   max_features: keep at most this many terms, ordered by term frequency across the corpus
#   min_df: ignore terms that have a document frequency < min_df
#   max_df: ignore terms that have a document frequency > max_df (e.g. max_df=5; disabled here)
vectorizer = sk_text.TfidfVectorizer(stop_words='english',
                                     max_features=100,
                                     min_df=1)

matrix = vectorizer.fit_transform(corpus)

tfidf_data = matrix.toarray()     # convert it to a dense NumPy array
#print(tfidf_data)
#print(tfidf_data.shape)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# .tolist() turns the returned NumPy array into a plain list so the printed
# output keeps the same format as before.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
# NOTE(review): the saved output below (['document', 'second']) contains words that
# do not occur in document_0..document_4, so it appears to come from a run against
# the earlier toy corpus; Restart Kernel -> Run All to refresh it.
print(feature_names)
#print(len(feature_names))

['document', 'second']
