# Unsupervised Natural Language Processing

In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
# download corpora required by this notebook
import nltk
from nltk.corpus import gutenberg
nltk.download('punkt')
nltk.download('gutenberg')
import re
from sklearn.model_selection import train_test_split

#reading in the data, this time in the form of paragraphs
emma=gutenberg.paras('austen-emma.txt')
#processing
emma_paras=[]
for paragraph in emma:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    emma_paras.append(' '.join(para))

print(emma_paras[0:4])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Amichai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\Amichai\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


['[ Emma by Jane Austen 1816 ]', 'VOLUME I', 'CHAPTER I', 'Emma Woodhouse , handsome , clever , and rich , with a comfortable home and happy disposition , seemed to unite some of the best blessings of existence ; and had lived nearly twenty - one years in the world with very little to distress or vex her .']


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test = train_test_split(emma_paras, test_size=0.4, random_state=0)

vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )


#Applying the vectorizer
emma_paras_tfidf=vectorizer.fit_transform(emma_paras)
print("Number of features: %d" % emma_paras_tfidf.get_shape()[1])

#splitting into training and test sets
X_train_tfidf, X_test_tfidf= train_test_split(emma_paras_tfidf, test_size=0.4, random_state=0)


#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = X_train_tfidf.tocsr()

#number of paragraphs
n = X_train_tfidf_csr.shape[0]
#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]
#List of features
terms = vectorizer.get_feature_names()
#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j]

#Keep in mind that the log base 2 of 1 is 0, so a tf-idf score of 0 indicates that the word was present once in that sentence.
print('Original sentence:', X_train[5])
print('Tf_idf vector:', tfidf_bypara[5])

Number of features: 1948
Original sentence: A very few minutes more , however , completed the present trial .
Tf_idf vector: {'minutes': 0.7127450310382584, 'present': 0.701423210857947}


In [4]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
svd= TruncatedSVD(130)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
paras_by_component=pd.DataFrame(X_train_lsa,index=X_train)
for i in range(5):
    print('Component {}:'.format(i))
    print(paras_by_component.loc[:,i].sort_values(ascending=False)[0:10])


Percent variance captured by all components: 45.18075106698598
Component 0:
" Oh !    0.999289
" Oh !    0.999289
Oh !      0.999289
" Oh !    0.999289
" Oh !    0.999289
" Oh !    0.999289
" Oh !    0.999289
" Oh !    0.999289
" Oh !    0.999289
" Oh !    0.999289
Name: 0, dtype: float64
Component 1:
That is _court_ .                                                                                                                                                           0.646419
" You have made her too tall , Emma ," said Mr . Knightley .                                                                                                                0.633557
" You get upon delicate subjects , Emma ," said Mrs . Weston smiling ; " remember that I am here . Mr .                                                                     0.573900
" I do not know what your opinion may be , Mrs . Weston ," said Mr . Knightley , " of this great intimacy between Emma and Harriet Smith , but I think it 

## Drill 0: Test set
Apply our LSA model to the test set.  Does it identify similar sentences for components 0 through 4? 

In [5]:
# Run SVD on the testing data, then project the test data.
X_test_lsa = lsa.fit_transform(X_test_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
paras_by_component=pd.DataFrame(X_test_lsa,index=X_test)
for i in range(5):
    print('Component {}:'.format(i))
    print(paras_by_component.loc[:,i].sort_values(ascending=False)[0:10])


Percent variance captured by all components: 49.3823373292579
Component 0:
" Oh !    0.999922
" Oh !    0.999922
" Oh !    0.999922
" Oh !    0.999922
" Oh !    0.999922
" Oh !    0.999922
" Oh !    0.999922
" Oh !    0.999922
" Oh !    0.999922
" Oh !    0.999922
Name: 0, dtype: float64
Component 1:
" Well , Mrs . Weston ," said Emma triumphantly when he left them , " what do you say now to Mr . Knightley ' s marrying Jane Fairfax ?"                                                                                                                                                                                                                                                                                                             0.615158
After tea , Mr . and Mrs . Weston , and Mr . Elton sat down with Mr . Woodhouse to cards .                                                                                                                                                                  

## Drill 1: Tweaking tf-idf

Go back up to the code where we originally translated the text from words to numbers.  There are a lot of decision-points here, from the stop list to the thresholds for inclusion and exclusion, and many others as well.  We also didn't integrate spaCy, and so don't have info on lemmas or Named Entities.  Change things up a few times and see how that affects the results of the LSA.  

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test = train_test_split(emma_paras, test_size=0.4, random_state=0)

for max_df in [0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    for min_df in [2, 3, 4, 5, 6, 7]:
        vectorizer = TfidfVectorizer(max_df=max_df, # drop words that occur in more than half the paragraphs
                                     min_df=min_df, # only use words that appear at least twice
                                     stop_words='english', 
                                     lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                                     use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                                     norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                                     smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                                    )


        #Applying the vectorizer
        emma_paras_tfidf=vectorizer.fit_transform(emma_paras)
        print("Number of features: %d" % emma_paras_tfidf.get_shape()[1])

        #splitting into training and test sets
        X_train_tfidf, X_test_tfidf= train_test_split(emma_paras_tfidf, test_size=0.4, random_state=0)


        #Reshapes the vectorizer output into something people can read
        X_train_tfidf_csr = X_train_tfidf.tocsr()

        #number of paragraphs
        n = X_train_tfidf_csr.shape[0]
        #A list of dictionaries, one per paragraph
        tfidf_bypara = [{} for _ in range(0,n)]
        #List of features
        terms = vectorizer.get_feature_names()
        #for each paragraph, lists the feature words and their tf-idf scores
        for i, j in zip(*X_train_tfidf_csr.nonzero()):
            tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j]

        #Keep in mind that the log base 2 of 1 is 0, so a tf-idf score of 0 indicates that the word was present once in that sentence.
        #print('Original sentence:', X_train[5])
        #print('Tf_idf vector:', tfidf_bypara[5])


        #Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
        svd= TruncatedSVD(130)
        lsa = make_pipeline(svd, Normalizer(copy=False))
        # Run SVD on the training data, then project the training data.
        X_train_lsa = lsa.fit_transform(X_train_tfidf)

        variance_explained=svd.explained_variance_ratio_
        total_variance = variance_explained.sum()
        print("For max_df of {} and min_df of {}:".format(max_df, min_df))
        print("Percent variance captured by all components:",total_variance*100)
        print('\n')

Number of features: 1948
For max_df of 0.4 and min_df of 2:
Percent variance captured by all components: 45.21140827545456


Number of features: 1358
For max_df of 0.4 and min_df of 3:
Percent variance captured by all components: 48.45894660916231


Number of features: 1020
For max_df of 0.4 and min_df of 4:
Percent variance captured by all components: 52.405417184824685


Number of features: 799
For max_df of 0.4 and min_df of 5:
Percent variance captured by all components: 55.86226974301525


Number of features: 670
For max_df of 0.4 and min_df of 6:
Percent variance captured by all components: 58.318501992214024


Number of features: 560
For max_df of 0.4 and min_df of 7:
Percent variance captured by all components: 61.325896234163665


Number of features: 1948
For max_df of 0.5 and min_df of 2:
Percent variance captured by all components: 45.19309340783598


Number of features: 1358
For max_df of 0.5 and min_df of 3:
Percent variance captured by all components: 48.45089868588366




Using LSA, if we set the min_df value to 7 (i.e. if we only consider terms that appear at least 7 times), we can yield an explanation of 61% of the variance in the training set data. 