# Introduction to Text Mining Part 2 - Exercises with Answers

## Exercise 1

#### Task 1 
##### Load the libraries that are used in this module.

#### Result:

In [None]:
# Helper packages.
import os
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt

# Packages with tools for text processing.
import nltk
from wordcloud import WordCloud

# Packages for working with text data.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Packages for getting data ready for and building a LDA model
import gensim
from gensim import corpora, models
from pprint import pprint

#### Task 2 
##### Set `main_dir` to the location of your `booz-allen-hamilton` folder.
##### Build `data_dir` by appending the rest of the path to the data directory onto `main_dir`.

#### Result:

In [None]:
from pathlib import Path

# `home_dir` is the current user's home directory.
home_dir = Path.home()

# `main_dir` is the `booz-allen-hamilton` folder, expected on the Desktop.
main_dir = home_dir.joinpath("Desktop", "booz-allen-hamilton")

# `data_dir` is the `data` folder inside `main_dir`.
data_dir = main_dir.joinpath("data")

#### Task 3 
##### Set the working directory to `data_dir`.
##### Check if the working directory is updated to `data_dir`.

#### Result:

In [None]:
# Change the working directory to `data_dir`, so the relative file names
# used when loading and saving pickles resolve inside that folder.
os.chdir(data_dir)

# Check the working directory: print it to confirm the change took effect.
print(os.getcwd())

#### Task 4 
##### Load the pickled files from the previous exercises: `ex_titles_clean.sav`, `ex_corpus_freq_dist.sav` and `ex_titles_clean_list.sav`.
##### Save them as `processed_docs_ex`, `ex_corpus_freq_dist` and `titles_clean_list`.

#### Result:

In [None]:
# Load the objects pickled in the previous exercises.
# NOTE: unpickling can execute arbitrary code, so only load files that you
# created yourself or that come from a trusted source.
# Each `with` block closes its file handle as soon as the object is loaded.
with open("ex_titles_clean.sav", "rb") as pickle_file:
    processed_docs_ex = pickle.load(pickle_file)

with open("ex_corpus_freq_dist.sav", "rb") as pickle_file:
    ex_corpus_freq_dist = pickle.load(pickle_file)

with open("ex_titles_clean_list.sav", "rb") as pickle_file:
    titles_clean_list = pickle.load(pickle_file)

#### Task 5
##### Plot the frequency distribution of words in `ex_corpus_freq_dist`.

#### Result:

In [None]:
# Wrap the loaded counts in nltk's FreqDist so its plotting helper is available.
ex_corpus_freq_dist = nltk.FreqDist(ex_corpus_freq_dist)

# Open a wide figure, then plot the 80 most frequent words of the corpus.
plt.figure(figsize=(16, 7))
ex_corpus_freq_dist.plot(80)

#### Task 6
##### Create a word cloud of the entire corpus and name it `ex_wordcloud`.
##### Plot the word cloud and set `figsize` to `(14, 7)`.

#### Result:

In [None]:
# Build the word cloud from the whole corpus joined into one string.
ex_wordcloud = WordCloud(
    max_font_size=40,
    background_color="white",
    collocations=False,
).generate(" ".join(titles_clean_list))

# Render the cloud with matplotlib, hiding the axes.
plt.figure(figsize=(14, 7))
plt.imshow(ex_wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

## Exercise 2

#### Task 1
##### Check the length of `processed_docs_ex` using `len()` function.
##### Create a dictionary from `processed_docs_ex` object, using `gensim.corpora.Dictionary` function.
##### Label the dictionary `dictionary_ex`.
##### Then loop through the dictionary printing out the first 10 items, including key and value.
##### Make sure to set the seed as `2` for exercises.
##### Use `.filter_extremes()` to filter items. Set `no_below` as `5`, `no_above` as `0.5` and `keep_n` as `942`.

#### Hint:

#####  To check the length, use `len(processed_docs_ex)`.

#### Result:

In [None]:
# Number of entries in `processed_docs_ex`.
len(processed_docs_ex)

In [None]:
from itertools import islice

# Set the seed.
np.random.seed(2)

# Build the gensim dictionary (integer id <-> word mapping) from the documents.
dictionary_ex = gensim.corpora.Dictionary(processed_docs_ex)

# Print exactly the first 10 (key, value) pairs of the dictionary.
# `islice` stops after 10 items, so no manual counter is needed.
for k, v in islice(dictionary_ex.items(), 10):
    print(k, v)

# Key is the integer id that gensim assigned to the word.
# Value stands for the actual word.

In [None]:
# Number of entries in `dictionary_ex`.
len(dictionary_ex)

In [None]:
# Prune the dictionary in place: drop words that appear in fewer than 5 documents
# (`no_below`) or in more than 50% of the documents (`no_above`), then keep at
# most the 942 most frequent remaining words (`keep_n`).
dictionary_ex.filter_extremes(no_below = 5, no_above = 0.5, keep_n = 942)

#### Task 2
##### Write a list comprehension that transforms each doc within `processed_docs_ex` into its bag-of-words representation using `dictionary_ex.doc2bow()`.
##### Save this object, the output of the list comprehension, as `bow_corpus_ex`.
##### What type of object is this?

#### Result:

In [None]:
# Convert every document into its bag-of-words form using the dictionary.
bow_corpus_ex = [
    dictionary_ex.doc2bow(tokens)
    for tokens in processed_docs_ex
]

# Inspect the type of the resulting object.
type(bow_corpus_ex)

#### Task 3
##### Transform `bow_corpus_ex` to a TF-IDF transformed object using `TfidfModel()`.
##### Name the object as `corpus_tfidf_ex`.
##### Preview the scores for the first document in `corpus_tfidf_ex` using `pprint`.

#### Result:

In [None]:
# Build the TF-IDF transformation from the bag-of-words corpus.
tfidf = models.TfidfModel(bow_corpus_ex)

# Apply the transformation to the entire corpus.
corpus_tfidf_ex = tfidf[bow_corpus_ex]

# Preview TF-IDF scores for the first document only
# (nothing is printed if the corpus is empty).
first_doc = next(iter(corpus_tfidf_ex), None)
if first_doc is not None:
    pprint(first_doc)

#### Task 4
##### Pickle `bow_corpus_ex`, `corpus_tfidf_ex` and `dictionary_ex` for the next session.

#### Result:

In [None]:
# Pickle the objects for the next session.
# Each `with` block flushes and closes its file as soon as the dump finishes,
# so the data is reliably on disk before the next statement runs.
with open('bow_corpus_ex.sav', 'wb') as pickle_file:
    pickle.dump(bow_corpus_ex, pickle_file)

with open('corpus_tfidf_ex.sav', 'wb') as pickle_file:
    pickle.dump(corpus_tfidf_ex, pickle_file)

with open('dictionary_ex.sav', 'wb') as pickle_file:
    pickle.dump(dictionary_ex, pickle_file)