In [1]:
#meta 1/10/2024 NLP for UDS. Part 1. Data Prep
# Text Analytics. Trends and topic modeling for UDS
#      Data: How did you...
#      Task: Explore data before modeling

#      input: data/xx_howdidu_tidy.csv
      

#started from nlp_2_model.ipynb 
#need to explore data before deciding how to prep data for ML


#history
#1/10/2024 EXPLORE DATA  $ac


#Pipeline: nlp_0_data -> nlp_1_dataprep (here) -> nlp_2_model 

#$config $manual

In [2]:
import sys
import os
import pandas as pd
pd.set_option('display.max_colwidth', 50)
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import csv
#from joblib import load, dump


In [3]:
# dataprep - tokenization
from sklearn.feature_extraction import text
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer #transformer to tokenize dataset, aka bag-of-words activity
from sklearn.feature_extraction.text import TfidfVectorizer #rescale features by how informative they are

# from sklearn.decomposition import LatentDirichletAllocation

# Import the wordcloud library
from wordcloud import WordCloud

# #plotly
# import plotly.express as px

ModuleNotFoundError: No module named 'wordcloud'

In [None]:
print(sys.version)
np.__version__, pd.__version__

Global vars and functions

In [None]:
# Input location: every path below is derived from DATA_DIR
DATA_DIR = 'data'
DATA_FILE_IN = f"{DATA_DIR}/howdidu_tidy.csv"  #$config

# Switches
# When True, the load cell keeps only the first 20 rows (quick sanity check of small counts)
FLAG_SUBSET = False  #$config

# NLP: Text Analytics 
Data preparation for Text Analytics 

## 0. Load Tidy Data

In [None]:
# Read the tidy CSV produced by the previous pipeline step
df_tidy = pd.read_csv(DATA_FILE_IN)

# Optional small sample for quick checks  #$temp
if FLAG_SUBSET:
    df_tidy = df_tidy.head(20).copy()

# Dimensions and column names, then a preview of the first rows
for summary in (df_tidy.shape, df_tidy.columns):
    print(summary)
df_tidy.head()

In [None]:
# Show full cell text while eyeballing matching rows
pd.set_option('display.max_colwidth', None)
# na=False: rows with a missing description count as "no match" instead of putting NaN
# into the mask (which makes the boolean indexing raise).
# The pattern is treated as a regex (pandas default), hence the escaped '\$' below.
df_tidy[df_tidy['Contact rpt. description'].str.contains('Passcode', na=False)] #other strings: '12532050468', '\'', '\$', 'donat'

In [None]:
# Show full cell text while eyeballing matching rows
pd.set_option('display.max_colwidth', None)
# na=False: rows with a missing description count as "no match" instead of putting NaN
# into the mask (which makes the boolean indexing raise).
df_tidy[df_tidy['Contact rpt. description'].str.contains('ukrainedefensesupport', na=False)]

## 1. Prep Data for ML
Use `Contact rpt. description` for training

In [None]:
# Back to truncated cell display after the full-width inspection cells above
pd.set_option('display.max_colwidth', 50)

# feature engineering - concatenate text columns? here, no: a single source column is used.
# The cleaned text is always rebuilt from the source column, so re-running this cell is safe.
# Raw string literals keep the regex escapes (\s) intact without invalid-escape warnings.
df_tidy['all_text'] = (
    df_tidy['Contact rpt. description']
    .str.replace(r'\s', ' ', regex=True)            # any whitespace char ([ \t\r\n\f]...) -> plain space
    .str.lower()
    #$manual rules (straight and curly apostrophes are both handled)
    .str.replace(r"[’']s\s", ' ', regex=True)       # drop possessive/contracted 's when followed by whitespace
    .str.replace(r"i[’']m\s", 'im ', regex=True)    # i'm -> im
    .str.replace(r"y[’']all", 'you all', regex=True)
    .str.replace('n/a', '__na__', regex=False)      # literal replace: keeps n/a as one token
)

print(df_tidy.shape)
df_tidy.tail()

### 1.1 Prepare for Text Analytics 

Assemble Data into ML Expected Format.  
Scikit-learn expects a Numpy array-like structure. Transform to a structure acceptable by algorithm: 
- input features X(matrix) aka `train set`  
- target variable y(vector) 

Here:  
- X - column 'all_text'  = `train set`  
- y - n/a

In [None]:
# Training input for the vectorizers: the cleaned text column, all rows
text_train = df_tidy.loc[:, 'all_text'] #.iloc[:,0]
text_train.shape

In [None]:
text_train

### 1.2 Verify Text and Counts
by comparing with `collections.Counter` and `CountVectorizer`

In [None]:
# Concatenate every document into one corpus-wide string, separated by single spaces
long_string = ' '.join(text_train.tolist())
# total length in characters, plus a short preview of the start
len(long_string), long_string[:20]

In [None]:
long_string

### 1.2a `Counter()`
dict subclass for counting hashable objects

In [None]:
import re
# Alternatives tried and rejected:
#   re.split(r'[,\s\.]+', long_string)   -> split on white-space: good but has problems with extra punctuation
#   re.sub(r"[^\w ]", "", long_string)   -> words followed by space only: good but has problems with missing numbers

# Keep only [letters / digits / _ / $]; every other character becomes a space.
# Raw string so that `\$` reaches the regex engine without an invalid-escape warning.
long_string_tidy = re.sub(r"[^a-zA-Z0-9_\$]", " ", long_string)

long_string_tidy

In [None]:
# Lower-case the tidy corpus, split on whitespace and tally every token
tokens = long_string_tidy.lower().split()
cc_word_counter = Counter()  #class collections.Counter
cc_word_counter.update(tokens)

# preview: number of distinct tokens, then the full tally
for label, value in (
    ("cc_word_counter: ", len(cc_word_counter)),
    ("View cc_word_counter: ", cc_word_counter),
):
    print(label, value)
# list unique elements
#list(cc_word_counter)

In [None]:
# Two ways of reporting the same pair of numbers: distinct tokens and total occurrences
summary_tmpl = "Unique tokens: {}, Total count: {}"

# dict-style access: number of keys, and Counter.total() for the sum of counts
print(summary_tmpl.format(len(cc_word_counter), cc_word_counter.total()))

# Counter-only access (convoluted): a Counter built over the count values has one entry
# per token, so its total() equals the number of distinct tokens
n_unique = Counter(cc_word_counter.values()).total()
n_total = Counter(dict(cc_word_counter)).total()
print(summary_tmpl.format(n_unique, n_total))


In [None]:
# Same confirmation as above, restricted to the N most frequent tokens
N_MOST = 10
cc_word_counter_10 = cc_word_counter.most_common(N_MOST) #class list of (token, count) tuples
print(cc_word_counter_10)
print("\n")

top_counts = dict(cc_word_counter_10)
report_tmpl = "Unique tokens: {}, Total count: {}"
print(report_tmpl.format(len(cc_word_counter_10), sum(top_counts.values())))
print(report_tmpl.format(Counter(cc_word_counter_10).total(), Counter(top_counts).total()))

In [None]:
# N least common: reverse the frequency-ordered list and keep its first N_MOST entries
list(reversed(cc_word_counter.most_common()))[:N_MOST]

### 1.2b `CountVectorizer()`
Convert a long string to a matrix of token counts

In [None]:
# Tokens are runs of [letters / digits / _ / $], the same character set kept when building
# long_string_tidy; raw string so `\$` is passed to the regex engine unchanged.
cv = CountVectorizer(token_pattern=r'[a-zA-Z0-9_\$]+') #$config lcase lowercase=True,
# 1) tokenize train data and build the vocabulary + 2)`transform` converts text to a bow representation: SciPy sparse matrix only stores non-zero entries
cv_fit = cv.fit_transform(text_train) #class scipy.sparse._csr.csr_matrix
cv_vocab = cv.get_feature_names_out() #class numpy.ndarray
print("Vocab: ", len(cv_vocab))
print("Preview vocab: ", cv_vocab[:10])

# (documents, vocabulary size) - read straight from the sparse matrix, no dense copy needed
cv_fit.shape


In [None]:
#lookup token
# Per-term totals: column sums taken on the sparse matrix, flattened to a 1-D array
cv_count_list = np.asarray(cv_fit.sum(axis=0)).ravel()

# term -> total occurrences across all documents
d_all = dict(zip(cv_vocab, cv_count_list))

token = 'Carl'
# Try the token as typed first, then its lower-cased form.
# A token missing under both spellings is reported with a count of 0 instead of raising
# (previously the fallback lookup raised KeyError and the `finally` print then failed on
# an unbound `this_count`).
this_token = token if token in d_all else token.lower()
this_count = d_all.get(this_token, 0)
print(this_token, this_count)

In [None]:
cv_vocab, cv_count_list, cv_count_list.sum()

In [None]:
assert sum(cc_word_counter.values()) == cv_count_list.sum()

### 1.2c `TfidfVectorizer()`

Convert a collection of raw documents to a matrix of TF-IDF features  
Equivalent to `CountVectorizer` followed by `TfidfTransformer`

In [None]:
# Same token definition as the CountVectorizer cell; raw string keeps `\$` intact for the regex engine
tfidf_v = TfidfVectorizer(token_pattern=r'[a-zA-Z0-9_\$]+') #$config lcase lowercase=True,
# 1) tokenize train data and build the vocabulary + 2)`transform` converts text to a tf-idf weighted representation: SciPy sparse matrix only stores non-zero entries
tfidf_v_fit = tfidf_v.fit_transform(text_train) #class scipy.sparse._csr.csr_matrix
tfidf_v_vocab = tfidf_v.get_feature_names_out() #class numpy.ndarray
print("Vocab: ", len(tfidf_v_vocab))
print("Preview vocab: ", tfidf_v_vocab[:10])

# (documents, vocabulary size) - read straight from the sparse matrix, no dense copy needed
tfidf_v_fit.shape

In [None]:
tfidf_v_count_list = tfidf_v_fit.toarray().sum(axis=0)
tfidf_v_vocab, tfidf_v_count_list, tfidf_v_count_list.sum()

### 1.2.1 Diff between CC, CV and TfidfV vocabs
eventually get to the point of no diff

In [None]:
# Pairwise vocabulary differences; all three arrays are empty once the tokenizers agree
cc_vocab_list = list(cc_word_counter)
cv_vocab_list = cv_vocab.tolist()

(
    np.setdiff1d(cc_vocab_list, cv_vocab_list),           # in Counter, not in CountVectorizer
    np.setdiff1d(cv_vocab_list, cc_vocab_list),           # in CountVectorizer, not in Counter
    np.setdiff1d(tfidf_v_vocab.tolist(), cc_vocab_list),  # in TfidfVectorizer, not in Counter
)

In [None]:
#save 3 vocabs
def _save_vocab(path, vocab):
    """Write one vocabulary term per row to a pipe-delimited, UTF-8 CSV file.

    Parameters
    ----------
    path : str
        Destination file; it is overwritten if it already exists.
    vocab : iterable of str
        Terms to write, one per row.

    Returns
    -------
    bool
        True when the file was written. False when an OS-level error occurred
        (missing folder, permissions, ...); the error is printed and not re-raised,
        so the remaining files are still attempted.
    """
    try:
        with open(path, 'w', newline='', encoding='UTF-8') as myfile:
            csv.writer(myfile, delimiter='|').writerows([term] for term in vocab)
    except OSError as err:
        # Only file-system failures are handled here; anything else (e.g. an interrupt
        # or a programming error) propagates instead of being silently swallowed.
        print("An error occurred")
        print("Error details: {}".format(err))
        return False
    return True


for vocab_path, vocab_terms in (
    (DATA_DIR + '/my_cc_vocab.csv', cc_word_counter),
    (DATA_DIR + '/my_cv_vocab.csv', cv_vocab),
    (DATA_DIR + '/my_tfidf_v_vocab.csv', tfidf_v_vocab),
):
    _save_vocab(vocab_path, vocab_terms)


- EDA - with WordCloud
src https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

In [None]:
# Word cloud of the tidy corpus before any stop-word tuning.
# generate() returns the WordCloud itself, so construction and fitting are chained.
wordcloud_raw = WordCloud(
    background_color="white",
    max_words=5000,
    include_numbers=True,
    collocation_threshold=5,
    width=600,
    height=300,
    contour_width=3,
    contour_color='steelblue',
).generate(long_string_tidy)

# Render as an image for inline display
wordcloud_raw.to_image()

### 1.3 Real Vectorizing

### 1.3a `CountVectorizer()`
with trigrams

In [None]:
# Unigrams + bigrams + trigrams; raw string keeps `\$` intact for the regex engine
cv2 = CountVectorizer(token_pattern=r'[a-zA-Z0-9_\$]+', ngram_range=(1,3))
cv2_fit = cv2.fit_transform(text_train) #class scipy.sparse._csr.csr_matrix
cv2_vocab = cv2.get_feature_names_out() #class numpy.ndarray
print("Vocab: ", len(cv2_vocab))
print("Preview vocab: ", cv2_vocab[:10])

# Shape read from the sparse matrix: a dense copy of an n-gram matrix can be very large
cv2_fit.shape

remove `stopwords`

In [None]:
# Same 1-3-gram setup, with scikit-learn's built-in English stop-word list removed;
# raw string keeps `\$` intact for the regex engine
cv3 = CountVectorizer(token_pattern=r'[a-zA-Z0-9_\$]+', ngram_range=(1,3), stop_words='english')
cv3_fit = cv3.fit_transform(text_train) #class scipy.sparse._csr.csr_matrix
cv3_vocab = cv3.get_feature_names_out() #class numpy.ndarray
print("Vocab: ", len(cv3_vocab))
print("Preview vocab: ", cv3_vocab[:10])
#print("cv3_fit:\n{}".format(repr(cv3_fit)))

# Shape read from the sparse matrix: a dense copy of an n-gram matrix can be very large
cv3_fit.shape

- Diff between CVs trigrams  
removed `stopwords` vs not

In [None]:
#preview diff
np.setdiff1d(cv3_vocab, cv2_vocab), np.setdiff1d(cv2_vocab, cv3_vocab)

In [None]:
# Total occurrences per n-gram: column sums computed on the sparse matrix (no dense copy),
# flattened to a 1-D array aligned with cv2_vocab
cv2_count_list = np.asarray(cv2_fit.sum(axis=0)).ravel()
cv2_vocab, cv2_count_list, cv2_count_list.sum()

In [None]:
# Total occurrences per n-gram: column sums computed on the sparse matrix (no dense copy),
# flattened to a 1-D array aligned with cv3_vocab
cv3_count_list = np.asarray(cv3_fit.sum(axis=0)).ravel()
cv3_vocab, cv3_count_list, cv3_count_list.sum()

### 1.3b `TfidfVectorizer()`

with trigrams

In [None]:
# TF-IDF over unigrams + bigrams + trigrams; raw string keeps `\$` intact for the regex engine
tfidf_v2 = TfidfVectorizer(token_pattern=r'[a-zA-Z0-9_\$]+', ngram_range=(1,3))
tfidf_v2_fit = tfidf_v2.fit_transform(text_train)
tfidf_v2_vocab = tfidf_v2.get_feature_names_out() #class numpy.ndarray
print("Vocab: ", len(tfidf_v2_vocab))
print("Preview vocab: ", tfidf_v2_vocab[:10])

# Shape read from the sparse matrix: a dense copy of an n-gram matrix can be very large
tfidf_v2_fit.shape

In [None]:
#confirm same n-grams
assert len(cv2_vocab) == len(tfidf_v2_vocab)

remove `stopwords`

In [None]:
# TF-IDF over 1-3-grams with scikit-learn's built-in English stop-word list removed;
# raw string keeps `\$` intact for the regex engine
tfidf_v3 = TfidfVectorizer(token_pattern=r'[a-zA-Z0-9_\$]+', ngram_range=(1, 3), stop_words="english")
tfidf_v3_fit = tfidf_v3.fit_transform(text_train)
tfidf_v3_vocab = tfidf_v3.get_feature_names_out() #class numpy.ndarray
print("Vocab: ", len(tfidf_v3_vocab))
print("Preview vocab: ", tfidf_v3_vocab[:10])

# Shape read from the sparse matrix: a dense copy of an n-gram matrix can be very large
tfidf_v3_fit.shape

In [None]:
#confirm same n-grams
assert len(cv3_vocab) == len(tfidf_v3_vocab)

- Diff between TfidfVs trigrams  
removed `stopwords` vs not

In [None]:
#preview diff
np.setdiff1d(tfidf_v3_vocab, tfidf_v2_vocab), np.setdiff1d(tfidf_v2_vocab, tfidf_v3_vocab)

In [None]:
tfidf_v2_count_list = tfidf_v2_fit.toarray().sum(axis=0)
tfidf_v2_vocab, tfidf_v2_count_list, tfidf_v2_count_list.sum()

In [None]:
tfidf_v3_count_list = tfidf_v3_fit.toarray().sum(axis=0)
tfidf_v3_vocab, tfidf_v3_count_list, tfidf_v3_count_list.sum()

with `min_df` or `max_df`?   
'min_df' gets rid of important tokens

In [None]:
# Final candidate: 1-3-grams, English stop words removed, and max_df=.1 to drop terms that
# appear in more than 10% of the documents; raw string keeps `\$` intact for the regex engine
tfidf_v4 = TfidfVectorizer(token_pattern=r'[a-zA-Z0-9_\$]+', ngram_range=(1, 3), stop_words="english", max_df = .1)
tfidf_v4_fit = tfidf_v4.fit_transform(text_train)
tfidf_v4_vocab = tfidf_v4.get_feature_names_out() #class numpy.ndarray
print("Vocab: ", len(tfidf_v4_vocab))
print("Preview vocab: ", tfidf_v4_vocab[:10])

# Shape read from the sparse matrix: a dense copy of an n-gram matrix can be very large
tfidf_v4_fit.shape

In [None]:
tfidf_v4_count_list = tfidf_v4_fit.toarray().sum(axis=0)
tfidf_v4_vocab, tfidf_v4_count_list, tfidf_v4_count_list.sum()

- Diff between TfidfVs trigrams  
with `max_df` vs not

In [None]:
#preview diff
np.setdiff1d(tfidf_v4_vocab, tfidf_v3_vocab), np.setdiff1d(tfidf_v3_vocab, tfidf_v4_vocab)


In [None]:
uds_stopwords = np.setdiff1d(tfidf_v3_vocab, tfidf_v4_vocab)

In [None]:
#save final vocab
# One term per row, pipe-delimited, UTF-8. Only file-system failures are reported here;
# any other exception propagates instead of being hidden by a bare `except`.
try:
    with open(DATA_DIR + '/my_vocab.csv', 'w', newline='', encoding='UTF-8') as myfile:
        wr = csv.writer(myfile, delimiter = '|')
        wr.writerows([r] for r in tfidf_v4_vocab)
except OSError as err:
    print("An error occurred")
    print("Error details: {}".format(err))

- EDA Final Vectorizer

In [None]:
# Smallest and largest per-feature tf-idf totals across the dataset
v_min, v_max = tfidf_v4_count_list.min(), tfidf_v4_count_list.max()
# only the minimum is rounded (4 decimals) for display
print("Min {} and Max {} ".format(np.round(v_min, 4), v_max))

In [None]:
# Positions whose total equals the min / max found above (ties give several positions)
idx_min = np.nonzero(tfidf_v4_count_list == v_min)
idx_max = np.nonzero(tfidf_v4_count_list == v_max)

# Map the positions back to their n-grams
print("Min feature(s): ", tfidf_v4_vocab[idx_min])
print("\nMax feature(s): ", tfidf_v4_vocab[idx_max])

In [None]:
tfidf_v4_fit.max(axis=0).toarray(), tfidf_v4_fit.toarray()

In [None]:
# Highest tf-idf score each feature reaches in any single document
vals_max = tfidf_v4_fit.max(axis=0).toarray().ravel() #class numpy.ndarray
print("Max value shape:", vals_max.shape)

# Feature indices ordered from lowest to highest peak score
sorted_by_tfidf = np.argsort(vals_max)
lowest_20, highest_20 = sorted_by_tfidf[:20], sorted_by_tfidf[-20:]

print("Features with lowest tfidf:\n{}".format(tfidf_v4_vocab[lowest_20]))

print("Features with highest tfidf: \n{}".format(tfidf_v4_vocab[highest_20]))

# Features ordered by the fitted idf_ weights, lowest first
sorted_by_idf = tfidf_v4.idf_.argsort()
print("Features with lowest idf:\n{}".format(tfidf_v4_vocab[sorted_by_idf[:20]]))

- EDA Final Vectorizer WordCloud

In [None]:
#$actodo https://stackoverflow.com/questions/53997443/how-to-add-extra-stop-words-in-addition-to-default-stopwords-in-wordcloud
# Combined list: terms dropped by max_df (uds_stopwords) followed by scikit-learn's English list
my_stop_words = [*uds_stopwords, *text.ENGLISH_STOP_WORDS]
# sizes: combined, dataset-specific, built-in
len(my_stop_words), len(uds_stopwords), len(text.ENGLISH_STOP_WORDS)

In [None]:
# Word cloud of the joined corpus using the combined stop-word list.
# generate() returns the WordCloud itself, so construction and fitting are chained.
wordcloud = WordCloud(
    stopwords=my_stop_words,
    background_color="white",
    max_words=5000,
    include_numbers=True,
    collocation_threshold=5,
    width=600,
    height=300,
    contour_width=3,
    contour_color='steelblue',
).generate(long_string)

# Render as an image for inline display
wordcloud.to_image()

In [None]:
# NOTE(review): `mystop` is not assigned anywhere in the visible notebook, so evaluating it
# raises NameError. Presumably a deliberate tripwire to halt "Run All" here - confirm, or remove.
mystop

In [None]:
# Larger cloud with the combined stop-word list, drawn through matplotlib
cloud_options = dict(stopwords=my_stop_words, background_color="white", width=800, height=400)
wordcloud = WordCloud(**cloud_options).generate(long_string) #

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")  # the image needs no ticks or frame
plt.show()

In [None]:
# NOTE(review): `mystop` is not assigned anywhere in the visible notebook, so evaluating it
# raises NameError. Presumably a deliberate tripwire to halt "Run All" here - confirm, or remove.
mystop

In [None]:
# Build the cloud from the final vectorizer's vocabulary (one entry per n-gram) rather than
# from the raw documents. A separate name is used so the corpus-wide `long_string` built
# earlier is not overwritten and the cells above stay re-runnable.
#long_string = ','.join(list(text_train))
vocab_string = ','.join(tfidf_v4.vocabulary_)
print(len(vocab_string))

# Create a WordCloud object
# `stopwords` must be a collection of words: a plain string such as "english" is iterated
# character by character, so only the single letters e/n/g/l/i/s/h would be filtered.
# Use scikit-learn's English stop-word list (imported above as `text`).
wordcloud = WordCloud(stopwords = set(text.ENGLISH_STOP_WORDS), background_color="white", max_words=5000, include_numbers=True, collocation_threshold=5, width=600, height=300, contour_width=3, contour_color='steelblue')
# Generate a word cloud
wordcloud.generate(vocab_string)
# Visualize the word cloud
wordcloud.to_image()

In [None]:
# NOTE(review): `mystop` is not assigned anywhere in the visible notebook, so evaluating it
# raises NameError. Presumably a deliberate tripwire to halt "Run All" here - confirm, or remove.
mystop

## Xtra

In [None]:
#Python - sum of all counts in collections.Counter
# refer to https://testdriven.io/tips/6729e7af-9482-4b37-a780-fab42b709841/
from collections import Counter

pencil_stock = Counter({"Red": 17, "Blue": 5, "Green": 9}) #Counter for dicts

print(len(pencil_stock), pencil_stock.total())

# Xtra

#### Bag-of-Words
- Tokenize dataset and build the vocabulary  
- Review vocabulary and features  
- Create bow representation of training data - SciPy sparse matrix
- dtm "dense" NumPy array to look at the actual content

In [None]:
#by default extracts tokens using a regex "\b\w\w+\b" - all sequences of chars that consist of at least two letters or numbers (\w) and that are separated by word boundaries(\b) => no single letter words, splits `don't` or `bit.ly`
# Default min_df (1) is used: every term in the vocabulary occurs in at least one document,
# so nothing is filtered, and an integer min_df of 0 is rejected by current scikit-learn
# parameter validation.
vectorizer = CountVectorizer()

# `fit` to tokenize train data and build the vocabulary - <class 'dict'> {word,index}
# Only the first 10 documents are used for this walkthrough.
vectorizer.fit(text_train.iloc[:10])

# access vocab with attribute vocabulary_ <class 'dict'>
print("Vocab size: {}".format(len(vectorizer.vocabulary_)))
print("Vocab (with word indices): {}".format(vectorizer.vocabulary_))

In [None]:
# get features
# get_feature_names() was removed from scikit-learn; get_feature_names_out() is its
# replacement and is what the rest of this notebook uses. It returns a numpy array,
# converted to a list here so the printout keeps its list formatting.
feature_names = vectorizer.get_feature_names_out().tolist()
print("Vocab size:{}".format(len(feature_names)))
print("Vocab (ordered alphabetically): {}".format(feature_names))

In [None]:
# Encode the same 10 documents as a bag of words with the fitted vocabulary
first_docs = text_train.iloc[:10]
bow = vectorizer.transform(first_docs)
# the result is a SciPy sparse matrix: only non-zero entries are stored

print("Bag-of-words: {}".format(repr(bow)))

In [None]:
# The vectorizer output is sparse to save memory;
# expand it to a "dense" NumPy array to inspect the actual document-term counts
dtm = bow.toarray()

print("document-term dimensions:", dtm.shape)
print(dtm)


Manually get word frequencies

In [None]:
#10 records x 121 words
print(dtm.shape)

#each row=record, how many words each?
words_per_record = dtm.sum(axis=1)
print(words_per_record)

#each column=word, frequency of each word?
freq_per_word = dtm.sum(axis=0)
print(freq_per_word)

#confirm sums: both directions must add up to the same grand total
words_per_record.sum(), freq_per_word.sum()

In [None]:
#$xtra save vocab base
# Writes one "term,index" line per vocabulary entry of `vect_ngram` to a CSV file.
# NOTE(review): `vect_ngram` is not defined anywhere in the visible notebook, so this cell
# raises NameError on a fresh kernel (only IOError is caught below) - presumably left over
# from another notebook; confirm which vectorizer was meant.
# NOTE(review): `csv_columns` is assigned but never used here; no header row is written.
csv_columns = ['word','idx']
csv_file = "myExplore/vocab_ngrams3_index.csv"
try:
    with open(csv_file, 'w') as f:
        for key in vect_ngram.vocabulary_.keys():
            # plain string formatting, no CSV quoting is applied to the term
            f.write("%s,%s\n"%(key,vect_ngram.vocabulary_[key]))
except IOError:
    print("I/O error")

toGist $actodo

In [None]:
#save vocab c1
# Writes one "term,index" line per vocabulary entry of `vect_c2` to a CSV file.
# NOTE(review): `vect_c2` is not defined anywhere in the visible notebook, so this cell
# raises NameError on a fresh kernel (only IOError is caught below) - presumably left over
# from another notebook; confirm which vectorizer was meant.
# NOTE(review): `csv_columns` is assigned but never used here; no header row is written.
csv_columns = ['word','idx']
csv_file = "myExplore/vocab_base_index_tfidif_min001.csv"
try:
    with open(csv_file, 'w') as f:
        for key in vect_c2.vocabulary_.keys():
            # plain string formatting, no CSV quoting is applied to the term
            f.write("%s,%s\n"%(key,vect_c2.vocabulary_[key]))
except IOError:
    print("I/O error")
