In [1]:
# Auto Reload
%load_ext autoreload
%autoreload 2

In [None]:
# Import Modules
import csv
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import json
from pprint import pprint
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from math import log
import re
import os
from gensim import corpora
from gensim.models.doc2vec import TaggedDocument
from gensim.models.doc2vec import Doc2Vec
from wordcloud import WordCloud

# NLTK Modules
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('tagsets')
from nltk import sent_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import chunk
from nltk.util import ngrams

# Import Custom Modules
from src.data_cleaner import *
from src.dummy_words import *

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

In [None]:
# Load the pre-cleaned dataset from disk.
# NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source.
clean_df = pd.read_pickle('data/clean_data.pkl')
# Remove every period character from the cleaned text of each document
clean_df['clean_text'] = clean_df['clean_text'].apply(lambda text: text.replace(".", ""))
# Uncomment to inspect the first document: clean_df['clean_text'][0]

In [None]:
# Preview the first two rows to see which features the frame currently holds
# (use clean_df.columns instead to list only the column names)
clean_df.head(n=2)

# Text Mining Algorithms

### Vectorizing: Term Frequency-Inverse Document Frequency (TF-IDF)

> #### Getting TERM FREQUENCY
The number of times a term occurs in a specific document, normalized by the total number of terms in that document:

$tf(term,document) = \frac{\# \ of \ times \ a \ term \ appears \ in \ a \ document}{\#\ of\ terms\ in\ the\ document}$

In [None]:
# New column: for every document, a Counter mapping each token to how often it occurs
clean_df['term_occurences'] = clean_df['tokens'].map(lambda doc_tokens: Counter(doc_tokens))
# Uncomment to inspect the first document: clean_df['term_occurences'][0]

For the term frequencies, we use the tokens from which the stop-words were NOT filtered out.

In [None]:
# New column: per-document term frequency, i.e. each token's occurrence count
# divided by the number of tokens in that same document.
clean_df['term_frequency'] = [
    {term: (count / float(len(doc_tokens))) for term, count in occurrences.items()}
    for doc_tokens, occurrences in zip(clean_df['tokens'], clean_df['term_occurences'])
]
# Uncomment to inspect the first document: clean_df['term_frequency'][0]

> #### Getting DOCUMENT FREQUENCY

$df(term,corpus) = \frac{ \# \ of \ documents \ that \ contain \ a \ term}{ \# \ of \ documents \ in \ the \ corpus}$


In [None]:
# Document occurrences: for every term, the number of documents that contain it.
# Each document is reduced to a set first so a term is counted at most once per document.
doc_occ = Counter(
    term
    for doc_tokens in clean_df['tokens']
    for term in set(doc_tokens)
)
# Uncomment to inspect: doc_occ

In [None]:
# Document frequency: the share of documents in the corpus that contain each term
num_documents = float(len(clean_df['tokens']))
doc_freq = {term: (containing_docs / num_documents)
            for term, containing_docs in doc_occ.items()}
# Uncomment to inspect: doc_freq

> #### TFIDF vector

$df(term,corpus) = \frac{ \# \ of \ documents \ that \ contain \ a \ term}{ \# \ of \ documents \ in \ the \ corpus}$

The inverse document frequency is defined in terms of the document frequency as

$idf(term,corpus) = \log{\frac{1}{df(term,corpus)}}$.

TF-IDF stands for "term frequency-inverse document frequency" and is the product of two parts: the term frequency tf and the inverse document frequency idf. The term frequency is the (length-normalized) count of a term within a document, as defined above.

$tfidf(term,document,corpus) = tf(term,document) \cdot idf(term,corpus)$

In [None]:
# Fit a TF-IDF model on the cleaned text. In `vec`, rows are documents and
# columns are vocabulary terms.
tf_vectorizer = TfidfVectorizer()
vec = tf_vectorizer.fit_transform(clean_df['clean_text'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
tfidf_terms = (tf_vectorizer.get_feature_names_out()
               if hasattr(tf_vectorizer, 'get_feature_names_out')
               else tf_vectorizer.get_feature_names())

# Transpose so that each row is a term and each column is a document.
vector_df_tf = pd.DataFrame(vec.toarray().transpose(),
                            index=tfidf_terms)
vector_df_tf

#### Testing Count Vectorizer

In [None]:
# Fit a raw-count bag-of-words model on the cleaned text. In `vec`, rows are
# documents and columns are vocabulary terms.
count_vec = CountVectorizer()
vec = count_vec.fit_transform(clean_df['clean_text'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
count_terms = (count_vec.get_feature_names_out()
               if hasattr(count_vec, 'get_feature_names_out')
               else count_vec.get_feature_names())

# Transpose so that each row is a term and each column is a document.
vector_df_cnt = pd.DataFrame(vec.toarray().transpose(),
                             index=count_terms)
vector_df_cnt

# TOPIC MODELING

In [None]:
# Build the corpus in three parallel forms, one entry per document:
#   corpus      -- the cleaned text as a single string
#   tokens      -- that text split into word tokens by NLTK
#   tokens_stop -- the token lists stored in the 'tokens_stop' column
corpus = list(clean_df['clean_text'])
tokens = list(map(word_tokenize, corpus))
tokens_stop = list(clean_df['tokens_stop'])


In [None]:
# Create Yearly Corpus: one independent DataFrame per release year.
grouped = clean_df.groupby('year')

# `grouped.groups` maps each key to index *labels*, so feeding it to the
# positional `.iloc` indexer only works by accident when the frame has a
# default 0..n-1 index. `get_group` selects the rows of a group directly and
# is correct for any index. A missing year still raises KeyError, as before.
group_2019 = grouped.get_group(2019).copy()
group_2018 = grouped.get_group(2018).copy()
group_2017 = grouped.get_group(2017).copy()
group_2016 = grouped.get_group(2016).copy()
group_2015 = grouped.get_group(2015).copy()
# Chronological order; later cells pair this list with [2015, ..., 2019].
group_list = [group_2015, group_2016, group_2017, group_2018, group_2019]

In [None]:
# Sanity check on the first document: the first 100 characters of its text and
# the first 5 entries of each of its two token lists.
(
    corpus[0][:100],
    tokens[0][:5],
    tokens_stop[0][:5],
)

## Looking at words by Year

In [None]:
# Extra filler words to exclude from the word cloud and from the frequency plots below
stops = ['im', 'thats', 'ya', 'though', 'yeah']
custom_stop = [
    'yeah', 'like', 'got', '2018', 'know', 'get', 'aint', 'ayy', 'go', 'na', 'back', 'one',
    'gon', 'make', 'wan', 'thats', 'need', 'oh', 'see', 'feat', 'ooh', 'said', 'way', "2017",
] + stops

# Flatten every document's token list into one comma-separated string
long_string = ','.join(','.join(doc_tokens) for doc_tokens in tokens_stop)

# Configure the word cloud (black canvas, at most 500 words, custom stop words excluded)
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color="black",
    max_words=500,
    contour_width=3,
    stopwords=custom_stop,
)

# Build the cloud from the flattened corpus text
wordcloud.generate(long_string)

# Render it as an image so the notebook displays it as the cell output
wordcloud.to_image()

## Plot Most Common Words from CountVectorizer

In [None]:
# Re-fit the count model on the 'tokens_stop' lists, each re-joined into one
# space-separated string per document.
count_vec = CountVectorizer()
vec = count_vec.fit_transform([" ".join(set_) for set_ in clean_df['tokens_stop']])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
count_terms = (count_vec.get_feature_names_out()
               if hasattr(count_vec, 'get_feature_names_out')
               else count_vec.get_feature_names())

# Term-by-document count table (rows: terms, columns: documents).
vector_df_cnt = pd.DataFrame(vec.toarray().transpose(),
                             index=count_terms)

# Visualise the 10 most common words.
# NOTE(review): plot_10_most_common_words is not defined in this notebook --
# presumably it comes from one of the `src` star-imports; confirm it does not
# itself rely on the removed get_feature_names().
plot_10_most_common_words(vec, count_vec)

### Plot again after removing custom stop words (yeah, like, got, 2018, know, get, aint)

In [None]:
# Same as the previous plot, but with the custom stop words dropped from every
# document before counting. A set gives constant-time membership tests.
stop_lookup = set(custom_stop)

count_vec = CountVectorizer()
vec = count_vec.fit_transform(
    [" ".join([x for x in set_ if x not in stop_lookup]) for set_ in clean_df['tokens_stop']]
)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
count_terms = (count_vec.get_feature_names_out()
               if hasattr(count_vec, 'get_feature_names_out')
               else count_vec.get_feature_names())

# Term-by-document count table (rows: terms, columns: documents).
vector_df_cnt = pd.DataFrame(vec.toarray().transpose(),
                             index=count_terms)

# Visualise the 10 most common words.
# NOTE(review): plot_10_most_common_words is not defined in this notebook --
# presumably it comes from one of the `src` star-imports; confirm it does not
# itself rely on the removed get_feature_names().
plot_10_most_common_words(vec, count_vec, '10 Most Common Words w/ Custom Stops')

In [None]:
# One bar chart per year (2x3 grid, last slot left empty) showing the 10 most
# frequent words after the custom stop words are removed.

# Set the seaborn context once, before any axes exist, so that every panel --
# including the first -- is created with the same font scale.
sns.set_context("notebook", font_scale=1.55, rc={"lines.linewidth": 2.5})
fig = plt.figure(figsize=(15, 10))

# A set gives constant-time membership tests while filtering tokens.
stop_lookup = set(custom_stop)

for i, (year, group) in enumerate(zip([2015, 2016, 2017, 2018, 2019], group_list), start=1):
    count_vec = CountVectorizer()
    vec = count_vec.fit_transform(
        [" ".join([x for x in set_ if x not in stop_lookup]) for set_ in group['tokens_stop']]
    )

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer get_feature_names_out() and fall back only on older releases.
    words = (count_vec.get_feature_names_out()
             if hasattr(count_vec, 'get_feature_names_out')
             else count_vec.get_feature_names())

    # Term-by-document count table for this year (rows: terms, columns: documents).
    vector_df_cnt = pd.DataFrame(vec.toarray().transpose(),
                                 index=words)

    # Column sums of the document-term matrix give each word's total count for
    # the year; this replaces the per-row Python accumulation loop.
    total_counts = np.asarray(vec.sum(axis=0)).ravel()

    # Keep the 10 words with the highest totals (sorted() is stable, so ties
    # keep vocabulary order).
    top_words = sorted(zip(words, total_counts), key=lambda pair: pair[1], reverse=True)[0:10]
    words = [word for word, _count in top_words]
    counts = [count for _word, count in top_words]
    x_pos = np.arange(len(words))

    ax = fig.add_subplot(2, 3, i)
    # x and y must be passed by keyword: seaborn 0.12+ no longer accepts them positionally.
    sns.barplot(x=x_pos, y=counts, palette='husl', ax=ax)
    ax.set_title(year)
    ax.set_xticks(x_pos)
    ax.set_xticklabels(words, rotation=90)
    ax.set_xlabel('words')
    ax.set_ylabel('counts')

fig.tight_layout()

## LDA Latent Dirichlet Allocation

## NMF Non Negative Matrix Factorization