# Create a TF-IDF Model from a Corpus of News Articles

In [5]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# NLP Packages
import gensim
from gensim.corpora import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.matutils import sparse2full
import numpy as np
import spacy
nlp  = spacy.load('en_core_web_sm')

# Custom functions
from nlp_functions import preprocess_spacy, preprocess_docs,doc_embed_charity_notfidf

# Seed NumPy's global random number generator so runs of this notebook are repeatable.
np.random.seed(400)
# Populates the interactive namespace from numpy and matplotlib (see the cell output below).
# NOTE(review): %pylab star-imports many names; consider `%matplotlib inline` if no cell relies on them.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


## 0.0 Load Data

In [8]:
# The project root sits two directory levels above the current working directory.
root_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

# All raw news articles live in a single CSV under <root>/raw_data.
file_name = os.path.join(root_dir, 'raw_data', 'news_data_all.csv')
all_news = pd.read_csv(file_name)

## 1.0 Trim and Generate Features from News Data

In [10]:
# Trim to only the columns we care about and drop any rows that don't have a
# title, publication or content. The explicit copy makes the column assignment
# below operate on an independent frame rather than on a slice of `all_news`.
all_news_use = all_news[['title', 'publication', 'content']].dropna().copy()

# Create article size feature that counts the number of words in each article
# (tokens separated by a single space). Vectorised string ops replace the
# row-wise DataFrame.apply, which is much slower on a corpus of this size.
all_news_use['article_length'] = all_news_use['content'].str.split(" ").str.len()

Check the distribution of article lengths and keep only the middle 50% of the distribution mass, to exclude the shortest and longest articles.

In [13]:
# Summary statistics of the word counts; the 25% and 75% quartiles shown here
# are the bounds used by the article-length filter further down.
all_news_use['article_length'].describe()

count    188725.000000
mean        834.514844
std         863.773999
min           1.000000
25%         368.000000
50%         672.000000
75%        1052.000000
max       50517.000000
Name: article_length, dtype: float64

## 3.0 Preprocess Article Text

In [14]:
# Keep only articles whose length lies in the inter-quartile range reported by
# describe() above (25% = 368 words, 75% = 1052 words), bounds included.
# The bounds are fixed constants on purpose: recomputing quantiles here would
# shrink the data again every time the cell is re-run.
MIN_ARTICLE_LENGTH = 368
MAX_ARTICLE_LENGTH = 1052

# `inclusive` must be the string 'both': the boolean form is deprecated since
# pandas 1.3 and rejected by pandas 2.x.
all_news_use = all_news_use[
    all_news_use['article_length'].between(
        MIN_ARTICLE_LENGTH, MAX_ARTICLE_LENGTH, inclusive='both'
    )
]

In [7]:
# Article bodies, coerced to plain strings before text processing.
articles_text = all_news_use['content'].astype(str)

# Run the project's preprocessing helper over every article.
articles_text_pre = preprocess_docs(articles_text)

## 4.0 Train TF-IDF Model

In [13]:
def news_tfidf(processed_docs, word_min=5, word_max_perc=.2):
    """Build a gensim dictionary and TF-IDF model from pre-processed documents.

    Assumes docs have already been pre-processed, i.e. every document is a
    sequence of tokens that ``Dictionary`` / ``doc2bow`` can consume.

    Parameters
    ----------
    processed_docs : iterable
        The pre-processed documents. May be a one-shot iterable (e.g. a
        generator); it is materialised into a list internally.
    word_min : int, default 5
        Passed to ``Dictionary.filter_extremes`` as ``no_below``.
    word_max_perc : float, default .2
        Passed to ``Dictionary.filter_extremes`` as ``no_above``.

    Returns
    -------
    tuple
        ``(docs_dict, model_tfidf, docs_tfidf)``: the filtered dictionary, the
        fitted ``TfidfModel``, and the result of applying that model to the
        bag-of-words corpus.
    """
    # The documents are walked twice (dictionary build, then bag-of-words
    # conversion); a generator would be exhausted after the first pass and
    # silently produce an empty corpus, so materialise it once up front.
    processed_docs = list(processed_docs)

    # Create dictionary from corpus and prune rare / very common tokens.
    docs_dict = Dictionary(processed_docs)
    docs_dict.filter_extremes(no_below=word_min, no_above=word_max_perc)
    # Reassign ids to close gaps left by the pruning (presumably already done
    # by filter_extremes in current gensim; kept as a harmless safeguard).
    docs_dict.compactify()

    # Convert docs into tf-idf vectors.
    docs_corpus = [docs_dict.doc2bow(doc) for doc in processed_docs]
    model_tfidf = TfidfModel(docs_corpus, id2word=docs_dict)
    docs_tfidf = model_tfidf[docs_corpus]

    return docs_dict, model_tfidf, docs_tfidf

In [58]:
# Vocabulary pruning settings. They are defined once and passed to the trainer
# so that the model and the file name later built from these same variables
# cannot disagree.
# NOTE(review): the variables used to be 300 / .2 while the call hard-coded
# 100 / .4, so the saved file name described a different model than the one
# trained. The values below are the ones the model was actually trained with;
# confirm they are the intended ones and update any loader that expects the
# old "min_300_max_0.2" file name.
word_min = 100
word_max_perc = .4
news_dict, news_tfidf_model, news_docs_tfidf = news_tfidf(
    articles_text_pre, word_min=word_min, word_max_perc=word_max_perc
)

## 5.0 Save Preprocessed Articles and Model

Save the preprocessed articles in case I want to retrain the model with different parameters.

In [21]:
import pickle

# Build the target path first; the original one-liner was missing the closing
# parenthesis of os.path.join(...) and therefore did not parse.
# NOTE(review): the raw CSV is read from "<root>/raw_data" but this writes to
# "<root>/raw_dir" — confirm the folder name is intended. The payload is a
# pickle despite the ".txt" extension.
preprocessed_path = os.path.join(root_dir, "raw_dir", "articles_text_preprocessed.txt")
with open(preprocessed_path, "wb") as fp:   # Pickling
    pickle.dump(articles_text_pre, fp)

In [74]:
#Store output in dictionary and save
news_tfidf_dict = {'news_dict': news_dict, 
                      'news_tfidf_model': news_tfidf_model}

with open(os.path.join(root_dir,'Giver','models','news','news_tfidf_min_{}_max_{}.pickle'.format(word_min,word_max_perc), 'wb') as handle:
    pickle.dump(news_tfidf_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)