In [1]:
## Note that the active directory is already set in function_file_news_corpus
## (refer to line 48 and line 49 of the function file).
## Keep all files in the same active directory.
from function_file_news_corpus import *

In [None]:
## Build the two filter lists used by the cleaning steps further down.
## stopplaces() returns a list of user-defined place junk to be eliminated from the start of articles.
stop_places = stopplaces()

## stopwords() returns a list of NLTK stopwords + user-defined stopwords.
stop_words = stopwords()

## hindu_df.pkl is the web-scraped corpus in pickle format: a data frame with
## the columns NewsBody, DateTime and Heading.
## The three columns are cast to string and duplicated rows, if any, are dropped.
## NOTE: pickle files should only be loaded from a trusted source.
news_df = (
    pd.read_pickle('hindu_df.pkl')
    .astype({'NewsBody': str, 'DateTime': str, 'Heading': str})
    .drop_duplicates()
)
print(len(news_df))


In [363]:

## Step 1: clean the article bodies with clean_corpus_1 (function file line 57).
## Rows whose cleaned body equals 0 are removed (presumably clean_corpus_1
## returns 0 for articles to discard -- confirm in the function file).
## The .copy() makes the filtered frame independent of the unfiltered one, so
## the column assignments below do not raise pandas' SettingWithCopyWarning.
news_df['NewsBody'] = news_df['NewsBody'].apply(clean_corpus_1)
news_df = news_df[news_df['NewsBody'] != 0].copy()

## Rename the NewsBody column to text and keep a copy of it, as it stands
## after step 1, in original_text.
news_df = news_df.rename(columns={'NewsBody': 'text'})
news_df['original_text'] = news_df['text']

## Clean the other columns with clean_non_news_body_cols (function file line 452).
news_df['DateTime'] = news_df['DateTime'].apply(clean_non_news_body_cols)
news_df['Heading'] = news_df['Heading'].apply(clean_non_news_body_cols)


## Step 2: clean_corpus_2 eliminates dangling places (using stop_places).
news_df['text'] = news_df['text'].apply(clean_corpus_2, stop_places=stop_places)

## Add a 1-based document number column, doct_no.
news_df['doct_no'] = range(1, len(news_df) + 1)


## Step 3: clean_corpus_3 eliminates stopwords (using stop_words) and
## lemmatizes words using POS tagging.
news_df['text'] = news_df['text'].astype(str)
news_df['text'] = news_df['text'].apply(clean_corpus_3, stop_words=stop_words)


## Step 4: clean_corpus_4 converts high-frequency adverbs / adjectives in the
## text to their root verb / noun.
news_df['text'] = news_df['text'].apply(clean_corpus_4)



In [None]:
## Read the corpus-specific bigram and trigram lists and hand them to
## clean_corpus_5_replace_unigrams_with_bi_grams, which replaces unigrams that
## co-occur as bigrams / trigrams with those bigrams / trigrams.
## Two passes are made, one per pair of Excel files, in the order listed.
for bigram_file, trigram_file in (
    ('bigrams_1.xlsx', 'trigrams_1.xlsx'),
    ('bigrams_2.xlsx', 'trigrams_2.xlsx'),
):
    bi_gram_lis = pd.read_excel(bigram_file).Bigrams.tolist()
    tri_gram_lis = pd.read_excel(trigram_file).Trigrams.tolist()

    news_df['text'] = news_df['text'].apply(
        clean_corpus_5_replace_unigrams_with_bi_grams,
        bi_gram_lis=bi_gram_lis,
        tri_gram_lis=tri_gram_lis,
    )


In [None]:
## Count the token 'aadhaar' in each article.
## Do not use bigrams or unigrams here;
## simply apply the function tokenizer_tf_idf (which separates words on spaces).
## The goal of this cell is to eliminate articles where aadhaar is not used as
## aadhaar but for some other reference, and to keep the remaining columns.

## Helper columns: new_text (output of clean_check), new_tokens (tokenized
## new_text) and aadhaar_count (count_token with token='aadhaar').
news_df['new_text'] = news_df['text'].map(clean_check)
news_df['new_tokens'] = news_df['new_text'].map(tokenizer_tf_idf)
news_df['aadhaar_count'] = news_df['new_tokens'].apply(count_token, token = 'aadhaar')
## NOTE: this bare expression is not the last one in the cell, so its value
## (the number of rows with aadhaar_count == 1) is computed but never displayed.
len(news_df[news_df['aadhaar_count']== 1])
## Histogram of the per-article counts, using 100 bins.
news_df['aadhaar_count'].hist(bins=100)

## presumably adds the top_1_counts column that is read further down in this
## cell -- confirm against count_most_common in the function file.
news_df = count_most_common(news_df,'new_tokens', 1)

def col_to_str(top_count_col):
    """Return the first item obtained by iterating over ``top_count_col``.

    Returns None when ``top_count_col`` yields no items.
    """
    return next(iter(top_count_col), None)
    
## Reduce each top_1_counts entry to its first item (see col_to_str).
news_df['new_col'] = news_df['top_1_counts'].apply(col_to_str)

## Comparison list read from the 'entires' column of
## hindu_non_relevant_entries.xlsx, cast to string.
d_stop = pd.read_excel('hindu_non_relevant_entries.xlsx')
d_stop['entires'] = d_stop['entires'].astype(str)

## remove_more_junk_hindu_posts writes a flag per row into 'compare';
## rows flagged 1 are kept below, rows flagged 0 are counted and dropped.
news_df['compare'] = news_df['new_col'].apply(
    remove_more_junk_hindu_posts, compare_list=d_stop['entires'].tolist())
print(len(news_df[news_df['compare'] == 0]))
print('original', len(news_df))
news_df = news_df[news_df['compare'] == 1]
print('final', len(news_df))

## Remove the helper columns. drop() is used without inplace=True so that the
## filtered slice above is never mutated in place, which is what triggers
## pandas' chained-assignment (SettingWithCopyWarning) check.
news_df = news_df.drop(
    ['new_text', 'new_tokens', 'top_1_counts', 'new_col', 'compare'], axis=1)


## Persist the cleaned corpus for the following cells.
news_df.to_pickle('final_clean_hindu_corpus.pkl')


In [None]:
## Generate the TF-IDF table.
## Note: all relevant trigrams and bigrams have already been captured in the text.
## TF_IDF inputs are: the data frame, the text column, the minimum cut-off (all
## words in fewer documents than the minimum cut-off will not be considered)
## and the n-gram range (min_n_gram, max_n_gram).
tfidf = TF_IDF(news_df, 'text', 30, (1, 1))

## ExcelWriter.save() was deprecated in pandas 1.5 and removed in pandas 2.0.
## Using the writer as a context manager saves and closes the workbook on exit,
## also when to_excel raises.
with pd.ExcelWriter('hindu_tfidf_011.xlsx') as writer_orig:
    tfidf.to_excel(writer_orig, sheet_name='hindu_tfidf_010')


In [None]:
## Read the cleansed file.
## Remove more duplicates using the function clear_more_duplicates.
## Use the function remove_more_stopwords to remove even more stop words at
## this stage, now that the TF-IDF has been generated.
## Convert the date-time into a readable / usable format using the function date_time_hindu.
## Save the file in pkl format.
## NOTE: the result is written back to the same file that is read here, so
## re-running this cell applies the steps again to already processed data.

news_df = pd.read_pickle('final_clean_hindu_corpus.pkl')
news_df = clear_more_duplicates(news_df, 'text')
news_df['text'] = news_df['text'].map(remove_more_stopwords)
news_df = date_time_hindu(news_df, 'DateTime')

news_df.to_pickle('final_clean_hindu_corpus.pkl')


In [None]:
## Reload the saved corpus and discard its DateTime column.
news_df = pd.read_pickle('final_clean_hindu_corpus.pkl').drop(['DateTime'], axis=1)
