# 599 Capstone Project

The notebook is for text data preprocessing.

## Globally import libraries and set display parameters

Libraries needed mostly pertain to dataframe manipulation for data preprocessing.

In [1]:
from collections import defaultdict, Counter
import datetime as dt
import emoji
from icecream import ic
from IPython.display import display_html 
import itertools
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import random
import re
import regex as rex
import shutil
from string import punctuation
import time
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords

import spacy

from sklearn.feature_extraction.text import TfidfTransformer, \
CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline

import textacy.preprocessing as tprep
from textacy.extract import keyword_in_context

Set global parameters.

In [2]:
global_start_time = time.perf_counter()

random.seed(1699)

# Set pandas global options
pd.options.display.max_rows = 23
pd.options.display.precision = 4
np.set_printoptions(suppress=True,
                    precision=4)

%matplotlib inline

# Set tqdm package progress bar
tqdm.pandas(ncols=50)

dply_rng_end01 = 0
dply_rng_end02 = 2

to_csv_flag = True

## Upload data from CSV

Establish working directories for saving dataframes as CSV files.

In [3]:
'''Dir nav citation:
https://softhints.com/python-change-directory-parent/
'''
curr_dir = os.path.abspath(os.curdir)
ic(curr_dir)
os.chdir("..")
up1_dir = os.path.abspath(os.curdir)
ic(up1_dir)
ic()

ic| curr_dir: 'C:\\Users\\acarr\\Documents\\GitHub\\599_team_project\\deliverables'
ic| up1_dir: 'C:\\Users\\acarr\\Documents\\GitHub\\599_team_project'
ic| 652109981.py:9 in <module> at 13:11:04.701


Get current date/time to append to file name string.

In [4]:
today = dt.datetime.today()
today= str(today)
today = today.replace(':', '-').replace('.', '').replace(' ', '_')
ic(today)
ic(type(today))
ic()

ic| today: '2023-07-25_13-11-04731013'
ic| type(today): <class 'str'>
ic| 3990198718.py:6 in <module> at 13:11:04.802


Establish full file name path.

In [5]:
# change `data_location` to the location of the folder on your machine.
data_location = 'data'
ref_docs_location = 'ref_docs'

file_in_name01 = 'data_preprocessed_wo_sw_2023-07-20_13-02-01408354.csv'

file_in_path01 = os.path.join(up1_dir, data_location, file_in_name01)

print(f'CSV file in 1 path: {file_in_path01}')

CSV file in 1 path: C:\Users\acarr\Documents\GitHub\599_team_project\data\data_preprocessed_wo_sw_2023-07-20_13-02-01408354.csv


### Review dataframe

Read in data from CSV, check resulting dataframe shape, and display first several records.

In [6]:
slct_tbl_full_df01 = pd.read_csv(file_in_path01)
print(f'Dataframe shape: {slct_tbl_full_df01.shape}')
display(slct_tbl_full_df01.head())

Dataframe shape: (36405, 11)


Unnamed: 0,text_id,source_name,author,title,url,publish_date,article_text,content,processed_text,processed_text_split,num_tokens
0,2,USA Today,"USA TODAY, Emily DeLetter, USA TODAY","Tito's launches 'Tito's in a Big Can,' an empt...",https://www.usatoday.com/story/money/food/2023...,2023-06-21T17:37:40Z,Have you ever wanted to own your very own keg ...,Have you ever wanted to own your very own keg ...,ever wanted keg titos handmade vodka dream bec...,"['ever', 'wanted', 'keg', 'titos', 'handmade',...",117
1,3,USA Today,"USA TODAY, Joy Ashford, USA TODAY",Search for missing actor Julian Sands continue...,https://www.usatoday.com/story/entertainment/c...,2023-06-20T17:36:09Z,Over five months after Julian Sands went missi...,Over five months after Julian Sandswent missin...,five months julian sands went missing solo hik...,"['five', 'months', 'julian', 'sands', 'went', ...",215
2,5,USA Today,Tar Heels Wire,Four star running back picks Michigan State ov...,https://tarheelswire.usatoday.com/2023/06/24/f...,2023-06-25T03:51:10Z,\n\n\n\n\n\n\n\n\n\n\n\n\n\nFour star running ...,Mack Brown and the UNC football program have b...,four star running back picks michigan state un...,"['four', 'star', 'running', 'back', 'picks', '...",161
3,6,USA Today,Roll Tide Wire,Alabama center Charles Bediako signs one-year ...,https://rolltidewire.usatoday.com/2023/06/23/a...,2023-06-23T21:29:24Z,\n\n\n\n\n\n\n\n\n\n\n\n\n\nAlabama center Cha...,Alabama center Charles Bediako was signed to a...,alabama center charles bediako signs one year ...,"['alabama', 'center', 'charles', 'bediako', 's...",237
4,7,USA Today,Celtics Wire,Ralph Sampson breaks down iconic Boston Celtic...,https://celticswire.usatoday.com/2023/06/23/nb...,2023-06-23T11:00:41Z,\n\n\n\n\n\n\n\n\n\n\n\n\n\nRalph Sampson brea...,It was one of the most memorable moments in NB...,ralph sampson breaks iconic boston houston roc...,"['ralph', 'sampson', 'breaks', 'iconic', 'bost...",134


## Initial (prior to preprocessing) Exploratory Data Analysis (EDA)

Perform only the EDA steps needed to preprocess the data, e.g., checking for missing values.

### Count missing `article_text` feature

The majority of null values appear in the `author` column. There are also two in `article_text`. The `author` feature will not be used for current modeling efforts, therefore it is not a factor. The instances with missing article text will be removed.

In [7]:
na_cnt = slct_tbl_full_df01.isnull().sum()
 
# Display missing values
print('Missing value counts:\n' + na_cnt.to_string())

Missing value counts:
text_id                   0
source_name               0
author                  209
title                     0
url                       0
publish_date              0
article_text              0
content                   0
processed_text            3
processed_text_split      0
num_tokens                0


### Count blank `article_text` feature

Show instances with blank or missing `article_text` values; *n* = 2.

In [8]:
ic(len(slct_tbl_full_df01[slct_tbl_full_df01['processed_text']==None]))
display(slct_tbl_full_df01[slct_tbl_full_df01['processed_text'].isna()].head(23))

ic| len(slct_tbl_full_df01[slct_tbl_full_df01['processed_text']==None]): 0


Unnamed: 0,text_id,source_name,author,title,url,publish_date,article_text,content,processed_text,processed_text_split,num_tokens
6140,7609,Forbes,"EY Contributor, EY, \n EY Contributor, EY\n ht...",5 Ways Consumer Companies Unlock Value From Su...,https://www.forbes.com/sites/ey/2023/06/22/5-w...,2023-06-22T17:08:16Z,\n\n\n\n\n\n\n\n,This publication contains information in summa...,,[],0
31421,33725,Forbes,"Fidelity Viewpoints Team, Fidelity, \n Fidelit...",Balancing Growth And Protection,https://www.forbes.com/sites/fidelity/2023/06/...,2023-06-09T16:24:08Z,\n\n\n\n\n\n,"Investing involves risk, including risk of los...",,[],0
34739,66724,The Washington Post,Philip Bump,The other revelation from Hunter Biden’s plea ...,https://www.washingtonpost.com/politics/2023/0...,2023-06-20T15:34:49Z,This article has been updated.,Comment on this story\r\nComment\r\nThose tuni...,,[],0


### Remove missing `article_text` row(s)

Use pandas' `dropna()` method to remove instances with null values; send results to a new dataframe. Print resulting df shape and head.

In [9]:
'''Drop missing citation:
https://pandas.pydata.org/pandas-docs/stable/reference
/api/pandas.DataFrame.dropna.html#pandas.DataFrame.dropna
'''
slct_tbl_full_df02 = slct_tbl_full_df01.dropna(subset=['processed_text'])
slct_tbl_full_df02 = slct_tbl_full_df02.reset_index(drop=True)

slct_tbl_full_df03 = slct_tbl_full_df02.copy()

print(f'Dataframe shape: {slct_tbl_full_df03.shape}')
display(slct_tbl_full_df03.head())

Dataframe shape: (36402, 11)


Unnamed: 0,text_id,source_name,author,title,url,publish_date,article_text,content,processed_text,processed_text_split,num_tokens
0,2,USA Today,"USA TODAY, Emily DeLetter, USA TODAY","Tito's launches 'Tito's in a Big Can,' an empt...",https://www.usatoday.com/story/money/food/2023...,2023-06-21T17:37:40Z,Have you ever wanted to own your very own keg ...,Have you ever wanted to own your very own keg ...,ever wanted keg titos handmade vodka dream bec...,"['ever', 'wanted', 'keg', 'titos', 'handmade',...",117
1,3,USA Today,"USA TODAY, Joy Ashford, USA TODAY",Search for missing actor Julian Sands continue...,https://www.usatoday.com/story/entertainment/c...,2023-06-20T17:36:09Z,Over five months after Julian Sands went missi...,Over five months after Julian Sandswent missin...,five months julian sands went missing solo hik...,"['five', 'months', 'julian', 'sands', 'went', ...",215
2,5,USA Today,Tar Heels Wire,Four star running back picks Michigan State ov...,https://tarheelswire.usatoday.com/2023/06/24/f...,2023-06-25T03:51:10Z,\n\n\n\n\n\n\n\n\n\n\n\n\n\nFour star running ...,Mack Brown and the UNC football program have b...,four star running back picks michigan state un...,"['four', 'star', 'running', 'back', 'picks', '...",161
3,6,USA Today,Roll Tide Wire,Alabama center Charles Bediako signs one-year ...,https://rolltidewire.usatoday.com/2023/06/23/a...,2023-06-23T21:29:24Z,\n\n\n\n\n\n\n\n\n\n\n\n\n\nAlabama center Cha...,Alabama center Charles Bediako was signed to a...,alabama center charles bediako signs one year ...,"['alabama', 'center', 'charles', 'bediako', 's...",237
4,7,USA Today,Celtics Wire,Ralph Sampson breaks down iconic Boston Celtic...,https://celticswire.usatoday.com/2023/06/23/nb...,2023-06-23T11:00:41Z,\n\n\n\n\n\n\n\n\n\n\n\n\n\nRalph Sampson brea...,It was one of the most memorable moments in NB...,ralph sampson breaks iconic boston houston roc...,"['ralph', 'sampson', 'breaks', 'iconic', 'bost...",134


### Display Source counts

Count the number of instances within each news source, e.g., ABC, CNN, USA Today.

In [10]:
source_cnt = slct_tbl_full_df03['source_name'].value_counts()
 
# Display missing values
print('Source counts:\n' + source_cnt.to_string())

source_cnt.to_csv(os.path.join(up1_dir,
                               ref_docs_location,
                               'source_dist.csv'))

Source counts:
Forbes                 7784
New York Post          6862
USA Today              6752
ABC News               4858
CNN                    1686
Business Insider       1563
Fox News               1379
CNBC                   1338
NBC News               1111
Breitbart News         1038
Buzzfeed                750
The Washington Post     373
PEOPLE                  245
MSNBC                   225
Vox                     213
Wired                   129
Reuters                  96


Set list of dataframe columns to export.

In [11]:
export_col_names_lst = ['processed_text',
                       ]

### Write file without stop words to CSV - data subset 1

Set path to write CSV file to.

In [12]:
file_out_name01 = f'data_preprocessed_wo_sw_X2_{today}.csv'
file_out_path01 = os.path.join(up1_dir, data_location, file_out_name01)
print(f'CSV file out 1 path: {file_out_path01}')

CSV file out 1 path: C:\Users\acarr\Documents\GitHub\599_team_project\data\data_preprocessed_wo_sw_X2_2023-07-25_13-11-04731013.csv


Write pandas dataframe to CSV; save locally.

In [13]:
len_half = round(len(export_col_names_lst)/2,0)
print(len_half)

if to_csv_flag == False:
    pass
else:
    slct_tbl_full_df03[export_col_names_lst][:len_half].to_csv(file_out_path01,
                                                    index=False)

0.0


TypeError: cannot do slice indexing on RangeIndex with these indexers [0.0] of type float

In [None]:
print(slct_tbl_full_df03.shape)

#### Display globally unique tokens

In [None]:
import ast
def uniq_tok(df_col=pd.DataFrame([''])):
    '''Display all unique tokens across all instances'''
    start_time = time.perf_counter()
    df_cols1 = pd.Series(df_col)
    #print(df_cols1)

    all_tokens_lst01 = []
    
    counter = 0
    #for f in df_cols1:
    #    if counter < 5:
            #print(ast.literal_eval(f))
            #print(type(ast.literal_eval(f)))
    #    counter += 1

    [all_tokens_lst01.append(ast.literal_eval(f)) for f in df_cols1]
    all_tokens_lst01 = list(itertools.chain\
                            .from_iterable(all_tokens_lst01))
    #print(all_tokens_lst01[:100])
    all_tokens_set01 = set(all_tokens_lst01)
    #print(all_tokens_set01)
    print(len(sorted(all_tokens_set01)))
    #print(sorted(all_tokens_set01))
    end_time = time.perf_counter()
    print(f'\nElapsed processing time = {end_time - start_time}')
    
    return sorted(all_tokens_set01)

df03_t = uniq_tok(df_col=slct_tbl_full_df03['processed_text_split'])

In [None]:
slct_tbl_full_df03['url'].iloc[36400]

In [None]:
slct_tbl_full_df03['url'].iloc[36401]

In [None]:
print(slct_tbl_full_df03.duplicated(subset=['processed_text'], keep='last').sum())
slct_tbl_full_df03a = slct_tbl_full_df03.loc[slct_tbl_full_df03.duplicated(subset=['processed_text'], keep='last') == True]
slct_tbl_full_df03a = slct_tbl_full_df03a.sort_values(by=['processed_text', 'url'])
#display(slct_tbl_full_df03a[['processed_text', 'url']].head())
for r in slct_tbl_full_df03a['url']:
    print(r)

In [None]:
ic(len(df03_t))
print(slct_tbl_full_df03['num_tokens'].sum())
#print(df03_t)

## Display runtime

In [None]:
global_end_time = time.perf_counter()

In [None]:
print(f'''\nElapsed processing time = {round((global_end_time 
- global_start_time)/60,2)} mins''')

## References