# 509 Final Project

## Globally import libraries

In [None]:
from bs4 import BeautifulSoup
from collections import defaultdict, Counter
import datetime as dt
import emoji
import itertools
import json
import logging
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pymysql as mysql
import random
import re
import regex as rex
import requests
import shutil
from string import punctuation
import time
from tqdm import tqdm
import zipfile

import nltk
from nltk.corpus import stopwords
import spacy

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

import textacy.preprocessing as tprep

# Set pandas global options
pd.options.display.max_rows = 17

%matplotlib inline

## Upload data from CSV

In [None]:
# Directory-navigation citation:
# https://softhints.com/python-change-directory-parent/
# Record the starting directory, then move the working directory up one level.
# NOTE: re-running this cell climbs one more level each time.
curr_dir = os.path.abspath(os.curdir)
print(curr_dir)
os.chdir(os.pardir)
up1_dir = os.path.abspath(os.curdir)
print(up1_dir)

In [None]:
# Folder (under the parent directory) that holds the input data;
# change `data_location` to the location of the folder on your machine.
data_location = 'data'
file_in_name01 = 'master.csv'

# Full path of the CSV: <parent dir>/<data folder>/<file name>
path_parts01 = (up1_dir, data_location, file_in_name01)
file_in_path01 = os.path.join(*path_parts01)

print(f'CSV file 1 in path: {file_in_path01}')

### Review dataframe

In [None]:
# Load the master CSV, then report its shape and preview the first rows
slct_tbl_full_df01 = pd.read_csv(filepath_or_buffer=file_in_path01)
print(f'Dataframe shape: {slct_tbl_full_df01.shape}')
display(slct_tbl_full_df01.head(n=5))

## Exploratory Data Analysis (EDA)

### Count missing `article_text` feature

In [None]:
# Per-column count of missing (NaN) values; `article_text` is one of the
# entries in the resulting Series.
count_nan = slct_tbl_full_df01.isna().sum()

# Print the whole per-column summary
print(f'Number of NaN values present: {count_nan}')

### Count blank `article_text` feature

In [None]:
# Rows whose `article_text` is an empty string (as opposed to NaN)
blank_text_rows = slct_tbl_full_df01[slct_tbl_full_df01['article_text'] == '']
print(len(blank_text_rows))
display(blank_text_rows.head(20))

### Remove missing `article_text` rows

In [None]:
# Drop-missing citation:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html#pandas.DataFrame.dropna
# Keep only the rows that have an `article_text` value. The explicit copy makes
# the result an independent frame, so adding columns to it afterwards does not
# raise SettingWithCopyWarning.
slct_tbl_full_df02 = slct_tbl_full_df01.dropna(subset=['article_text']).copy()
print(f'Dataframe shape: {slct_tbl_full_df02.shape}')
display(slct_tbl_full_df02.head())

In [None]:
# Register `progress_apply` on pandas objects (progress bar 50 columns wide);
# it can then be used wherever `apply` would be.
tqdm.pandas(ncols=50)

# Raw text character and word counts
raw_text = slct_tbl_full_df02['article_text']
slct_tbl_full_df02['char_cnt'] = raw_text.progress_apply(len)
slct_tbl_full_df02['word_cnt'] = raw_text.progress_apply(lambda txt: len(txt.split()))
display(slct_tbl_full_df02.head())

### Descriptive statistics

In [None]:
# Describe the four text columns (`include="O"` selects object dtype),
# transposed so that each feature is shown as one row.
slct_tbl_full_df02[['source_name',
                    'author',
                    'publish_date',
                    'article_text']].describe(include="O").T

In [None]:
# Default `describe()` summary of the frame, transposed (one feature per row)
slct_tbl_full_df02.describe().T

In [None]:
# Number of rows for each distinct `source_name`
slct_tbl_full_df02['source_name'].value_counts()

### Examine inclusion of "centrist" sources indicated by `author` feature

In [None]:
# Rows whose `author` is one of the three names under review
wire_authors = ['msn', 'Associated Press', 'Reuters']
is_wire_author = slct_tbl_full_df02['author'].isin(wire_authors)
slct_tbl_full_df02a = slct_tbl_full_df02[is_wire_author]

# The 'msn' rows on their own
display(slct_tbl_full_df02a.loc[slct_tbl_full_df02a['author'] == 'msn'])

# Non-null counts per (source, author) pair
display(slct_tbl_full_df02a.groupby(['source_name', 'author']).count())

In [None]:
# Tally how many rows carry each `author` value
counter = Counter(slct_tbl_full_df02['author'])

word_cutoff = 5

# Keep the author values that occur more than `word_cutoff` times
con_feature_words = {word for word, count in counter.items() if count > word_cutoff}

print(f"With a word cutoff of {word_cutoff}, we have {len(con_feature_words)} words as features in the model.")
print(con_feature_words)

In [None]:
# Drop the rows authored by the three excluded names and renumber the rows
# (the previous index is kept as an `index` column by `reset_index()`).
excluded_authors = ['msn', 'Associated Press', 'Reuters']
keep_rows = ~slct_tbl_full_df02['author'].isin(excluded_authors)
slct_tbl_full_df03 = slct_tbl_full_df02[keep_rows].reset_index()

# Label everything 'right' first ...
slct_tbl_full_df03['political_lean'] = 'right'
print(slct_tbl_full_df03.shape)
display(slct_tbl_full_df03.head())

# ... then relabel the two sources treated as 'left'
is_left_source = slct_tbl_full_df03['source_name'].isin(['The Washington Post', 'CNN'])
slct_tbl_full_df03.loc[is_left_source, 'political_lean'] = 'left'

display(slct_tbl_full_df03.head())

display(slct_tbl_full_df03['political_lean'].value_counts())

In [None]:
# Bar chart of the number of rows in each `political_lean` class
slct_tbl_full_df03['political_lean'].value_counts().plot(kind="bar",
                                                         legend=True,
                                                         figsize=(5,6),
                                                         title='Class distribution')

### Plot word counts

In [None]:
# Semi-transparent, density-normalised `word_cnt` histograms, one per
# `source_name`; the x-axis is limited to 0-6000 words.
slct_tbl_full_df03.groupby('source_name')['word_cnt'].plot(kind="hist",
                                                           density=True,
                                                           alpha=0.5,
                                                           legend=True,
                                                           figsize=(15,9),
                                                           title='Histogram of Word Count Frequencies',
                                                           xlim=(0,6000))

## Data preprocessing

In [None]:
def uniq_tok(df_col=None):
    """Print the number of distinct tokens in *df_col*, then the sorted tokens.

    Args:
        df_col: Iterable of token sequences, e.g. a DataFrame column whose
            cells are lists of tokens. It is wrapped in ``pd.Series`` first.

    Returns:
        None; the count and the sorted token list are printed.
    """
    token_lists = pd.Series(df_col)

    # Flatten every cell's tokens directly into a set, then sort once and
    # reuse the sorted list for both the count and the listing.
    unique_tokens = sorted(set(itertools.chain.from_iterable(token_lists)))
    print(len(unique_tokens))
    print(unique_tokens)

In [None]:
# Independent copy taken before any preprocessing columns are added to
# `slct_tbl_full_df03`.
slct_tbl_full_df04 = slct_tbl_full_df03.copy()

### Case folding

In [None]:
# Lower-cased version of each article's text
slct_tbl_full_df03['lower'] = slct_tbl_full_df03['article_text'].map(str.lower)

print(slct_tbl_full_df03.shape)
display(slct_tbl_full_df03.head())

### Text normalization

In [None]:
def normalize(text):
    """Run *text* through four textacy clean-up steps and return the result.

    The steps run in this order: hyphenated-word normalization, quotation-mark
    normalization, unicode normalization, accent removal.
    """
    steps = (
        tprep.normalize.hyphenated_words,
        tprep.normalize.quotation_marks,
        tprep.normalize.unicode,
        tprep.remove.accents,
    )
    for step in steps:
        text = step(text)
    return text

slct_tbl_full_df03['norm'] = slct_tbl_full_df03['lower'].apply(normalize)

print(slct_tbl_full_df03.shape)
display(slct_tbl_full_df03.head())

# Preview the rows labelled 0-4. Only the expected failures are skipped (row
# label absent, or text the console cannot encode); anything else, including
# Ctrl-C, is no longer silently swallowed.
for c in range(0,5):
    try:
        print(slct_tbl_full_df03['norm'][c], '\n')
    except (KeyError, UnicodeEncodeError):
        print(f'Skip {c}')

### Remove special characters

In [None]:
# Complex citation (add lambda):
# https://chat.openai.com/share/a135754c-c38c-47ea-8f83-54d41d5397ab
# Compiled pattern for literal "\u20xx" escapes; the active code in this cell
# does not use it.
rex_u_code = rex.compile(r'\\u20[\w]{2}')


def _blank_out_specials(text):
    """Replace each listed HTML entity / literal escape / invisible char with a space."""
    for target in ('&nbsp;', r'\n', '\u2063', '\u2066', '\u2069', '\u200b', '\u200d'):
        text = text.replace(target, ' ')
    return text


slct_tbl_full_df03['replace'] = slct_tbl_full_df03['norm'].apply(_blank_out_specials)

### RegEx find

In [None]:
# Matches an http(s) URL: the scheme, one or more slashes, then everything up
# to (not including) the next whitespace or the end of the text. The pattern no
# longer demands a trailing whitespace character, so a URL that ends the text
# is matched as well. It uses no `regex`-only features, so stdlib `re` is used.
rex_url_c = re.compile(r'http[s]?:[\/]+[\S]*')
# re.sub lambda citation:
# https://chat.openai.com/share/402ec66e-2802-4cda-af8c-6f9f5b097d85


def rex_url(text):
    """Return *text* with one space added before and after every URL.

    This separates a URL from any text glued to its front, so that a later
    whitespace split yields the URL as its own token.
    """
    return rex_url_c.sub(lambda match: ' ' + match.group(0) + ' ', text)
    
# URL-spaced version of the special-character-free text
slct_tbl_full_df03['rex_urls'] = slct_tbl_full_df03['replace'].map(rex_url)

print(slct_tbl_full_df03.shape)
display(slct_tbl_full_df03.head())

### Split emojis

In [None]:
def emoji_split(text):
    """Return *text* with every character that `emoji.is_emoji` accepts wrapped in spaces."""
    pieces = []
    for ch in text:
        pieces.append(f' {ch} ' if emoji.is_emoji(ch) else ch)
    return "".join(pieces)

#lambda x: x.replace(x, ' ' + x + ' ') if emoji.is_emoji(x) else x

slct_tbl_full_df03['emoji_split'] = slct_tbl_full_df03['rex_urls'].apply(emoji_split)

print(slct_tbl_full_df03.shape)
display(slct_tbl_full_df03.head())

# Preview the rows labelled 0-4. Only the expected failures are skipped (row
# label absent, or text the console cannot encode); anything else, including
# Ctrl-C, is no longer silently swallowed.
for c in range(0,5):
    try:
        print(slct_tbl_full_df03['emoji_split'][c], '\n')
    except (KeyError, UnicodeEncodeError):
        print(f'Skip {c}')

#### Display globally unique emojis

# No `emoji_lst` column is created anywhere in this notebook, so reading it
# raised KeyError. Build it here: for each article, the list of characters that
# `emoji.is_emoji` accepts.
slct_tbl_full_df03['emoji_lst'] = slct_tbl_full_df03['emoji_split'].apply(
    lambda text: [ch for ch in text if emoji.is_emoji(ch)])
uniq_tok(df_col=slct_tbl_full_df03['emoji_lst'])

### Lemmatization using spaCy

# spaCy pipeline (`en_core_web_sm`) used to produce the lemmas
nlp_trans01 = spacy.load('en_core_web_sm')


def lemma(text):
    """Return the `lemma_` of every token spaCy finds in *text*, in order."""
    doc = nlp_trans01(text)
    lemmas = [token.lemma_ for token in doc]
    return lemmas

slct_tbl_full_df03['lemma'] = slct_tbl_full_df03['replace'].progress_apply(lemma)

print(slct_tbl_full_df03.shape)
display(slct_tbl_full_df03.head())

# Preview the rows labelled 0-4. Only the expected failures are skipped (row
# label absent, or text the console cannot encode); anything else, including
# Ctrl-C, is no longer silently swallowed.
for c in range(0,5):
    try:
        print(slct_tbl_full_df03['lemma'][c], '\n')
    except (KeyError, UnicodeEncodeError):
        print(f'Skip {c}')

#### Display globally unique tokens on lemmas

# Print the count and sorted list of distinct lemmas across all rows
uniq_tok(df_col=slct_tbl_full_df03['lemma'])

### Split

In [None]:
slct_tbl_full_df03['split'] = slct_tbl_full_df03['emoji_split'].apply(str.split)

print(slct_tbl_full_df03.shape)
display(slct_tbl_full_df03.head())

# Preview the rows labelled 0-99. Only the expected failures are skipped (row
# label absent, or text the console cannot encode); anything else, including
# Ctrl-C, is no longer silently swallowed.
for c in range(0,100):
    try:
        print(slct_tbl_full_df03['split'][c], '\n')
    except (KeyError, UnicodeEncodeError):
        print(f'Skip {c}')

#### Display globally unique tokens on first split

In [None]:
# Print the count and sorted list of distinct whitespace-split tokens
uniq_tok(df_col=slct_tbl_full_df03['split'])

### Remove stop words

In [None]:
# Start from NLTK's English stop words; the list is extended just below
sw = stopwords.words("english")

# Add additional stop words
sw.extend(['',
           '️',
           'arent',
           'cannot',
           'cant',
           'couldnt',
           'couldve',
           'didnt',
           'doesnt',
           'dont',
           'hadnt',
           'hasnt',
           'havent',
           'hes',
           'im',
           "i'm",
           'isnt',
           'it’s',
           'ive',
           '𝚘𝚏',
           'mightnt',
           'mustnt',
           'neednt',
           'shant',
           'shes',
           'shouldnt',
           'shouldve',
           'thatll',
           'theyll',
           'theyve',
           'wasnt',
           'werent',
           'whats',
           'weve',
           'wont',
           'wouldnt',
           'wouldve',
           'yall',
           'youd',
           'youll',
           'youre',
           'youve',
           "we'll",
           "you’re",
           "you’ve",
           "you’ll",
           "you’d",
           "she’s",
           "it’s",
           "that’ll",
           "don’t",
           "should’ve",
           "aren’t",
           "couldn’t",
           "didn’t",
           "doesn’t",
           "hadn’t",
           "hasn’t",
           "haven’t",
           "isn’t",
           "mightn’t",
           "mustn’t",
           "needn’t",
           "shan’t",
           "shouldn’t",
           "wasn’t",
           "weren’t",
           "won’t",
           "wouldn’t",
           "i’m",
           "we’ll",
           'said',
           'told',
           'according',
           'fox',
           'news',
           'cnn',
           'breitbart',
           'reuters'])

print(sw)

def sw_remover(tokens):
    """Return *tokens* without stop words.

    A token is dropped when its lower-cased form appears in the module-level
    stop-word list `sw`; the surviving tokens keep their original casing and
    order.
    """
    # Hash the stop words once per call instead of scanning the `sw` list for
    # every token. Building the set inside the function means later edits to
    # `sw` are still picked up.
    stop_set = set(sw)
    return [t for t in tokens if t.lower() not in stop_set]

# Token lists with the stop words filtered out
slct_tbl_full_df03['no_sw'] = slct_tbl_full_df03['split'].map(sw_remover)

print(slct_tbl_full_df03.shape)
display(slct_tbl_full_df03.head())

# Show the rows labelled 0 and 1
for row_label in (0, 1):
    print(slct_tbl_full_df03['no_sw'][row_label])

#### Display no stop words

In [None]:
# Print the count and sorted list of distinct tokens left after stop-word removal
uniq_tok(df_col=slct_tbl_full_df03['no_sw'])

### Rejoin semi-processed tokens

In [None]:
# Re-assemble each stop-word-free token list into one space-separated string
slct_tbl_full_df03['no_sw_join'] = slct_tbl_full_df03['no_sw'].map(" ".join)

print(slct_tbl_full_df03.shape)
display(slct_tbl_full_df03.head())

# Show the rows labelled 0 and 1
for row_label in (0, 1):
    print(slct_tbl_full_df03['no_sw_join'][row_label])

### Remove punctuation

In [None]:
punctuation = set(punctuation) # speeds up comparison
print(punctuation)

# Keep "#" so hashtag info can be captured later
tw_punct = punctuation - {"#"}
print(tw_punct)
# Likewise keep "@", and leave "-" and "/" in the text
# (presumably for hyphenated words and URLs -- TODO confirm)
tw_punct -= {"@", "-", "/"}
# Extra single-character marks to strip. The former "..." entry is gone: the
# set is compared against one character at a time, so a three-character entry
# could never match, and "." is already part of `string.punctuation`.
tw_punct |= {"’", "‘", "”", "“", "…", "—", "€", "±", "£", "¡", "§", "⦿"}

print(tw_punct)

In [None]:
def remove_punctuation(text, punct_set=punctuation):
    """Return *text* with every character found in *punct_set* removed.

    *punct_set* defaults to the module-level `punctuation` collection as it
    was when this function was defined.
    """
    kept_chars = (ch for ch in text if ch not in punct_set)
    return "".join(kept_chars)

slct_tbl_full_df03['no_sw_join_no_punc'] = slct_tbl_full_df03['no_sw_join'].apply(remove_punctuation, punct_set=tw_punct)

print(slct_tbl_full_df03.shape)
display(slct_tbl_full_df03.head())

# Preview the rows labelled 0-9. Only the expected failures are reported (row
# label absent, or text the console cannot encode); anything else, including
# Ctrl-C, is no longer silently swallowed.
for c in range(0,10):
    try:
        print(slct_tbl_full_df03['no_sw_join_no_punc'][c], '\n')
    except (KeyError, UnicodeEncodeError):
        print(f'\nerror on {c}\n')

### Tokenize

In [None]:
# Final whitespace tokenization of the punctuation-free text
slct_tbl_full_df03['no_sw_join_no_punc_tok'] = slct_tbl_full_df03['no_sw_join_no_punc'].map(str.split)

print(slct_tbl_full_df03.shape)
display(slct_tbl_full_df03.head())

# Show the rows labelled 0 and 1, each followed by a blank line
for row_label in (0, 1):
    print(slct_tbl_full_df03['no_sw_join_no_punc_tok'][row_label], '\n')

#### Display globally unique tokens on final tokens

In [None]:
# Print the count and sorted list of distinct tokens in the final token column
uniq_tok(df_col=slct_tbl_full_df03['no_sw_join_no_punc_tok'])

### Pipeline consolidation

In [None]:
def prepare(text, pipeline):
    """Run a pipeline of text processing transformers.

    Args:
        text: Input value; it is converted with ``str()`` before the first
            step runs.
        pipeline: Iterable of single-entry dicts ``{callable: argument}``.
            When the argument is ``None`` the callable is invoked as
            ``callable(value)``, otherwise as ``callable(value, argument)``.
            Only the first entry of each dict is used.

    Returns:
        The output of the last transformer, or ``str(text)`` when the
        pipeline is empty.
    """
    tokens = str(text)

    for transformer in pipeline:
        # Pull the callable (key) and its extra argument (value) from the step
        trans, args = list(transformer.items())[0]
        # Identity test: `args == None` is evaluated element-wise for
        # array-like arguments, and the result cannot be used in `if`.
        if args is None:
            tokens = trans(tokens)
        else:
            tokens = trans(tokens, args)

    return tokens

# Transformer pipeline 1:
# case folding, normalization (using textacy), special-character removal,
# URL spacing, emoji spacing, split on whitespace, stop word removal, rejoin,
# custom punctuation removal, tokenize
transformers01 = [{str.lower: None},
                  {normalize: None},
                  # r'\n' (one backslash + n) is the same target as the step-by-step
                  # "Remove special characters" cell. The previous r'\\n' only matched a
                  # doubled backslash, so a literal "\n" survived and, once the backslash
                  # was stripped as punctuation, glued the neighbouring words together.
                  {(lambda x: x.replace('&nbsp;', ' ').replace(r'\n', ' ').replace('\u2063', ' ').replace('\u2066', ' ').replace('\u2069', ' ').replace('\u200b', ' ').replace('\u200d', ' ')): None},
                  {rex_url: None},
                  {emoji_split: None},
                  {str.split: None},
                  {sw_remover: None},
                  {" ".join: None},
                  {remove_punctuation: tw_punct},
                  {str.split: None},
                  #{" ".join: None},
                 ]

# Apply transformers to pandas dataframe, w/ new col containing tokens
slct_tbl_full_df04['processed_text'] = slct_tbl_full_df04['article_text'].progress_apply(prepare,
                                                                                 pipeline=transformers01)
slct_tbl_full_df04['num_tokens'] = slct_tbl_full_df04['processed_text'].map(len)

display(slct_tbl_full_df04.head())

# Preview the processed tokens of the rows labelled 0-4. Only the expected
# failures are skipped (row label absent, or text the console cannot encode);
# anything else, including Ctrl-C, is no longer silently swallowed.
for c in range(0,5):
    try:
        print(slct_tbl_full_df04['processed_text'][c], '\n')
    except (KeyError, UnicodeEncodeError):
        print(f'Skip {c}')

### Calculate concentration ratio of each set of corpora

In [None]:
display(slct_tbl_full_df04['political_lean'].value_counts())

# Rows labelled 'left'
is_left = slct_tbl_full_df04['political_lean'] == 'left'
slct_tbl_full_df04_left = slct_tbl_full_df04[is_left]

print(slct_tbl_full_df04_left.shape)

# Rows labelled 'right'
is_right = slct_tbl_full_df04['political_lean'] == 'right'
slct_tbl_full_df04_right = slct_tbl_full_df04[is_right]

print(slct_tbl_full_df04_right.shape)

# Flatten each side's per-row token lists into one long token list
left_token_lists = slct_tbl_full_df04_left['processed_text']
slct_tbl_full_df04_left_s1 = list(itertools.chain.from_iterable(left_token_lists))
print(slct_tbl_full_df04_left_s1[:10])
right_token_lists = slct_tbl_full_df04_right['processed_text']
slct_tbl_full_df04_right_s1 = list(itertools.chain.from_iterable(right_token_lists))
print(slct_tbl_full_df04_right_s1[:10])

In [None]:

def concen_ratio(artist_lst=None,
                 lsts=None):
    """Display the tokens most concentrated in one corpus relative to another.

    Each corpus is a flat list of tokens. Tokens are counted with
    ``CountVectorizer``; tokens that occur fewer than 5 times in either corpus
    are dropped, and every remaining count is divided by its corpus's total
    token count to give a concentration. The ratio of the two concentrations
    is ranked in both directions and the top 10 of each ranking are displayed.

    Args:
        artist_lst: Two headings, printed above the corpus-1/corpus-2 and the
            corpus-2/corpus-1 rankings respectively.
        lsts: Exactly two token lists (corpus 1, corpus 2).

    Returns:
        None; all results are printed or displayed.
    """
    # Fresh lists per call instead of shared mutable defaults
    artist_lst = [] if artist_lst is None else artist_lst
    lsts = [] if lsts is None else lsts

    # One whitespace-joined document per corpus
    lyr_corp_lst = []
    for l in lsts:
        print(type(l))
        lyr_corp_lst.append(' '.join(l))
    print(len(lyr_corp_lst))

    # Treat every run of non-whitespace characters as one token
    cv = CountVectorizer(input='content',
                         encoding='utf-8',
                         stop_words=None,
                         token_pattern=r'\S+'
                        )

    cv.fit(lyr_corp_lst)
    feature_names = cv.get_feature_names_out()

    # Peek at up to 15 random vocabulary entries (fewer if the vocabulary is smaller)
    print(pd.Series(feature_names).sample(min(15, len(feature_names))))

    lyr_tokens_sm = cv.transform(lyr_corp_lst)
    display(lyr_tokens_sm)

    # Rows = corpora, columns = tokens, cells = raw counts
    df = pd.DataFrame(lyr_tokens_sm.toarray(),
                      columns=feature_names)

    # Total token count per corpus, taken before any filtering
    df02 = df.copy()
    df02['r_sum'] = df02.sum(axis=1)

    # Filter by frequency for all columns citation:
    # OpenAI. (2021). ChatGPT [Computer software]. https://openai.com/;
    # https://pandas.pydata.org/pandas-docs/stable/reference/api/
    # pandas.DataFrame.ge.html
    # Keep only the tokens that occur at least 5 times in every corpus
    condition = df.ge(5).all()
    columns = condition[condition].index.tolist()
    columns.append('r_sum')

    df03 = df02[columns].copy()
    display(df03)

    # Filter by frequency for all columns & add summary row citation:
    # OpenAI. (2021). ChatGPT [Computer software]. https://openai.com/
    # Concentration = count / corpus total (the last column, `r_sum`)
    df04 = df03.div(df03.iloc[:, -1], axis=0)

    # Ratio rows: corpus 1 over corpus 2, and corpus 2 over corpus 1
    new_row01 = df04.iloc[0] / df04.iloc[1]
    new_row02 = df04.iloc[1] / df04.iloc[0]

    # Append the new rows with `pd.concat`; `DataFrame.append` no longer
    # exists in pandas 2.x.
    df04 = pd.concat([df04,
                      new_row01.to_frame().T,
                      new_row02.to_frame().T],
                     ignore_index=True)
    display(df04)

    # Transpose so that each token becomes a row
    df05 = df04.T
    df05 = df05.reset_index()
    df05.columns = ['token',
                    'c1_concen',
                    'c2_concen',
                    'c1c2_concen_ratio',
                    'c2c1_concen_ratio']

    # Sort values citation:
    # https://pandas.pydata.org/pandas-docs/stable/reference/api
    # /pandas.DataFrame.sort_values.html
    print(artist_lst[0])
    display(df05[['token',
                  'c1c2_concen_ratio']].sort_values(by='c1c2_concen_ratio',
                                                    ascending=False).head(10))
    print(artist_lst[1])
    display(df05[['token',
                  'c2c1_concen_ratio']].sort_values(by='c2c1_concen_ratio',
                                                    ascending=False).head(10))

# Compare the two flattened token lists: corpus 1 = left, corpus 2 = right.
# The two strings are the headings printed above each ranking.
concen_ratio(artist_lst=['Left-Right Concentration Ratio',
                         'Right-Left Concentration Ratio'],
             lsts=[slct_tbl_full_df04_left_s1,
                   slct_tbl_full_df04_right_s1])

#### Display globally unique tokens on final tokens

In [None]:
# Print the count and sorted list of distinct tokens produced by the pipeline
uniq_tok(df_col=slct_tbl_full_df04["processed_text"])

### 