In [13]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from urllib.error import HTTPError
from urllib.error import URLError
from http.client import IncompleteRead
from urllib.request import urlopen
from bs4 import BeautifulSoup
import warnings 
import contractions
import nltk
import re
import spacy

In [390]:
# Load the labelled article list. read_excel already returns a DataFrame,
# so it does not need to be wrapped in another DataFrame constructor.
df = pd.read_excel("NewsData.xlsx")

df

Unnamed: 0,Source,URL,Bias
0,AlterNet,https://www.alternet.org/2018/09/michael-moore...,-28.75
1,AlterNet,https://www.alternet.org/2019/02/conservative-...,-20.00
2,AlterNet,https://www.alternet.org/2019/03/mueller-just-...,-18.67
3,AlterNet,https://www.alternet.org/2019/03/trump-has-sol...,-26.25
4,AlterNet,https://www.alternet.org/2019/03/this-needs-to...,-27.00
...,...,...,...
1896,Daily Caller,https://dailycaller.com/2020/01/20/african-ame...,23.13
1897,Daily Caller,https://dailycaller.com/2019/04/17/mitch-mccon...,23.33
1898,Daily Caller,https://dailycaller.com/2019/03/19/guilfoyle-t...,25.00
1899,Daily Caller,https://dailycaller.com/2019/04/16/gop-senator...,25.33


### Get Text From URLs

In [391]:
def get_title_and_body(URL, timeout=30):
    """Download a web page and return its title followed by its paragraph text.

    Args:
        URL: Address of the page to fetch.
        timeout: Seconds to wait on the connection before giving up, so that
            one unresponsive host cannot stall a whole scraping run.

    Returns:
        str: The ``<title>`` text and the text of every ``<p>`` element, in
        document order, separated by single spaces.

    Raises:
        urllib.error.HTTPError, urllib.error.URLError: The request failed.
            Low-level socket errors (read timeout, connection reset) are
            re-raised as ``URLError`` so one handler covers all of them.
        http.client.IncompleteRead: The connection dropped mid-transfer.
        AttributeError: The page has no ``<title>`` element.
    """
    try:
        # The context manager closes the connection once the payload is read.
        with urlopen(URL, timeout=timeout) as response:
            raw_html = response.read()
    except URLError:
        # Already the documented failure type (HTTPError is a subclass).
        raise
    except OSError as err:
        raise URLError(err) from err

    # Hand over raw bytes so BeautifulSoup works out the page encoding itself
    # instead of failing on pages that are not valid UTF-8.
    htmlParse = BeautifulSoup(raw_html, 'html.parser')
    # Get Title (find() returns None when there is no <title>, hence the
    # AttributeError documented above).
    title = htmlParse.find("title").get_text()
    # Get Body
    paragraphs = [p.get_text() for p in htmlParse.find_all("p")]
    # Join with spaces so the last word of one paragraph is not fused with the
    # first word of the next, nor the title with the first paragraph.
    return " ".join([title] + paragraphs)

def get_all_text(df):
    """Scrape the page behind every URL in ``df`` into a ``cleaned_text`` column.

    The URL is read from the second column (position 1) of each row. Rows
    whose page cannot be fetched or parsed get ``None`` so that one bad link
    does not abort the whole run.

    Args:
        df: DataFrame whose second column holds the article URLs.

    Returns:
        pandas.DataFrame: The same ``df`` object, with the ``cleaned_text``
        column added (or overwritten) in place.
    """
    # Failures that mean "this article is unavailable", not "the run is broken":
    #   HTTPError / URLError / IncompleteRead - request or transfer failed
    #   AttributeError - page without a <title> element
    #   ValueError     - malformed URL or undecodable page (UnicodeDecodeError)
    #   OSError        - timeouts and connection resets
    skippable = (HTTPError, URLError, IncompleteRead,
                 AttributeError, ValueError, OSError)

    # Collect into a plain list: appending is O(1), whereas np.append copies
    # the whole (fixed-width string) array on every iteration.
    contents = []
    for url in tqdm(df.iloc[:, 1], total=len(df)):
        try:
            contents.append(get_title_and_body(url))
        except skippable:
            contents.append(None)

    # A list is assigned by position, so the texts line up with their rows
    # even when df does not carry a default 0..n-1 index.
    df['cleaned_text'] = contents
    return df

# Scrape every article (this is the download step, not the cleaning step).
# get_all_text adds the 'cleaned_text' column to df in place and returns that
# same object, so `news` and `df` refer to one DataFrame; rows whose page
# could not be fetched hold None.
news = get_all_text(df)
print(len(news))

  0%|          | 0/1901 [00:00<?, ?it/s]

1901


In [406]:
# Rename the scraped column, drop articles that could not be downloaded and
# renumber the rows. Written as one chain without inplace=True so the frame
# produced by the scraping cell is not modified behind the reader's back.
news = (
    news
    .rename(columns={"cleaned_text": "Text"})
    .dropna(subset=["Text"])
    .reset_index(drop=True)
)
news

Unnamed: 0,Source,URL,Bias,Text
0,AlterNet,https://www.alternet.org/2018/09/michael-moore...,-28.75,michael moore detail fear trump may last presi...
1,AlterNet,https://www.alternet.org/2019/02/conservative-...,-20.00,conservative apologize ignore christian rightw...
2,AlterNet,https://www.alternet.org/2019/03/mueller-just-...,-18.67,mueller drop hint team could work something bi...
3,AlterNet,https://www.alternet.org/2019/03/trump-has-sol...,-26.25,trump sell farmer vote now race toward calamit...
4,AlterNet,https://www.alternet.org/2019/03/this-needs-to...,-27.00,need expose top intel democrat reveal kremlin ...
...,...,...,...,...
1893,Daily Caller,https://dailycaller.com/2020/01/20/african-ame...,23.13,"‘I’m Governor Ralph Northam, And I Am In Black..."
1894,Daily Caller,https://dailycaller.com/2019/04/17/mitch-mccon...,23.33,"Mitch McConnell Trolls Democrats, Merrick Garl..."
1895,Daily Caller,https://dailycaller.com/2019/03/19/guilfoyle-t...,25.00,GUILFOYLE: President Trump — The Only One In W...
1896,Daily Caller,https://dailycaller.com/2019/04/16/gop-senator...,25.33,GOP Senators Really Want To See A ‘Highly Clas...


### Clean Text

In [407]:
# Fix contractions
def fix_contractions(text):
    """Expand English contractions in ``text``.

    Args:
        text: A string, or an iterable of string pieces.

    Returns:
        str: For a string, the result of ``contractions.fix`` on the whole
        string. For any other iterable, each piece is fixed individually and
        the results are concatenated with no separator.
    """
    # A string has to be passed whole: iterating over it yields single
    # characters, in which no contraction can ever be recognised.
    if isinstance(text, str):
        return contractions.fix(text)
    return "".join(contractions.fix(piece) for piece in text)


# Lemmatize words
#spacy.cli.download('en_core_web_sm')
# Loaded spaCy pipelines keyed by model name, so a model is read from disk
# once per session instead of once per lemmatize() call.
_SPACY_PIPELINES = {}


def _get_pipeline(model_name='en_core_web_sm'):
    """Return the cached spaCy pipeline for ``model_name``, loading it on first use."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name, disable=['parser', 'ner'])
    return _SPACY_PIPELINES[model_name]


def lemmatize(words):
    """Replace every token of ``words`` with its lemma.

    Args:
        words: Text to lemmatize, as a single string.

    Returns:
        str: The lemmas of all tokens spaCy finds in ``words``, joined by
        single spaces.
    """
    doc = _get_pipeline()(words)
    return ' '.join(token.lemma_ for token in doc)

# Remove stop words
#nltk.download('all', halt_on_error=False)
# Words on NLTK's English stop-word list that are deliberately kept in the text.
_KEPT_WORDS = frozenset({'no', 'not', 'nor', 'against', 'now'})

# Filled on first use; see _get_stopwords().
_STOPWORDS = None


def _get_stopwords():
    """Return the stop-word set (NLTK English list minus ``_KEPT_WORDS``), built once."""
    global _STOPWORDS
    if _STOPWORDS is None:
        # Set difference also tolerates a kept word being absent from the
        # corpus, where list.remove() would raise ValueError.
        _STOPWORDS = frozenset(nltk.corpus.stopwords.words('english')) - _KEPT_WORDS
    return _STOPWORDS


def remove_stopwords(words):
    """Drop English stop words from a whitespace-separated string.

    The comparison is case-sensitive and uses NLTK's English stop-word list,
    except that 'no', 'not', 'nor', 'against' and 'now' are always kept.

    Args:
        words: Text whose tokens are separated by whitespace.

    Returns:
        str: The remaining tokens, in order, joined by single spaces.
    """
    stopwords = _get_stopwords()
    return " ".join(w for w in words.split() if w not in stopwords)

# Matches every character that is neither a word character nor whitespace,
# except an apostrophe sitting between two lowercase letters (kept so that
# contractions can still be expanded afterwards).
_PUNCTUATION = re.compile(r"(?!(?<=[a-z])'[a-z])[^\w\s]")


def _clean_text(text):
    """Normalise, de-punctuate, expand, filter and lemmatize one article."""
    if not isinstance(text, str):
        # Leave missing values (None / NaN) untouched.
        return text
    # Non-breaking spaces become ordinary spaces; typographic apostrophes
    # become straight ones so the pattern above can recognise them.
    text = text.replace('\xa0', " ").replace('\u2019', "'")
    # Lower-case BEFORE stripping punctuation: the pattern only protects
    # apostrophes that follow a lowercase letter, so "I'm" would otherwise
    # lose its apostrophe and never be expanded.
    text = text.lower()
    text = _PUNCTUATION.sub('', text)
    text = " ".join(text.split())

    text = contractions.fix(text)
    text = remove_stopwords(text)
    return lemmatize(text)


def clean(df):
    """Clean the ``Text`` column of ``df`` for downstream text analysis.

    Each text is lower-cased, stripped of punctuation, whitespace-normalised,
    has its contractions expanded, its stop words removed and its tokens
    lemmatized.

    Args:
        df: DataFrame with a ``Text`` column of strings.

    Returns:
        pandas.DataFrame: The same ``df`` object with ``Text`` replaced in
        place. The column is only replaced after every row has been
        processed, so an error part-way through leaves ``df`` untouched.
    """
    cleaned = [_clean_text(text) for text in tqdm(df["Text"], total=len(df))]
    # Assign the whole column at once: chained indexing (df["Text"][i] = ...)
    # triggers SettingWithCopyWarning and, with pandas Copy-on-Write, does
    # not modify df at all.
    df["Text"] = cleaned
    return df
    
# NOTE(review): this silences every warning from here on, not only the ones
# raised while cleaning.
warnings.filterwarnings('ignore')

# clean() returns the DataFrame it was given, so `copy` is a second name for
# `news` rather than an independent copy.
copy = clean(news)

copy

  0%|          | 0/1898 [00:00<?, ?it/s]

Unnamed: 0,Source,URL,Bias,Text
0,AlterNet,https://www.alternet.org/2018/09/michael-moore...,-28.75,michael moore detail fear trump may last presi...
1,AlterNet,https://www.alternet.org/2019/02/conservative-...,-20.00,conservative apologize ignore christian rightw...
2,AlterNet,https://www.alternet.org/2019/03/mueller-just-...,-18.67,mueller drop hint team could work something bi...
3,AlterNet,https://www.alternet.org/2019/03/trump-has-sol...,-26.25,trump sell farmer vote now race toward calamit...
4,AlterNet,https://www.alternet.org/2019/03/this-needs-to...,-27.00,need expose top intel democrat reveal kremlin ...
...,...,...,...,...
1893,Daily Caller,https://dailycaller.com/2020/01/20/african-ame...,23.13,governor ralph northam blackface today african...
1894,Daily Caller,https://dailycaller.com/2019/04/17/mitch-mccon...,23.33,mitch mcconnell trolls democrats merrick garla...
1895,Daily Caller,https://dailycaller.com/2019/03/19/guilfoyle-t...,25.00,guilfoyle president trump one washington take ...
1896,Daily Caller,https://dailycaller.com/2019/04/16/gop-senator...,25.33,gop senator really want see highly classify fb...


In [408]:
# Persist the cleaned articles. The index is written too (to_csv's default),
# so it appears as an unnamed first column in the file.
copy.to_csv('CleanedNewsData.csv') 