# Preprocessing data | recovery-news-data.csv

*CS 539 - Social Media Mining | Francesca Spezzano*

*Computer Science | Boise State University*

*11.22.2022 | Fall 2022*

*Aida Gomezbueno Berezo | aidagomezbuenobe@u.boisestate.edu*

Launching notebook with the following command:

*jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e1000000*

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sklearn
import time
import datetime
from datetime import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer
import nltk
from nltk.corpus import names, stopwords, words
from nltk.stem import *
import num2words

In [2]:
data = pd.read_csv(r'recovery-news-data.csv')
df = pd.DataFrame(data)
df.columns = ['index', 'news_id', 'url', 'publisher', 'publish_date', 'author', 'title', 'image', 'body_text', 'political_bias', 'country', 'reliability']
df = df.drop('index', axis=1)
df.head()

Unnamed: 0,news_id,url,publisher,publish_date,author,title,image,body_text,political_bias,country,reliability
0,0,https://www.nytimes.com/article/what-is-corona...,The New York Times,2020-01-21,"['Knvul Sheikh', 'Roni Caryn Rabin']",The Coronavirus: What Scientists Have Learned ...,https://static01.nyt.com/images/2020/03/12/sci...,\nA novel respiratory virus that originated in...,Left,USA,1
1,1,https://www.npr.org/2020/01/22/798392172/chine...,National Public Radio (NPR),2020-01-22,['Emily Feng'],Chinese Health Officials: More Die From Newly ...,https://media.npr.org/include/images/facebook-...,Chinese Health Officials: More Die From Newly ...,Center,USA,1
2,2,https://www.theverge.com/2020/1/23/21078457/co...,The Verge,2020-01-23,['Nicole Wetsman'],Everything you need to know about the coronavirus,https://cdn.vox-cdn.com/thumbor/a9_Oz7cvSBKyal...,Public health experts around the globe are scr...,Left-center,USA,1
3,3,https://www.worldhealth.net/news/novel-coronav...,WorldHealth.Net,2020-01-24,[],Novel Coronavirus Cases Confirmed To Be Spreading,https://www.worldhealth.net/media/original_ima...,The first two coronavirus cases in Europe have...,,USA,0
4,4,https://www.theverge.com/2020/1/24/21080845/co...,The Verge,2020-01-24,"['Nicole Wetsman', 'Zoe Schiffer', 'Jay Peters...",Coronavirus disrupts the world: updates on the...,https://cdn.vox-cdn.com/thumbor/t2gt1SmEni4Mcr...,"A new coronavirus appeared in Wuhan, China, at...",Left-center,USA,1


In [3]:
#CLEANING URLS
clean_subs = ['https://', 'http://', 'www.']
substring = "/"
urls=[]
for i in df['url']:
    #print(i)
    for j in clean_subs:
        if j in i:
            i = i.split(j)
            i = i[1]
    if substring in i:
        i = i.split(substring)
        i = i[0]
    urls.append(i)
df['url']=urls

df['publish_date'] = df['publish_date'].fillna("1970-11-01")
epoch = datetime.utcfromtimestamp(0)
for i in df['publish_date']:
    date_object = datetime.strptime(str(i), '%Y-%m-%d').date()
    delta = date_object - epoch.date()
    dt = datetime.fromtimestamp(int(delta.total_seconds())).strftime('%Y-%m-%d')
    mth = datetime.fromtimestamp(int(delta.total_seconds())).strftime('%B')
    if mth=="October":
        mth = "UNKNOWN"
    df['publish_date'] = df['publish_date'].replace(i, mth)
df['author'] = df['author'].fillna("")
replace_simb = ['[', ']', "'", ' etc.', "‘"]
coma = ", "
array_col = []
for i in df['author']:
    temp = []
    for j in replace_simb:
        if j in i:
            i = i.replace(j, "")
            
df['political_bias'] = df['political_bias'].fillna("NEUTRAL")
df['image'] = df['image'].fillna("")
df['title'] = df['title'].fillna("")
df['country'] = df['country'].fillna("UNKNOWN")

df['alltext'] = df['publisher'] + " " + df['author'] + " " +df['political_bias'] + " " + df['title'] + " " + df['body_text'] 

df.head()

Unnamed: 0,news_id,url,publisher,publish_date,author,title,image,body_text,political_bias,country,reliability,alltext
0,0,nytimes.com,The New York Times,January,"['Knvul Sheikh', 'Roni Caryn Rabin']",The Coronavirus: What Scientists Have Learned ...,https://static01.nyt.com/images/2020/03/12/sci...,\nA novel respiratory virus that originated in...,Left,USA,1,"The New York Times ['Knvul Sheikh', 'Roni Cary..."
1,1,npr.org,National Public Radio (NPR),January,['Emily Feng'],Chinese Health Officials: More Die From Newly ...,https://media.npr.org/include/images/facebook-...,Chinese Health Officials: More Die From Newly ...,Center,USA,1,National Public Radio (NPR) ['Emily Feng'] Cen...
2,2,theverge.com,The Verge,January,['Nicole Wetsman'],Everything you need to know about the coronavirus,https://cdn.vox-cdn.com/thumbor/a9_Oz7cvSBKyal...,Public health experts around the globe are scr...,Left-center,USA,1,The Verge ['Nicole Wetsman'] Left-center Every...
3,3,worldhealth.net,WorldHealth.Net,January,[],Novel Coronavirus Cases Confirmed To Be Spreading,https://www.worldhealth.net/media/original_ima...,The first two coronavirus cases in Europe have...,NEUTRAL,USA,0,WorldHealth.Net [] NEUTRAL Novel Coronavirus C...
4,4,theverge.com,The Verge,January,"['Nicole Wetsman', 'Zoe Schiffer', 'Jay Peters...",Coronavirus disrupts the world: updates on the...,https://cdn.vox-cdn.com/thumbor/t2gt1SmEni4Mcr...,"A new coronavirus appeared in Wuhan, China, at...",Left-center,USA,1,"The Verge ['Nicole Wetsman', 'Zoe Schiffer', '..."


#### Basic Steps

    1. Lowercase. 
    2. Stop words.
    3. Punctuation.
    4. Apostophre.
    5. Single characters.

In [4]:
df['alltext'] = df['alltext'].str.lower()

In [5]:
stop_words = nltk.corpus.stopwords.words('english')
all_text = []
valid_alltext=[]
for i in df['alltext']:
    i = i.split()
    all_text.append(i)
df['alltext']=all_text
for i in df['alltext']:
    valid_text=[]
    for j in i:
        if j not in stop_words:
            valid_text.append(j)
    valid_alltext.append(valid_text)
df['alltext']=valid_alltext

In [6]:
symbols = "!\"“#$%&()*+-./:;<,=>?@[\]^_`{|}~\n"
valid_allt=[]
for i in df['alltext']:
    valid_t = []
    for j in i:
        for k in symbols:
            if k in j:
                j = j.replace(k, "")
        valid_t.append(j)
    if [] in valid_t:
        valid_t.remove([])
    valid_allt.append(valid_t)
df['alltext']=valid_allt

In [7]:
valid_alltext=[]
for i in df['alltext']:
    valid_text=[]
    for j in i:
        if j not in stop_words:
            if "'" in j:
                j=j.replace("'", "")
            valid_text.append(j)
    valid_alltext.append(valid_text)
df['alltext']=valid_alltext

In [8]:
valid_allt=[]
for i in df['alltext']:
    valid_t = []
    for j in i:
        if len(j)>1:
            valid_t.append(j)
    valid_allt.append(valid_t)
df['alltext']=valid_allt

In [9]:
#NOMBRES PROPIOS CON PUNTOS.
#Get names w/ dots
domain_dot=[]
for i in df['publisher']:
    if "." in i:
        domain_dot.append(str(i).lower())
domain_dot = np.unique(domain_dot)
#Rename those who aren't w/ dot but has to be with dot
valid_allt=[]
for i in df['alltext']:
    valid_t=[]
    for j in i:
        for k in domain_dot:
            if "." in k:
                k_wodot = k.replace(".", "")
                if k_wodot in j:
                    j = j.replace(j, k)
        valid_t.append(j)
    valid_allt.append(valid_t)
df['alltext']=valid_allt    

In [10]:
#THE ORDER IS LEMMATIZATION THEN STEMMING, OR JUST STEMMING.
wnl = WordNetLemmatizer()
valid_allt=[]
for i in df['alltext']:
    valid_t=[]
    for j in i:
        j = wnl.lemmatize(j)
        valid_t.append(j)
    valid_allt.append(valid_t) 
df['alltext']=valid_allt

In [11]:
#STEMMING
stemmer = PorterStemmer()
valid_allt=[]
for i in df['alltext']:
    valid_t=[]
    for j in i:
        j = stemmer.stem(j)
        valid_t.append(j)
    valid_allt.append(valid_t) 
df['alltext']=valid_allt
df.to_csv('preprocessing-numbers.csv', index=False)

In [44]:
#converting numbers
def num_conversion(j):
    k, l = j.split()
    k = float(k)
    l = float(l)
    j = str(k*l)
    return j

valid_allt=[]
for i in df['alltext']:
    valid_t = []
    for j in i:
        if j.isnumeric():
            print(j)
            if "½" in j:
                j = j.replace("½", " 0.5")
                j = num_conversion(j)
            if "¼" in j:
                print(j)
                j = j.replace("¼", " 0.25")
                j = num_conversion(j)
            if "⅔" in j:
                j = j.replace("⅔", " 0.67")
                j = num_conversion(j)
            if "¾" in j:
                j = j.replace("¾", " 0.75")
                j = num_conversion(j)
            if "⅓" in j:
                j = j.replace("⅓", " 0.33")
                j = num_conversion(j)
            j = num2words.num2words(float(j))
        valid_t.append(j)          
    valid_allt.append(valid_t)
df['alltext']=valid_allt
df.to_csv('preprocessing-no_numbers.csv', index=False)

In [None]:
df