# Preprocessing data | recovery-news-data.csv

*CS 539 - Social Media Mining | Francesca Spezzano*

*Computer Science | Boise State University*

*11.22.2022 | Fall 2022*

*Aida Gomezbueno Berezo | aidagomezbuenobe@u.boisestate.edu*

Launching notebook with the following command:

*jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e100000000000*

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sklearn
import time
import datetime
from datetime import datetime
#from sklearn.linear_model import LogisticRegression
#from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing
import nltk
from nltk.corpus import names, stopwords, words
from nltk.stem import *
import num2words
import math

In [2]:
dat = pd.read_csv(r'recovery-news-data.csv')
df = pd.DataFrame(dat)
df.columns = ['index', 'news_id', 'url', 'publisher', 'publish_date', 'author', 'title', 'image', 'body_text', 'political_bias', 'country', 'reliability']
df = df.drop('index', axis=1)
df.head()

Unnamed: 0,news_id,url,publisher,publish_date,author,title,image,body_text,political_bias,country,reliability
0,0,https://www.nytimes.com/article/what-is-corona...,The New York Times,2020-01-21,"['Knvul Sheikh', 'Roni Caryn Rabin']",The Coronavirus: What Scientists Have Learned ...,https://static01.nyt.com/images/2020/03/12/sci...,\nA novel respiratory virus that originated in...,Left,USA,1
1,1,https://www.npr.org/2020/01/22/798392172/chine...,National Public Radio (NPR),2020-01-22,['Emily Feng'],Chinese Health Officials: More Die From Newly ...,https://media.npr.org/include/images/facebook-...,Chinese Health Officials: More Die From Newly ...,Center,USA,1
2,2,https://www.theverge.com/2020/1/23/21078457/co...,The Verge,2020-01-23,['Nicole Wetsman'],Everything you need to know about the coronavirus,https://cdn.vox-cdn.com/thumbor/a9_Oz7cvSBKyal...,Public health experts around the globe are scr...,Left-center,USA,1
3,3,https://www.worldhealth.net/news/novel-coronav...,WorldHealth.Net,2020-01-24,[],Novel Coronavirus Cases Confirmed To Be Spreading,https://www.worldhealth.net/media/original_ima...,The first two coronavirus cases in Europe have...,,USA,0
4,4,https://www.theverge.com/2020/1/24/21080845/co...,The Verge,2020-01-24,"['Nicole Wetsman', 'Zoe Schiffer', 'Jay Peters...",Coronavirus disrupts the world: updates on the...,https://cdn.vox-cdn.com/thumbor/t2gt1SmEni4Mcr...,"A new coronavirus appeared in Wuhan, China, at...",Left-center,USA,1


In [3]:
#CLEANING URLS
clean_subs = ['https://', 'http://', 'www.']
substring = "/"
urls=[]
for i in df['url']:
    for j in clean_subs:
        if j in i:
            i = i.split(j)
            i = i[1]
    if substring in i:
        i = i.split(substring)
        i = i[0]
    urls.append(i)
df['url']=urls

df['publish_date'] = df['publish_date'].fillna("1970-11-01")
epoch = datetime.utcfromtimestamp(0)
for i in df['publish_date']:
    date_object = datetime.strptime(str(i), '%Y-%m-%d').date()
    delta = date_object - epoch.date()
    dt = datetime.fromtimestamp(int(delta.total_seconds())).strftime('%Y-%m-%d')
    mth = datetime.fromtimestamp(int(delta.total_seconds())).strftime('%B')
    if mth=="October":
        mth = "UNKNOWN"
    df['publish_date'] = df['publish_date'].replace(i, mth)
df['author'] = df['author'].fillna("")
replace_simb = ['[', ']', "'", ' etc.', "‘"]
coma = ", "
array_col = []
for i in df['author']:
    temp = []
    for j in replace_simb:
        if j in i:
            i = i.replace(j, "")
            
df['political_bias'] = df['political_bias'].fillna("NEUTRAL")
df['image'] = df['image'].fillna("")
df['title'] = df['title'].fillna("")
df['country'] = df['country'].fillna("UNKNOWN")

df['alltext'] = df['publisher'] + " " + df['author'] + " " +df['political_bias'] + " " + df['title'] + " " + df['body_text'] 

df.head()

Unnamed: 0,news_id,url,publisher,publish_date,author,title,image,body_text,political_bias,country,reliability,alltext
0,0,nytimes.com,The New York Times,January,"['Knvul Sheikh', 'Roni Caryn Rabin']",The Coronavirus: What Scientists Have Learned ...,https://static01.nyt.com/images/2020/03/12/sci...,\nA novel respiratory virus that originated in...,Left,USA,1,"The New York Times ['Knvul Sheikh', 'Roni Cary..."
1,1,npr.org,National Public Radio (NPR),January,['Emily Feng'],Chinese Health Officials: More Die From Newly ...,https://media.npr.org/include/images/facebook-...,Chinese Health Officials: More Die From Newly ...,Center,USA,1,National Public Radio (NPR) ['Emily Feng'] Cen...
2,2,theverge.com,The Verge,January,['Nicole Wetsman'],Everything you need to know about the coronavirus,https://cdn.vox-cdn.com/thumbor/a9_Oz7cvSBKyal...,Public health experts around the globe are scr...,Left-center,USA,1,The Verge ['Nicole Wetsman'] Left-center Every...
3,3,worldhealth.net,WorldHealth.Net,January,[],Novel Coronavirus Cases Confirmed To Be Spreading,https://www.worldhealth.net/media/original_ima...,The first two coronavirus cases in Europe have...,NEUTRAL,USA,0,WorldHealth.Net [] NEUTRAL Novel Coronavirus C...
4,4,theverge.com,The Verge,January,"['Nicole Wetsman', 'Zoe Schiffer', 'Jay Peters...",Coronavirus disrupts the world: updates on the...,https://cdn.vox-cdn.com/thumbor/t2gt1SmEni4Mcr...,"A new coronavirus appeared in Wuhan, China, at...",Left-center,USA,1,"The Verge ['Nicole Wetsman', 'Zoe Schiffer', '..."


#### Preprocessing | Basic Steps

    1. Lowercase. 
    2. Stop words.
    3. Punctuation/Symbols.
    4. Apostophre.
    5. Single characters.
    6. Stemming.
    7. Lemmatisation.
    8. Converting Numbers.

In [4]:
def toLowerCase(df):
    df['alltext'] = df['alltext'].str.lower()
    return df

def delStopWords(df):
    stop_words = nltk.corpus.stopwords.words('english')
    valid_allt=[]
    '''all_text = []
    for i in df['alltext']:
        i = i.split()
        all_text.append(i)
    df['alltext']=all_text'''
    for i in df['alltext']:
        valid_t=[]
        for j in i:
            if j not in stop_words:
                valid_t.append(j)
        valid_allt.append(valid_t)
    df['alltext']=valid_allt
    return df

def toListOfLists(df):
    all_text = []
    for i in df['alltext']:
        i = i.split()
        all_text.append(i)
    df['alltext']=all_text
    return df  

def delSymbols(df):
    symbols = "!\"“‘’”#$%&()*+-./:;<,=>?@[\]^_`{|}~\n►●…"
    valid_allt=[]
    for i in df['alltext']:
        valid_t = []
        for j in i:
            for k in symbols:
                if k in j:
                    j = j.replace(k, "")
            valid_t.append(j)
        if "[]" in valid_t:
            valid_t.remove("[]")
        valid_allt.append(valid_t)
    df['alltext']=valid_allt
    return df

def delApostrophe(df):
    stop_words = nltk.corpus.stopwords.words('english')
    valid_alltext=[]
    for i in df['alltext']:
        valid_text=[]
        for j in i:
            if j not in stop_words:
                if "'" in j:
                    j=j.replace("'", "")
                valid_text.append(j)
        valid_alltext.append(valid_text)
    df['alltext']=valid_alltext
    return df

def delSingleChars(df):
    valid_allt=[]
    for i in df['alltext']:
        valid_t = []
        for j in i:
            if len(j)>1:
                valid_t.append(j)
        valid_allt.append(valid_t)
    df['alltext']=valid_allt
    return df

#NOMBRES PROPIOS CON PUNTOS.
#Get names w/ dots
def remainDomains(df):
    domain_dot=[]
    for i in df['publisher']:
        if "." in i:
            domain_dot.append(str(i).lower())
    domain_dot = np.unique(domain_dot)
    #Rename those who aren't w/ dot but has to be with dot
    valid_allt=[]
    for i in df['alltext']:
        valid_t=[]
        for j in i:
            for k in domain_dot:
                if "." in k:
                    k_wodot = k.replace(".", "")
                    if k_wodot in j:
                        j = j.replace(j, k)
            valid_t.append(j)
        valid_allt.append(valid_t)
    df['alltext']=valid_allt    
    return df

#THE ORDER IS LEMMATIZATION THEN STEMMING, OR JUST STEMMING.
def lemmatize(df):
    wnl = WordNetLemmatizer()
    valid_allt=[]
    for i in df['alltext']:
        valid_t=[]
        for j in i:
            j = wnl.lemmatize(j)
            valid_t.append(j)
        valid_allt.append(valid_t) 
    df['alltext']=valid_allt
    return df

#STEMMING
def stem(df):
    stemmer = PorterStemmer()
    valid_allt=[]
    for i in df['alltext']:
        valid_t=[]
        for j in i:
            j = stemmer.stem(j)
            valid_t.append(j)
        valid_allt.append(valid_t) 
    df['alltext']=valid_allt
    return df
    #df.to_csv('preprocessing-numbers.csv', index=False)
    
#converting numbers
def num_conversion(j):
    k, l = j.split()
    k = float(k)
    l = float(l)
    j = str(k*l)
    return j

def convertNumbers(df):
    valid_allt=[]
    for i in df['alltext']:
        valid_t = []
        for j in i:
            if j.isnumeric():
                #print(j)
                if "½" in j:
                    j = j.replace("½", " 0.5")
                    j = num_conversion(j)
                if "¼" in j:
                    #print(j)
                    j = j.replace("¼", " 0.25")
                    j = num_conversion(j)
                if "⅔" in j:
                    j = j.replace("⅔", " 0.67")
                    j = num_conversion(j)
                if "¾" in j:
                    j = j.replace("¾", " 0.75")
                    j = num_conversion(j)
                if "⅓" in j:
                    j = j.replace("⅓", " 0.33")
                    j = num_conversion(j)
                j = num2words.num2words(float(j))
            valid_t.append(j)          
        valid_allt.append(valid_t)
    df['alltext']=valid_allt
    return df
    #df.to_csv('preprocessing-no_numbers.csv', index=False)

In [5]:
def preprocess(df):
    df = toLowerCase(df)
    df = toListOfLists(df)
    df = delStopWords(df)
    df = delSymbols(df)
    df = delStopWords(df)
    df = delApostrophe(df)
    df = delSingleChars(df)
    df = lemmatize(df)
    df = stem(df)
    df = convertNumbers(df)
    df = delSymbols(df)
    df = remainDomains(df)

In [6]:
preprocess(df)
#df.head()

## TF-IDF Score

#### TF (Term Frequency). TF is individual to each document and word. Frequency of a term in relation to the doc that belongs to.

In [7]:
#TF(t,d) = count of t in d / number of words in d
#Array of dicts. Where each dict, Key: unique word in doc. Value: frequency of that word appearing in that document. 
tf_all = []
n_docs = len(df['alltext'])
#Dict. Key: news_id. Value: total number of words of that new.
n_words_per_doc = {}
w = 0
for i in df['alltext']:
    term_freq = {} #for each doc
    uniq_vals = pd.unique(i) #uniq vals for each doc
    n_words = len(i) # number of words for each doc
    n_words_per_doc[w] = n_words #dict/set of doc - n_words per doc
    w+=1
    n_uniq_vals = len(uniq_vals)
    for x in range(n_uniq_vals): #recorremos todos los valores unicos por doc
        n=0
        temp = uniq_vals[x]
        for j in i:
            if j==temp: 
                n+=1
        term_freq[temp]=n #frecuencia de términos por doc
    tf_all.append(term_freq) #array of dicts de term-termfreq
tf_per_doc = []
for x in range(n_docs): 
    tf = {}
    dict_ = tf_all[x] #dict de frequencies de doc
    nw = n_words_per_doc[x] #número de words de doc
    for y in dict_.keys(): #recorrer keys: unique vals. por cada uniq val in doc
        freq = dict_.get(y) #get freq
        #Calcular tf. freq/#words
        tf_ = freq/nw #normalizamos
        tf[y]=tf_ #agregamos a dict
    tf_per_doc.append(tf) #array de dicts de term-termfreq normalizado

#### DF (Document Frequency). Number of documents in which the word is present.

In [8]:
#df(t) = occurrence of t in N documents.
#Get all distinct words in all the docs.
words_ = []
n=0
for x in range(n_docs): 
    dict_ = tf_all[x] #por cada dict de frequencies de cada doc
    for y in dict_.keys(): #recorrer keys: unique vals.
        words_.append(y)
        n+=1
words = np.unique(words_) #get todas las palabras distintas que aparecen en todos los docs
nw = len(words) #numero de palabras distitnas en todos los docs
n_news = len(tf_all)
doc_freq={}
for x in range(nw):
    n=0
    temp = words[x]
    for y in range(n_news):
        dict_ = tf_all[y]
        vals = dict_.keys()
        if temp in vals:
            n+=1 #si palabra presente en cada dict de cada doc, sumamos más 1 (nos interesa presencia, no frecuencia)
            continue
    doc_freq[temp] = n

#### IDF (Inverse Document Frequency). Informativeness of term t.

In [9]:
#idf(t) = N/df. Df are the values for each key (unique words in all the docs). And N, the number of docs.
#idf(t) = log(N/(df + 1)).
idf={}
n_docs = len(df['alltext'])
for x in doc_freq.keys(): #recorremos dict de df que vamos a invertir
    df_=doc_freq.get(x)
    idf_=math.log(n_docs/(df_+1))
    idf[x]=idf_

#### TF-IDF (Term Frequency - Inverse Document Frequency).

In [10]:
#tf-idf(t, d) = tf(t, d) * log(N/(df + 1)).
tf_idf_allt = []
for x in range(len(tf_per_doc)):
    tf_idf={}
    dict_tf_doc = tf_per_doc[x]
    for y in dict_tf_doc.keys():
        tf_r = dict_tf_doc.get(y)
        idf_r = idf.get(y)
        tf_idf_ = tf_r*idf_r
        tf_idf[y]=tf_idf_
    tf_idf_allt.append(tf_idf) 
df['tf-idf']=tf_idf_allt

In [11]:
df.head()

Unnamed: 0,news_id,url,publisher,publish_date,author,title,image,body_text,political_bias,country,reliability,alltext,tf-idf
0,0,nytimes.com,The New York Times,January,"['Knvul Sheikh', 'Roni Caryn Rabin']",The Coronavirus: What Scientists Have Learned ...,https://static01.nyt.com/images/2020/03/12/sci...,\nA novel respiratory virus that originated in...,Left,USA,1,"[new, york, time, knvul, sheikh, roni, caryn, ...","{'new': 0.005144562828399132, 'york': 0.001377..."
1,1,npr.org,National Public Radio (NPR),January,['Emily Feng'],Chinese Health Officials: More Die From Newly ...,https://media.npr.org/include/images/facebook-...,Chinese Health Officials: More Die From Newly ...,Center,USA,1,"[nation, public, radio, npr, emili, feng, cent...","{'nation': 0.0034242938862532523, 'public': 0...."
2,2,theverge.com,The Verge,January,['Nicole Wetsman'],Everything you need to know about the coronavirus,https://cdn.vox-cdn.com/thumbor/a9_Oz7cvSBKyal...,Public health experts around the globe are scr...,Left-center,USA,1,"[verg, nicol, wetsman, leftcent, everyth, need...","{'verg': 0.0038149774052306145, 'nicol': 0.003..."
3,3,worldhealth.net,WorldHealth.Net,January,[],Novel Coronavirus Cases Confirmed To Be Spreading,https://www.worldhealth.net/media/original_ima...,The first two coronavirus cases in Europe have...,NEUTRAL,USA,0,"[worldhealth.net, neutral, novel, coronaviru, ...","{'worldhealth.net': 0.004356187002041755, 'neu..."
4,4,theverge.com,The Verge,January,"['Nicole Wetsman', 'Zoe Schiffer', 'Jay Peters...",Coronavirus disrupts the world: updates on the...,https://cdn.vox-cdn.com/thumbor/t2gt1SmEni4Mcr...,"A new coronavirus appeared in Wuhan, China, at...",Left-center,USA,1,"[verg, nicol, wetsman, zoe, schiffer, jay, pet...","{'verg': 0.01878670527971479, 'nicol': 0.03134..."
