# Preprocessing data | recovery-news-data.csv

*CS 539 - Social Media Mining | Francesca Spezzano*

*Computer Science | Boise State University*

*11.22.2022 | Fall 2022*

*Aida Gomezbueno Berezo | aidagomezbuenobe@u.boisestate.edu*

Launching notebook with the following command:

*jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e1000000*

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sklearn
import time
import datetime
from datetime import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer
import nltk
from nltk.corpus import names, stopwords, words
from nltk.stem import *
import num2words

In [2]:
data = pd.read_csv(r'recovery-news-data.csv')
df = pd.DataFrame(data)
df.columns = ['index', 'news_id', 'url', 'publisher', 'publish_date', 'author', 'title', 'image', 'body_text', 'political_bias', 'country', 'reliability']
df = df.drop('index', axis=1)
df.head()

Unnamed: 0,news_id,url,publisher,publish_date,author,title,image,body_text,political_bias,country,reliability
0,0,https://www.nytimes.com/article/what-is-corona...,The New York Times,2020-01-21,"['Knvul Sheikh', 'Roni Caryn Rabin']",The Coronavirus: What Scientists Have Learned ...,https://static01.nyt.com/images/2020/03/12/sci...,\nA novel respiratory virus that originated in...,Left,USA,1
1,1,https://www.npr.org/2020/01/22/798392172/chine...,National Public Radio (NPR),2020-01-22,['Emily Feng'],Chinese Health Officials: More Die From Newly ...,https://media.npr.org/include/images/facebook-...,Chinese Health Officials: More Die From Newly ...,Center,USA,1
2,2,https://www.theverge.com/2020/1/23/21078457/co...,The Verge,2020-01-23,['Nicole Wetsman'],Everything you need to know about the coronavirus,https://cdn.vox-cdn.com/thumbor/a9_Oz7cvSBKyal...,Public health experts around the globe are scr...,Left-center,USA,1
3,3,https://www.worldhealth.net/news/novel-coronav...,WorldHealth.Net,2020-01-24,[],Novel Coronavirus Cases Confirmed To Be Spreading,https://www.worldhealth.net/media/original_ima...,The first two coronavirus cases in Europe have...,,USA,0
4,4,https://www.theverge.com/2020/1/24/21080845/co...,The Verge,2020-01-24,"['Nicole Wetsman', 'Zoe Schiffer', 'Jay Peters...",Coronavirus disrupts the world: updates on the...,https://cdn.vox-cdn.com/thumbor/t2gt1SmEni4Mcr...,"A new coronavirus appeared in Wuhan, China, at...",Left-center,USA,1


In [3]:
#CLEANING URLS
clean_subs = ['https://', 'http://', 'www.']
substring = "/"
urls=[]
for i in df['url']:
    #print(i)
    for j in clean_subs:
        if j in i:
            i = i.split(j)
            i = i[1]
    if substring in i:
        i = i.split(substring)
        i = i[0]
    urls.append(i)
df['url']=urls

df['publish_date'] = df['publish_date'].fillna("1970-11-01")
epoch = datetime.utcfromtimestamp(0)
for i in df['publish_date']:
    date_object = datetime.strptime(str(i), '%Y-%m-%d').date()
    delta = date_object - epoch.date()
    dt = datetime.fromtimestamp(int(delta.total_seconds())).strftime('%Y-%m-%d')
    mth = datetime.fromtimestamp(int(delta.total_seconds())).strftime('%B')
    if mth=="October":
        mth = "UNKNOWN"
    df['publish_date'] = df['publish_date'].replace(i, mth)
df['author'] = df['author'].fillna("")
replace_simb = ['[', ']', "'", ' etc.', "‘"]
coma = ", "
array_col = []
for i in df['author']:
    temp = []
    for j in replace_simb:
        if j in i:
            i = i.replace(j, "")
            
df['political_bias'] = df['political_bias'].fillna("NEUTRAL")
df['image'] = df['image'].fillna("")
df['title'] = df['title'].fillna("")
df['country'] = df['country'].fillna("UNKNOWN")

df['alltext'] = df['publisher'] + " " + df['author'] + " " +df['political_bias'] + " " + df['title'] + " " + df['body_text'] 

df.head()

Unnamed: 0,news_id,url,publisher,publish_date,author,title,image,body_text,political_bias,country,reliability,alltext
0,0,nytimes.com,The New York Times,January,"['Knvul Sheikh', 'Roni Caryn Rabin']",The Coronavirus: What Scientists Have Learned ...,https://static01.nyt.com/images/2020/03/12/sci...,\nA novel respiratory virus that originated in...,Left,USA,1,"The New York Times ['Knvul Sheikh', 'Roni Cary..."
1,1,npr.org,National Public Radio (NPR),January,['Emily Feng'],Chinese Health Officials: More Die From Newly ...,https://media.npr.org/include/images/facebook-...,Chinese Health Officials: More Die From Newly ...,Center,USA,1,National Public Radio (NPR) ['Emily Feng'] Cen...
2,2,theverge.com,The Verge,January,['Nicole Wetsman'],Everything you need to know about the coronavirus,https://cdn.vox-cdn.com/thumbor/a9_Oz7cvSBKyal...,Public health experts around the globe are scr...,Left-center,USA,1,The Verge ['Nicole Wetsman'] Left-center Every...
3,3,worldhealth.net,WorldHealth.Net,January,[],Novel Coronavirus Cases Confirmed To Be Spreading,https://www.worldhealth.net/media/original_ima...,The first two coronavirus cases in Europe have...,NEUTRAL,USA,0,WorldHealth.Net [] NEUTRAL Novel Coronavirus C...
4,4,theverge.com,The Verge,January,"['Nicole Wetsman', 'Zoe Schiffer', 'Jay Peters...",Coronavirus disrupts the world: updates on the...,https://cdn.vox-cdn.com/thumbor/t2gt1SmEni4Mcr...,"A new coronavirus appeared in Wuhan, China, at...",Left-center,USA,1,"The Verge ['Nicole Wetsman', 'Zoe Schiffer', '..."


#### Basic Steps

    1. Lowercase. 
    2. Stop words.
    3. Punctuation.
    4. Apostophre.
    5. Single characters.

In [4]:
df['alltext'] = df['alltext'].str.lower()

In [5]:
stop_words = nltk.corpus.stopwords.words('english')
all_text = []
valid_alltext=[]
for i in df['alltext']:
    i = i.split()
    all_text.append(i)
df['alltext']=all_text
for i in df['alltext']:
    valid_text=[]
    for j in i:
        if j not in stop_words:
            valid_text.append(j)
    valid_alltext.append(valid_text)
df['alltext']=valid_alltext

In [6]:
symbols = "!\"“#$%&()*+-./:;<,=>?@[\]^_`{|}~\n"
valid_allt=[]
for i in df['alltext']:
    valid_t = []
    for j in i:
        for k in symbols:
            if k in j:
                j = j.replace(k, "")
        valid_t.append(j)
    if [] in valid_t:
        valid_t.remove([])
    valid_allt.append(valid_t)
df['alltext']=valid_allt

In [7]:
valid_alltext=[]
for i in df['alltext']:
    valid_text=[]
    for j in i:
        if j not in stop_words:
            if "'" in j:
                j=j.replace("'", "")
            valid_text.append(j)
    valid_alltext.append(valid_text)
df['alltext']=valid_alltext

In [8]:
valid_allt=[]
for i in df['alltext']:
    valid_t = []
    for j in i:
        if len(j)>1:
            valid_t.append(j)
    valid_allt.append(valid_t)
df['alltext']=valid_allt

In [9]:
#NOMBRES PROPIOS CON PUNTOS.
#Get names w/ dots
domain_dot=[]
for i in df['publisher']:
    if "." in i:
        domain_dot.append(str(i).lower())
domain_dot = np.unique(domain_dot)
#Rename those who aren't w/ dot but has to be with dot
valid_allt=[]
for i in df['alltext']:
    valid_t=[]
    for j in i:
        for k in domain_dot:
            if "." in k:
                k_wodot = k.replace(".", "")
                if k_wodot in j:
                    j = j.replace(j, k)
        valid_t.append(j)
    valid_allt.append(valid_t)
df['alltext']=valid_allt    

In [10]:
#THE ORDER IS LEMMATIZATION THEN STEMMING, OR JUST STEMMING.
wnl = WordNetLemmatizer()
valid_allt=[]
for i in df['alltext']:
    valid_t=[]
    for j in i:
        j = wnl.lemmatize(j)
        valid_t.append(j)
    valid_allt.append(valid_t) 
df['alltext']=valid_allt

In [11]:
#STEMMING
stemmer = PorterStemmer()
valid_allt=[]
for i in df['alltext']:
    valid_t=[]
    for j in i:
        j = stemmer.stem(j)
        valid_t.append(j)
    valid_allt.append(valid_t) 
df['alltext']=valid_allt
df.to_csv('preprocessing-numbers.csv', index=False)

In [12]:
#converting numbers
def num_conversion(j):
    k, l = j.split()
    k = float(k)
    l = float(l)
    j = str(k*l)
    return j

valid_allt=[]
for i in df['alltext']:
    valid_t = []
    for j in i:
        if j.isnumeric():
            print(j)
            if "½" in j:
                j = j.replace("½", " 0.5")
                j = num_conversion(j)
            if "¼" in j:
                print(j)
                j = j.replace("¼", " 0.25")
                j = num_conversion(j)
            if "⅔" in j:
                j = j.replace("⅔", " 0.67")
                j = num_conversion(j)
            if "¾" in j:
                j = j.replace("¾", " 0.75")
                j = num_conversion(j)
            if "⅓" in j:
                j = j.replace("⅓", " 0.33")
                j = num_conversion(j)
            j = num2words.num2words(float(j))
        valid_t.append(j)          
    valid_allt.append(valid_t)
df['alltext']=valid_allt
df.to_csv('preprocessing-no_numbers.csv', index=False)

20000
2003
8098
774
2012
14
200
2003
20
2015
200
440
15
12
60
450
1700
40
2003
800
2020
2019
2002
2012
96
160
214
200000
17
80
80
15
14
15
14
80
13
23
14
25
18
830
26
8420
60
35
2002
8000
774
20
35
2002
2012
25
25
106
24
81
22
2700
40
22
11
17
50
800
1500
1000
2020
18221895
78
70
23
2020
2003
400000
7080
10
1030
24
65
12
2020
8000
1307
1307
1307
1307
1307
8000
2020
100
300
100
50000
1916
57
2003
7000
600
80
14
70
2020
2003
50
1979
1982
14
999
1987
2000
2002
2001
2012
1994
1999
2007
006012
2017
2015
2009
2017
35
2019
25000
20
24
35
154024
24589
24589
1546
154023
20979
920
34
402
402
35000
700
800
12
630
32000
2020
2003
20
36
72
10000
40000
20
400
78
72
5050
380
500000
100
45
65
15
90
100
10
11
10
12
30
13
60
12
15
17
2020
20192020
6666
5012020
5142020
59
6666
100
6666
6666
6666
100
100
2020
6666
6666
100
100300000
100
15
80
15
23
2020
14300
2020
250
17
5000
10004000
78000
60
63
73
615
52
710
615
52
597
35
40
98
615
28
38
13
29
76
13
43
28
16
17
38
45
14
26
54
28
12
58
90
28
19
31
1017
2

2019
2019
2900
200
28
2900
200
28
20
2000
21000
30
1932
1972
81
22
22
18
2019
2020
1989
2014
2017
16
2019
16
1998
2019
2020
100000
10000
2020
2018
2018
100
50
19
2018
15
22
55
40000
40000
30000
5008
2015
16000
25
576
750
2019
24
82000
2016
2000
20
6000
280
2024
20
17
95
03
212
47
165
60
20
159
21
35
81
53
20126425767
2014497100
2020605864877
20081912126
201519513721376
201741127184
201321
2013415
20046203208
10
200717512901297
11
2020117995998
12
13
1989170556560
78
10
100000
2021
20
20
20
14
2020
1970
1955
2020
2020
2020
1990
57
70
1991
2017
1970
19
49
20
20
15
20
95
12
10
13
11
74
56
12
90
80
15
84000
2017
2020
82000
80000
16
93
66
10
1957
14
95
11
2018
2020
59
11
2000
11
2001
70
1979
26
41
2003
2005
2006
66
58
1993
70
1973
2001
2001
2007
2008
93
20
21
17
12
84
1992
2000
2000
46
28
16
28
2020
60
1994
15
91
66
1982
1999
13
67
62
20
1994
2008
28
63
53
20
60
27
26
32
24
2014
17
86
15
2014
135
175
18
62
19
30
19
2020
13
58
2016
20
90
14
18
15
16
1957
36
13
77
19
2020
43
30
88
88
2020
50


In [13]:
df

Unnamed: 0,news_id,url,publisher,publish_date,author,title,image,body_text,political_bias,country,reliability,alltext
0,0,nytimes.com,The New York Times,January,"['Knvul Sheikh', 'Roni Caryn Rabin']",The Coronavirus: What Scientists Have Learned ...,https://static01.nyt.com/images/2020/03/12/sci...,\nA novel respiratory virus that originated in...,Left,USA,1,"[new, york, time, knvul, sheikh, roni, caryn, ..."
1,1,npr.org,National Public Radio (NPR),January,['Emily Feng'],Chinese Health Officials: More Die From Newly ...,https://media.npr.org/include/images/facebook-...,Chinese Health Officials: More Die From Newly ...,Center,USA,1,"[nation, public, radio, npr, emili, feng, cent..."
2,2,theverge.com,The Verge,January,['Nicole Wetsman'],Everything you need to know about the coronavirus,https://cdn.vox-cdn.com/thumbor/a9_Oz7cvSBKyal...,Public health experts around the globe are scr...,Left-center,USA,1,"[verg, nicol, wetsman, leftcent, everyth, need..."
3,3,worldhealth.net,WorldHealth.Net,January,[],Novel Coronavirus Cases Confirmed To Be Spreading,https://www.worldhealth.net/media/original_ima...,The first two coronavirus cases in Europe have...,NEUTRAL,USA,0,"[worldhealth.net, neutral, novel, coronaviru, ..."
4,4,theverge.com,The Verge,January,"['Nicole Wetsman', 'Zoe Schiffer', 'Jay Peters...",Coronavirus disrupts the world: updates on the...,https://cdn.vox-cdn.com/thumbor/t2gt1SmEni4Mcr...,"A new coronavirus appeared in Wuhan, China, at...",Left-center,USA,1,"[verg, nicol, wetsman, zoe, schiffer, jay, pet..."
...,...,...,...,...,...,...,...,...,...,...,...,...
2024,2024,msn.com,Drudge Report,UNKNOWN,"['Alex Wigglesworth', 'Luke Money', 'Noah Bier...",White House concerned with coronavirus spread ...,https://img-s-msn-com.akamaized.net/tenant/amp...,© Robert Gauthier/Los Angeles Times/TNS People...,Right,USA,0,"[drudg, report, alex, wigglesworth, luke, mone..."
2025,2025,vanityfair.com,Drudge Report,UNKNOWN,['Tom Kludt'],“There Will Be No Election If Things Keep Goin...,https://media.vanityfair.com/photos/5ec6bdc5ca...,"Michael Moore, for decades on the political fr...",Right,USA,0,"[drudg, report, tom, kludt, right, elect, thin..."
2026,2026,msn.com,Drudge Report,UNKNOWN,['Jonathan Lai'],Want to know who won the presidential race on ...,https://img-s-msn-com.akamaized.net/tenant/amp...,© HEATHER KHALIFA/The Philadelphia Inquirer/TN...,Right,USA,0,"[drudg, report, jonathan, lai, right, want, kn..."
2027,2027,fox5ny.com,Drudge Report,UNKNOWN,['Kelly Taylor Hayes'],Nearly half of Twitter accounts discussing cor...,https://images.foxtv.com/static.fox5ny.com/www...,Nearly half of the Twitter accounts sharing in...,Right,USA,0,"[drudg, report, kelli, taylor, hay, right, nea..."
