# One Hot Encoding | recovery-news-data.csv

*CS 539 - Social Media Mining | Francesca Spezzano*

*Computer Science | Boise State University*

*11.21.2022 | Fall 2022*

*Aida Gomezbueno Berezo | aidagomezbuenobe@u.boisestate.edu*

In [13]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import time
import datetime
from datetime import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
#from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer

#### Load & prepare dataset

In [14]:
# Read the news CSV from the working directory.
data = pd.read_csv(r'recovery-news-data.csv')
df = pd.DataFrame(data)

# Overwrite the file's header with the names used throughout the notebook
# (raises if the CSV does not have exactly these 12 columns).
df.columns = [
    'index', 'news_ID', 'url', 'publisher', 'publish_date', 'author',
    'title', 'image', 'body_text', 'political_bias', 'country', 'reliability',
]

# Candidate predictor columns.
feature_cols = [
    'url', 'publisher', 'publish_date', 'author',
    'title', 'body_text', 'political_bias', 'country',
]
# Target column: reliability, encoded as 1 - real, 0 - fake.
label = ['reliability']

X, Y = df[feature_cols], df[label]
df.head()

Unnamed: 0,index,news_ID,url,publisher,publish_date,author,title,image,body_text,political_bias,country,reliability
0,0,0,https://www.nytimes.com/article/what-is-corona...,The New York Times,2020-01-21,"['Knvul Sheikh', 'Roni Caryn Rabin']",The Coronavirus: What Scientists Have Learned ...,https://static01.nyt.com/images/2020/03/12/sci...,\nA novel respiratory virus that originated in...,Left,USA,1
1,1,1,https://www.npr.org/2020/01/22/798392172/chine...,National Public Radio (NPR),2020-01-22,['Emily Feng'],Chinese Health Officials: More Die From Newly ...,https://media.npr.org/include/images/facebook-...,Chinese Health Officials: More Die From Newly ...,Center,USA,1
2,2,2,https://www.theverge.com/2020/1/23/21078457/co...,The Verge,2020-01-23,['Nicole Wetsman'],Everything you need to know about the coronavirus,https://cdn.vox-cdn.com/thumbor/a9_Oz7cvSBKyal...,Public health experts around the globe are scr...,Left-center,USA,1
3,3,3,https://www.worldhealth.net/news/novel-coronav...,WorldHealth.Net,2020-01-24,[],Novel Coronavirus Cases Confirmed To Be Spreading,https://www.worldhealth.net/media/original_ima...,The first two coronavirus cases in Europe have...,,USA,0
4,4,4,https://www.theverge.com/2020/1/24/21080845/co...,The Verge,2020-01-24,"['Nicole Wetsman', 'Zoe Schiffer', 'Jay Peters...",Coronavirus disrupts the world: updates on the...,https://cdn.vox-cdn.com/thumbor/t2gt1SmEni4Mcr...,"A new coronavirus appeared in Wuhan, China, at...",Left-center,USA,1


#### Data processing

In [15]:
# One-hot encode the categorical columns of `df` one feature at a time.
# Missing values are mapped to the sentinel category "0" before encoding.
# Each feature is encoded with its own get_dummies call so the resulting
# column order stays: url, publisher, publish_date, authors, bias, country.


def _url_to_domain(url):
    """Reduce a URL string to its host, e.g. 'https://www.a.com/x' -> 'a.com'."""
    # Strip scheme and "www." in this order; each step keeps the text that
    # follows the first occurrence of the prefix.
    for prefix in ('https://', 'http://', 'www.'):
        if prefix in url:
            url = url.split(prefix)[1]
    # Everything after the first "/" is the path; keep only the host.
    return url.split('/')[0]


def _parse_authors(raw):
    """Turn a stringified author list ("['A', 'B']") into a list of names."""
    for token in ('[', ']', "'", ' etc.', "‘"):
        raw = raw.replace(token, "")
    # str.split returns a one-element list when the separator is absent,
    # so an empty author string yields [''] (its own "no author" category).
    return raw.split(", ")


#FEATURE: URL (reduced to the publishing domain)
df['url'] = df['url'].fillna("0").map(_url_to_domain)
df = pd.get_dummies(df, columns=['url'])

#FEATURE: PUBLISHER
df['publisher'] = df['publisher'].fillna("0")
df = pd.get_dummies(df, columns=['publisher'])

#FEATURE: PUBLISH_DATE (reduced to the month name; missing dates -> "0")
# Parse the calendar date directly instead of round-tripping through epoch
# seconds and the machine's local timezone: that round trip shifted dates on
# the 1st of a month into the previous month on UTC-negative machines and
# only detected missing dates there.
publish_ts = pd.to_datetime(df['publish_date'], format='%Y-%m-%d')
df['publish_date'] = publish_ts.dt.month_name().fillna("0")
df = pd.get_dummies(df, columns=['publish_date'])

#FEATURE: AUTHOR (multi-label: one indicator column per author name)
array_col = df['author'].fillna("").map(_parse_authors).tolist()
mlb = MultiLabelBinarizer(sparse_output=True)
df = df.join(
            pd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform(array_col),
                index=df.index,
                columns=mlb.classes_))
df = df.drop('author', axis=1)
# NOTE(review): this export runs before political_bias and country are
# encoded, so the CSV still holds those two columns in raw form. Kept here to
# leave the file's contents unchanged; move to the end of the cell if the
# fully encoded table is what should be saved.
df.to_csv('onehotencoding.csv', index=False)

#FEATURE: POLITICAL_BIAS
df['political_bias'] = df['political_bias'].fillna("0")
df = pd.get_dummies(df, columns=['political_bias'])

#FEATURE: COUNTRY
df['country'] = df['country'].fillna("0")
df = pd.get_dummies(df, columns=['country'])

# Move the label to the last column.
df['reliability'] = df.pop('reliability')
df.head()

Unnamed: 0,index,news_ID,title,image,body_text,url_abcnews.go.com,url_activistpost.com,url_americanthinker.com,url_amgreatness.com,url_armytimes.com,...,political_bias_Right,political_bias_Right-center,country_0,country_Canada,country_Cyprus,country_Iran,country_Russia,country_UK,country_USA,reliability
0,0,0,The Coronavirus: What Scientists Have Learned ...,https://static01.nyt.com/images/2020/03/12/sci...,\nA novel respiratory virus that originated in...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
1,1,1,Chinese Health Officials: More Die From Newly ...,https://media.npr.org/include/images/facebook-...,Chinese Health Officials: More Die From Newly ...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
2,2,2,Everything you need to know about the coronavirus,https://cdn.vox-cdn.com/thumbor/a9_Oz7cvSBKyal...,Public health experts around the globe are scr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
3,3,3,Novel Coronavirus Cases Confirmed To Be Spreading,https://www.worldhealth.net/media/original_ima...,The first two coronavirus cases in Europe have...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,4,4,Coronavirus disrupts the world: updates on the...,https://cdn.vox-cdn.com/thumbor/t2gt1SmEni4Mcr...,"A new coronavirus appeared in Wuhan, China, at...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


### Train Model | Compute Accuracy

In [16]:
#SPLIT DATA
col = 'reliability'
# Columns excluded from the feature matrix:
#  - body_text / title / image: raw text and image URLs, not numeric features.
#  - index / news_ID: appear to be row identifiers (they mirror the row number
#    in df.head()); they carry no content signal, can leak row ordering, and
#    their unscaled magnitude slows the solver.
non_feature_cols = ['body_text', 'title', 'image', 'index', 'news_ID']
_df = df.drop(columns=non_feature_cols)
X = _df.loc[:, _df.columns != col]
Y = _df[col]
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=25)

#TRAIN MODEL
# The default max_iter (100) stopped at the iteration limit in the recorded
# run, so give the solver more room to converge.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, Y_train.values.ravel())
Y_pred = model.predict(X_test)

#PRINT ACCURACY
# NOTE(review): a perfect score on held-out data usually signals leakage.
# Presumably the label is determined per publisher/domain, which the url_* and
# publisher_* dummies encode directly - verify before reporting this number.
print("Accuracy: %.2f%%" % (metrics.accuracy_score(Y_test, Y_pred)*100))

Accuracy: 100.00%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
