In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
import scipy.sparse as sp
# Load the raw rumor dataset.
raw_rumors = pd.read_csv('../../data/00-raw-data/rumor.csv')
# The dataset mixes several languages; keep only the rows written in English.
english_only = raw_rumors[raw_rumors["language"] == "english"]
# Columns removed before modelling:
#   - "crawled" / "published": the data was crawled during one fixed period, so
#     these timestamps could skew a model trained on them.
#   - "main_img_url" / "spam_score": not useful as features.
# NOTE(review): the earlier comment also said ord_in_thread (reported as all 0)
# is dropped, but the code never dropped it, so it is still kept here.
unused_columns = ["crawled", "published", "main_img_url", "spam_score"]
# Tag every remaining row with the class label.
ru = english_only.drop(columns=unused_columns).assign(label="rumor")



In [2]:
# Count the missing values in each column.
ru.isnull().sum()

uuid                     0
ord_in_thread            0
author                2209
title                  680
text                    46
language                 0
site_url                 0
country                176
domain_rank           4175
thread_title            12
replies_count            0
participants_count       0
likes                    0
comments                 0
shares                   0
type                     0
label                    0
dtype: int64

In [3]:
# Encode author as 0/1: 0 means anonymous (missing values are treated as
# anonymous too), 1 means a writer name is recorded.
ru = ru.assign(
    author=ru["author"].fillna("Anonymous").ne("Anonymous").astype("int64")
)
# Missing unstructured text (title, text) cannot be imputed, and there is no
# sensible fill value for country, so rows missing any of them are removed.
ru = ru.dropna(subset=["title", "text", "country"])
# Fill missing domain_rank with the median, because the value range is huge.
# fillna returns a new Series, so the result must be assigned back; the
# previous code discarded it and left the NaNs in place.
ru = ru.assign(domain_rank=ru["domain_rank"].fillna(ru["domain_rank"].median()))
# Remove rows typed "satire", "junksci" or "state" so the remaining rows are
# rumors. The explicit copy gives later cells an independent frame to modify.
ru = ru[~ru["type"].isin(["satire", "junksci", "state"])].copy()

In [4]:
# Replace every character that is not an ASCII letter with a space, so the
# titles contain only letters and whitespace.
non_letter = re.compile('[^a-zA-Z]')
ru["title"] = ru["title"].map(lambda title: non_letter.sub(' ', title))

In [5]:
# Build a bag-of-words model and fit it to the cleaned titles.
vectorizer = CountVectorizer()
# Store the counts as int32 to reduce the size of the data.
matrix = vectorizer.fit_transform(ru["title"]).astype("int32")
# Expand the sparse counts into a dense frame: one column per vocabulary
# word, indexed by uuid so it can be merged back onto the metadata.
counts = pd.DataFrame(
    matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=ru["uuid"],
)
# A vocabulary word spelled "uuid" would be ambiguous with the merge key;
# this rename is a no-op when no such word column exists.
counts = counts.rename(columns={"uuid": "uuid_word"})
# Merge the word counts onto the metadata. Vocabulary words that match an
# existing column name (e.g. "title", "text", "label") get a "_word" suffix;
# with the default suffixes the metadata column would be renamed to
# "<name>_x" instead, silently changing the schema.
ru = pd.merge(ru, counts, on="uuid", suffixes=("", "_word"))
# uuid was only needed as the merge key.
ru = ru.drop(columns="uuid")

In [6]:
# Write the labelled, vectorised dataset to disk without the row index.
output_path = "../../data/01-modified-data/labeled_rumor_python.csv"
ru.to_csv(output_path, index=False)