# Data Collation
Taking our raw dataset(s) we want to use, and performing any preprocessing / feature engineering as well as train/test, X/y splits for ease of use in classifier training/evaluation.

## Imports

In [36]:
# !pip install google_trans_new
!pip install googletrans==4.0.0rc1
!pip install -U deep-translator

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting googletrans==4.0.0rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==4.0.0rc1)
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0rc1)
  Downloading hstspreload-2023.1.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting idna==2.* (from httpx==0.13.

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [37]:
from deep_translator import GoogleTranslator
#from google_trans_new import google_translator
from googletrans import Translator
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.utils import shuffle

## Constants

In [58]:
# Dataset filenames
# constraint dataset (https://github.com/diptamath/covid_fake_news)
CONSTRAINT_TRAIN_FILE = "Constraint_Train.csv"
CONSTRAINT_VAL_FILE = "Constraint_Val.csv"
CONSTRAINT_TEST_FILE = "Constraint_Test.csv"

# CHECKED dataset (https://github.com/cyang03/CHECKED/tree/ff3055c4a3c1ebeac80a1e94050490048dfe583f/dataset)
CHECKED_TRUE_FILE = "real_news.csv"
CHECKED_FALSE_FILE = "fake_news.csv"

TEXT_FEATURE_NAME = "text"
DATASET_COLUMN_NAME = "dataset"

CONSTRAINT_DATASET_LABEL = "constraint"
CHECKED_DATASET_LABEL = "checked"

SPLIT_TRAIN_FRAC = 0.8

## Mounting Google Drive

In [6]:
GOOGLE_DRIVE_MOUNT_PATH_PREFIX = '/content/drive'
MY_CS152_DATA_FILE_PATH = "drive/MyDrive/Senior/SenSpr/CS152/CS152 Group Project/Milestone 3/Code/Data/" # NOTE: you have to modify this to fit wherever the CS152 Group Project/Milestone 3/Code/Data is in your Google Drive

In [7]:
from google.colab import drive
drive.mount(GOOGLE_DRIVE_MOUNT_PATH_PREFIX)

Mounted at /content/drive


In [8]:
cd $MY_CS152_DATA_FILE_PATH

/content/drive/MyDrive/Senior/SenSpr/CS152/CS152 Group Project/Milestone 3/Code/Data


# Loading in and preprocesing the datasets


### Helper Functions

In [9]:
def merge_dfs(df_list):

  # merging and randomly shuffle the dataset
  df_full = pd.concat(df_list, ignore_index = True)

  df_full = shuffle(df_full)

  # reset the index
  df_full = df_full.reset_index(drop=True)

  return df_full

### Constraint Dataset

In [10]:
constraint_train = pd.read_csv(CONSTRAINT_TRAIN_FILE)
constraint_val = pd.read_csv(CONSTRAINT_VAL_FILE)

# we won't use this since it lacks labels
# constraint_test = pd.read_csv(CONSTRAINT_TEST_FILE)

In [11]:
print("Constraint Train")
constraint_train.head()

Constraint Train


Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real


In [12]:
print("Constraint Val")
constraint_val.head()

Constraint Val


Unnamed: 0,id,tweet,label
0,1,Chinese converting to Islam after realising th...,fake
1,2,11 out of 13 people (from the Diamond Princess...,fake
2,3,"COVID-19 Is Caused By A Bacterium, Not Virus A...",fake
3,4,Mike Pence in RNC speech praises Donald Trump’...,fake
4,5,6/10 Sky's @EdConwaySky explains the latest #C...,real


In [13]:
# filtering out the columns we do not need to keep
CONSTRAINT_KEEP_COLUMNS = ["tweet", "label"]

constraint_train = constraint_train[CONSTRAINT_KEEP_COLUMNS]
constraint_val = constraint_val[CONSTRAINT_KEEP_COLUMNS]

In [65]:
# rename the text column accordingly
rename_dict = {"tweet": TEXT_FEATURE_NAME}
constraint_train.rename(columns = rename_dict, inplace = True)
constraint_val.rename(columns = rename_dict, inplace = True)

In [15]:
# add a column to indicate the dataset of origin (before we merge)
constraint_train[DATASET_COLUMN_NAME] = CONSTRAINT_DATASET_LABEL
constraint_val[DATASET_COLUMN_NAME] = CONSTRAINT_DATASET_LABEL

In [70]:
constraint_train.head()

Unnamed: 0,text,label,dataset
0,The CDC currently reports 99031 deaths. In gen...,real,constraint
1,States reported 1121 deaths a small rise from ...,real,constraint
2,Politically Correct Woman (Almost) Uses Pandem...,fake,constraint
3,#IndiaFightsCorona: We have 1524 #COVID testin...,real,constraint
4,Populous states can generate large case counts...,real,constraint


In [71]:
constraint_full = merge_dfs([constraint_train, constraint_val])

### CHECKED (Chinese) Dataset

In [18]:
checked_true = pd.read_csv(CHECKED_TRUE_FILE)
checked_false = pd.read_csv(CHECKED_FALSE_FILE)

In [19]:
checked_true.head()

Unnamed: 0,label,id,date,user_id,text,pic_url,video_url,comment_num,repost_num,like_num
0,real,be3d74871a5df3a0da2a98dd32e8469f,2020-08-23 18:57,43a971db6364c4ed7cd101feca243600,【转存！#施一公说诚实做学问是绝不容突破的底线#】今天，在西湖大学博士研究生开学典礼上，校长...,[],http://f.video.weibocdn.com/DJBbVGUwlx07FRoDja...,396,601,3955
1,real,1a213783461da197779f9f6a4f118e13,2020-02-18 17:12,43a971db6364c4ed7cd101feca243600,【#区长回应网传北京西城区政府停摆#：目前运转一切正常】2月18日，在北京市新型冠状病毒肺炎...,['https://wx1.sinaimg.cn/orj360/a716fd45ly1gc0...,,765,345,4949
2,real,89c3927b85cc27fea583677d00ee6454,2020-04-21 18:38,43a971db6364c4ed7cd101feca243600,【最新：#全球新冠肺炎感染人数超244万例#】截至北京时间4月21日16时，211个国家和地...,['https://wx3.sinaimg.cn/orj360/a716fd45gy1ge1...,,670,606,3510
3,real,799a54dfe5472de93613dab44f22014e,2020-05-01 08:21,43a971db6364c4ed7cd101feca243600,【今天，发条微博，迎接#五月#】5月，向每一位奋斗中的劳动者致敬；5月，为奋斗中的青年点赞；...,['https://wx4.sinaimg.cn/orj360/a716fd45ly1gec...,,2471,10110,23223
4,real,477c1e86e80e7b29fbf719d188d3bcca,2020-08-02 10:21,43a971db6364c4ed7cd101feca243600,【#大连关停凯洋海鲜公司及周边企业#】今天上午，大连举行新冠肺炎疫情防控工作发布会。会上通报...,[],http://f.video.weibocdn.com/9iWxb7txlx07Fjzvqh...,354,255,1980


In [20]:
# filtering out the columns we do not need to keep
CHECKED_KEEP_COLUMNS = ["text", "label"]

checked_true = checked_true[CHECKED_KEEP_COLUMNS]
checked_false = checked_false[CHECKED_KEEP_COLUMNS]

In [21]:
# display edited version
checked_true.head()

Unnamed: 0,text,label
0,【转存！#施一公说诚实做学问是绝不容突破的底线#】今天，在西湖大学博士研究生开学典礼上，校长...,real
1,【#区长回应网传北京西城区政府停摆#：目前运转一切正常】2月18日，在北京市新型冠状病毒肺炎...,real
2,【最新：#全球新冠肺炎感染人数超244万例#】截至北京时间4月21日16时，211个国家和地...,real
3,【今天，发条微博，迎接#五月#】5月，向每一位奋斗中的劳动者致敬；5月，为奋斗中的青年点赞；...,real
4,【#大连关停凯洋海鲜公司及周边企业#】今天上午，大连举行新冠肺炎疫情防控工作发布会。会上通报...,real


In [22]:
# checking we need to normalize the class label
print(checked_true["label"].unique())
print(checked_false["label"].unique())

['real']
['fake']


In [23]:
# merging and randomly shuffle the dataset
checked_full = merge_dfs([checked_true, checked_false])

In [24]:
checked_full.head()

Unnamed: 0,text,label
0,【“渝”战愈勇！#1636位重庆援鄂医务人员全名单#】十八批救援队，1636个名字，#每个名...,real
1,【#世卫组织称北京所现病毒与欧洲毒株密切相关#】当地时间19日，世卫组织卫生紧急项目技术主管...,real
2,【#4分半回顾世界战疫时间线#】2019年底至今，一场来源不明的新冠疫情侵袭人类。全球战疫仍...,real
3,【#你好，明天#】钟南山、李兰娟、陈薇……奋战在疫情防控一线的院士“天团”，一次次刷屏。他们...,real
4,【江苏宿迁#46岁辅警抗疫战中牺牲#】2月27日晚，江苏宿迁市公安局交警支队三大队一中队辅警...,real


In [25]:
# adding a column to indicate dataset of origin
checked_full[DATASET_COLUMN_NAME] = CHECKED_DATASET_LABEL

In [26]:
# remove any rows with empty text fields
checked_full = checked_full.dropna(subset=['text'])

#### Translating all text inputs to English

In [45]:
def translate(text):
  translated_text = GoogleTranslator(source='auto', target='en').translate(text)
  return translated_text if translated_text else ""

In [40]:
translator =  Translator()

def translate(text):
  return translator.translate(text,dest='en').text

In [28]:
PLACEHOLDER_KEYWORD = "PLACEHOLDER"

checked_full["translated"] = PLACEHOLDER_KEYWORD

In [29]:
checked_full.head()

Unnamed: 0,text,label,dataset,translated
0,【“渝”战愈勇！#1636位重庆援鄂医务人员全名单#】十八批救援队，1636个名字，#每个名...,real,checked,PLACEHOLDER
1,【#世卫组织称北京所现病毒与欧洲毒株密切相关#】当地时间19日，世卫组织卫生紧急项目技术主管...,real,checked,PLACEHOLDER
2,【#4分半回顾世界战疫时间线#】2019年底至今，一场来源不明的新冠疫情侵袭人类。全球战疫仍...,real,checked,PLACEHOLDER
3,【#你好，明天#】钟南山、李兰娟、陈薇……奋战在疫情防控一线的院士“天团”，一次次刷屏。他们...,real,checked,PLACEHOLDER
4,【江苏宿迁#46岁辅警抗疫战中牺牲#】2月27日晚，江苏宿迁市公安局交警支队三大队一中队辅警...,real,checked,PLACEHOLDER


In [46]:
for index, row in checked_full.iterrows():
    if row["translated"] != PLACEHOLDER_KEYWORD:
      continue

    original_value = row['text']

    try:
      translated_value = translate(original_value)

      checked_full.at[index, 'translated'] = translated_value
    except:
      continue

In [47]:
# check if any more rows still need to be transated
is_placeholder = checked_full["translated"].str.contains(PLACEHOLDER_KEYWORD)

# Count the occurrences by summing the True values
occurrences = is_placeholder.sum()
print(len(checked_full))
print(occurrences)

2104
0


In [48]:
# if needed, drop rows that haven't been able to be translated
checked_full_translated = checked_full[checked_full['translated'] != PLACEHOLDER_KEYWORD]

In [49]:
# replace "text" row with newly "translated" rows
checked_full_translated = checked_full_translated.drop('text', axis=1)

checked_full_translated = checked_full_translated.rename(columns={'translated': 'text'})

#checked_full_translated = checked_full_translated.drop('translated', axis=1)

KeyError: ignored

In [50]:
checked_full_translated.head()

Unnamed: 0,label,dataset,text
0,real,checked,"[""Yu"" fights bravely! #1636内重庆助湖医护人员全目列#】Eight..."
1,real,checked,[# WHO says the virus found in Beijing is clos...
2,real,checked,"【#4分半反句世界战疫病时线#】From the end of 2019 to now, a..."
3,real,checked,"[#你好，明天#] Zhong Nanshan, Li Lanjuan, Chen Wei...."
4,real,checked,[江苏Suqian#46岁内务防助助助力中奇起#] On the evening of Fe...


# Merging the datasets and perform new train/test split

In [72]:
full_df = merge_dfs([constraint_full, checked_full_translated])

In [73]:
train_df, test_df = train_test_split(full_df, train_size=SPLIT_TRAIN_FRAC)

In [74]:
print(len(train_df))
print(len(test_df))

8531
2133


In [75]:
train_df.isnull().sum()

text       0
label      0
dataset    0
dtype: int64

In [77]:
train_df.head()

Unnamed: 0,text,label,dataset
3137,#IndiaFightsCorona: The cumulative testing as ...,real,constraint
6111,##[Sorrow!#38 -year -old Hangzhou Police sacri...,real,checked
9433,"[#【【【【【【【【【#] June 18th to 24:00,#【【【【【【【【#【【【...",real,checked
2095,A female doctor in Uttar Pradesh died after be...,fake,constraint
7086,You can read more details on nationwide trends...,real,constraint


In [76]:
# save to file
# train_df.to_csv('full_train.csv', index=False)
# test_df.to_csv('full_test.csv', index=False)