<a href="https://colab.research.google.com/github/alfrizzle/NLP-Projects/blob/master/KoBERT_nsmc_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup Environment

In [1]:
# Mount Google Drive at /content/drive (requires the Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Show the current working directory before changing into the project folder.
!pwd

/content


In [3]:
import os

# Work from the project folder on Drive; the relative './data/...' paths
# used by later cells are resolved against this directory.
os.chdir('/content/drive/MyDrive/KoBERT-nsmc')

In [4]:
# !git clone https://github.com/monologg/KoBERT-nsmc.git

In [5]:
# List the project folder to confirm the expected scripts and data directory are present.
!ls

data		   main.py	requirements.txt	utils.py
data_loader.py	   predict.py	sample_pred_in.txt
data_loader_v2.py  __pycache__	tokenization_kobert.py
LICENSE		   README.md	trainer.py


In [6]:
!pip install torch==1.4.0
!pip install transformers==2.10.0



In [7]:
import pandas as pd
import numpy as np

# Data Prep

In [None]:
# os.chdir('/content/drive/MyDrive/KoBERT-nsmc/data/')

In [11]:
# Load the two raw source files: one with fake articles, one with true articles.
fake_df = pd.read_csv('./data/misc/Fake.csv')
true_df = pd.read_csv('./data/misc/True.csv')

# Preview the first rows of the fake-news frame.
fake_df.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [12]:
# Keep only the article body from each source frame.
fake_list = fake_df['text']
true_list = true_df['text']

# Attach the class id to every article. Note the encoding of the `fake`
# column: 0 == fake news, 1 == true news.
fake_ds = pd.DataFrame({'text': fake_list, 'fake': np.zeros(len(fake_list), dtype='int64')})  # 0 == fake news
true_ds = pd.DataFrame({'text': true_list, 'fake': np.ones(len(true_list), dtype='int64')})  # 1 == true news

# Stack fake rows first, then true rows, with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which is deprecated and no longer
# exists in pandas 2.x.
raw_ds = pd.concat([fake_ds, true_ds], ignore_index=True)
# raw_ds = raw_ds.sample(frac=1).reset_index(drop=True)
raw_ds

Unnamed: 0,text,fake
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0
...,...,...
44893,BRUSSELS (Reuters) - NATO allies on Tuesday we...,1
44894,"LONDON (Reuters) - LexisNexis, a provider of l...",1
44895,MINSK (Reuters) - In the shadow of disused Sov...,1
44896,MOSCOW (Reuters) - Vatican Secretary of State ...,1


In [13]:
# Rename the class column from `fake` to `label`; `text` keeps its name.
df = raw_ds.rename(columns={'fake': 'label'})
df.head()

Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0


In [14]:
# Preview the last rows of the combined frame.
df.tail()

Unnamed: 0,text,label
44893,BRUSSELS (Reuters) - NATO allies on Tuesday we...,1
44894,"LONDON (Reuters) - LexisNexis, a provider of l...",1
44895,MINSK (Reuters) - In the shadow of disused Sov...,1
44896,MOSCOW (Reuters) - Vatican Secretary of State ...,1
44897,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,1


In [22]:
# Give every row a sequential document id starting at 1.
df['docid'] = range(1, len(df) + 1)
df.head()

Unnamed: 0,docid,text,label
0,1,Donald Trump just couldn t wish all Americans ...,0
1,2,House Intelligence Committee Chairman Devin Nu...,0
2,3,"On Friday, it was revealed that former Milwauk...",0
3,4,"On Christmas day, Donald Trump announced that ...",0
4,5,Pope Francis used his annual Christmas Day mes...,0


In [23]:
# Column order for the exported dataset: id, text, label.
cols = ['docid', 'text', 'label']

In [24]:
# Select the columns in the order given by `cols`.
df = df[cols]
df.head()

Unnamed: 0,docid,text,label
0,1,Donald Trump just couldn t wish all Americans ...,0
1,2,House Intelligence Committee Chairman Devin Nu...,0
2,3,"On Friday, it was revealed that former Milwauk...",0
3,4,"On Christmas day, Donald Trump announced that ...",0
4,5,Pope Francis used his annual Christmas Day mes...,0


In [55]:
# Rebind `df` to the cleaned dataset stored on Drive; the frame built in the
# cells above is no longer referenced by `df` after this line.
# NOTE(review): no visible cell writes dataset_clean.csv — presumably it was
# produced outside this notebook; confirm its provenance.
df = pd.read_csv('/content/drive/MyDrive/KoBERT-nsmc/data/fake_news/dataset_clean.csv')

In [56]:
# Inspect dtypes and null counts. In the recorded output all three columns are
# `object` and `label` has 35415 non-null values out of 35430 rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35430 entries, 0 to 35429
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   docid   35430 non-null  object
 1   text    35430 non-null  object
 2   label   35415 non-null  object
dtypes: object(3)
memory usage: 830.5+ KB


In [57]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# shuffle deterministic, so re-running the notebook yields the same split.
# NOTE(review): the recorded df.info() output shows 15 rows with a null
# `label`; they are still included in this split — confirm whether they
# should be dropped beforehand.
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [58]:
# Report (rows, columns) of the train and test splits, one per line.
print(train.shape, test.shape, sep='\n')

(28344, 3)
(7086, 3)


In [59]:
# Persist both splits as CSV files without the pandas index column.
for split_df, out_path in (
    (train, './data/fake_news/fn_train.csv'),
    (test, './data/fake_news/fn_test.csv'),
):
    split_df.to_csv(out_path, index=False)

In [29]:
# train = pd.read_csv('./data/fake_news/train.csv')
# test = pd.read_csv('./data/fake_news/test.csv')

In [37]:
import csv

full_file = '/content/drive/MyDrive/KoBERT-nsmc/data/misc/fake_news.csv'
train_file = './data/fake_news/fn_train.csv'
test_file = './data/fake_news/fn_test.csv'

# Convert the comma-separated training split into a tab-separated .txt file.
# The rows are parsed with csv.reader rather than replacing ',' with '\t'
# line by line, because a plain text replace would also rewrite the commas
# that occur inside quoted article text.
# Both handles are opened in a `with` block so they are flushed and closed
# when the conversion finishes. newline='' is what the csv module expects
# for file objects, and the encoding is stated explicitly instead of relying
# on the platform default.
with open(train_file, newline='', encoding='utf-8') as src, \
        open('./data/fake_news/fn_train.txt', 'w', newline='', encoding='utf-8') as dst:
    csv.writer(dst, delimiter='\t').writerows(csv.reader(src))

# Test the dataset-reading function from data_loader.py

In [38]:
def read_file(input_file, quotechar=None):
    """Return the lines of a UTF-8 text file with surrounding whitespace removed.

    Args:
        input_file: Path of the file to read.
        quotechar: Accepted but not used by this function.

    Returns:
        A list with one string per line, in file order. Each line is passed
        through ``str.strip()``, so leading/trailing whitespace (including
        tabs and the line terminator) is removed. Lines are not split into
        fields.
    """
    with open(input_file, "r", encoding="utf-8") as handle:
        return [raw.strip() for raw in handle]

In [None]:
# Run read_file on a sample file and display the returned list of lines.
# NOTE(review): fn_train_cut.csv is not written by any visible cell — confirm
# the file exists on Drive before running.
read_file('/content/drive/MyDrive/KoBERT-nsmc/data/fake_news/fn_train_cut.csv')

In [72]:
# Save the first 30000 rows of `df_train` as a smaller dataset file.
# NOTE(review): `df_train` is not defined in any visible cell, so this raises
# NameError on a fresh kernel. The `train` split above has only 28344 rows,
# so `df_train` presumably referred to a different, larger frame — confirm
# which one and define it explicitly before this cell.
df_small = df_train[:30000]
df_small.to_csv('./data/fake_news/fn_small.csv', index=False)

# Model Training

In [51]:
# Confirm the working directory; the training commands below use relative paths.
!pwd

/content/drive/MyDrive/KoBERT-nsmc


In [79]:
# Baseline run: train and evaluate on the ratings_train.txt / ratings_test.txt
# files under ./data/ (model_type bert, max_seq_len 128, 2 epochs).
!python main.py --data_dir ./data/ --train_file ratings_train.txt --test_file ratings_test.txt --model_type bert --max_seq_len 128 --do_train --do_eval --num_train_epochs 2

12/17/2021 07:22:52 - INFO - transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt from cache at /root/.cache/torch/transformers/96435fa287fbf7e469185f1062386e05a075cadbf6838b74da22bf64b080bc32.99bcd55fc66f4f3360bc49ba472b940b8dcf223ea6a345deb969d607ca900729
12/17/2021 07:22:53 - INFO - data_loader -   Creating features from dataset file at ./data/
12/17/2021 07:22:53 - INFO - data_loader -   LOOKING AT ./data/ratings_train.txt
12/17/2021 07:22:53 - INFO - data_loader -   ['9976970', '아 더빙.. 진짜 짜증나네요 목소리', '0']
12/17/2021 07:22:53 - INFO - data_loader -   ['9856453', '정말 최고의 명작 성인이 되고 본 이집트의 왕자는 또 다른 감동 그자체네요', '1']
12/17/2021 07:22:53 - INFO - data_loader -   ['10252730', '최최최최악악악악 !!!!', '0']
12/17/2021 07:22:53 - INFO - data_loader -   ['8291487', '완전 재미있음 추천 ㅠㅠ♥ 생텀 화이팅 !!!♥', '1']
12/17/2021 07:22:53 - INFO - data_loader -   ['2303393', '정말 재밌는 영화!! ost 또한 좋다~!!', '1']
12/17/2021 07:22:53 - INFO -

In [None]:
!python main.py --data_dir ./data/fake_news --train_file dataset_clean.csv --test_file fn_test.csv --model_type bert --max_seq_len 128 --do_train --do_eval --num_train_epochs 2