In [14]:
from PIL import Image
import numpy as np
import os
import glob
import numpy as np
import pandas as pd

import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

import openpyxl
from tqdm import tqdm


In [24]:
# 각 섹션별 이미지가 담겨 있는 폴더의 경로들을 가져옴

section_folders = glob.glob('/home2/jh981017/myubai/NaverNews/*')
section_folders

['/home2/jh981017/myubai/NaverNews/economy',
 '/home2/jh981017/myubai/NaverNews/life',
 '/home2/jh981017/myubai/NaverNews/politics',
 '/home2/jh981017/myubai/NaverNews/science',
 '/home2/jh981017/myubai/NaverNews/society',
 '/home2/jh981017/myubai/NaverNews/world']

In [25]:
root = 'NaverNews'
sections = os.listdir(root)
sections

['economy', 'life', 'politics', 'science', 'society', 'world']

In [26]:
text_paths = []
for folder, section in zip(section_folders, sections):
  text_path = folder + '/' + section + 'text1' + '.xlsx'
  text_paths.append(text_path)

text_paths

['/home2/jh981017/myubai/NaverNews/economy/economytext1.xlsx',
 '/home2/jh981017/myubai/NaverNews/life/lifetext1.xlsx',
 '/home2/jh981017/myubai/NaverNews/politics/politicstext1.xlsx',
 '/home2/jh981017/myubai/NaverNews/science/sciencetext1.xlsx',
 '/home2/jh981017/myubai/NaverNews/society/societytext1.xlsx',
 '/home2/jh981017/myubai/NaverNews/world/worldtext1.xlsx']

In [27]:
# 각 섹션별 사용할 데이터의 인덱스가 담겨 있는 딕셔너리

idx_dictionary = {}

for section, text_path in zip(sections, text_paths):
  text = pd.read_excel(text_path)
  idx_section = list(text['idx'])

  idx_dictionary[section] = idx_section

In [28]:
len(idx_dictionary['economy'])

1888

In [29]:
np.random.seed(602)

cv_idx_dictionary = {}

for section in sections:
  cv_idx_section = list(np.random.choice(idx_dictionary[section], size = 1200, replace = False))
  cv_idx_section.sort()

  cv_idx_dictionary[section] = cv_idx_section

In [30]:
len(cv_idx_dictionary['economy'])

1200

In [31]:
new_idx_dictionary = {}

for section in sections:
  new_idx_section = [i for i in idx_dictionary[section] if i not in cv_idx_dictionary[section]]

  new_idx_dictionary[section] = new_idx_section

In [32]:
len(new_idx_dictionary['economy'])

688

In [33]:
len(idx_dictionary['economy'])

1888

In [None]:
# 라벨에 해당하는 y값 매칭하기 (0 ~ 5)

# label_to_y = {section : idx for idx, section in enumerate(sections)}
# label_to_y

In [34]:
label_to_y = {
    'politics': 0,
    'society': 1,
    'science': 2,
    'life': 3,
    'world': 4,
    'economy': 5
}
label_to_y

{'politics': 0,
 'society': 1,
 'science': 2,
 'life': 3,
 'world': 4,
 'economy': 5}

In [35]:
# 분석에 사용할 모든 데이터들의 경로를 불러온다.

cv_data = []
new_data = []

for section_folder in section_folders:

  # 각 섹션 이름 가져와서 인덱스랑 합하기
  section = os.path.basename(section_folder)

  cv_indicies = cv_idx_dictionary[section]
  new_indicies = new_idx_dictionary[section]

  y = label_to_y[section]


  for cv_idx in cv_indicies:
    imgname = section + str(cv_idx) + '.jpg'
    imgpath = os.path.join(section_folder, imgname)

    data = []
    data.append(cv_idx)
    data.append(imgpath)
    data.append(y)

    cv_data.append(data)


  for new_idx in new_idx_dictionary[section]:
    imgname = section + str(new_idx) + '.jpg'
    imgpath = os.path.join(section_folder, imgname)

    data = []
    data.append(cv_idx)
    data.append(imgpath)
    data.append(y)

    new_data.append(data)

In [36]:
cv_data[0:10]

[[3, '/home2/jh981017/myubai/NaverNews/economy/economy3.jpg', 5],
 [6, '/home2/jh981017/myubai/NaverNews/economy/economy6.jpg', 5],
 [9, '/home2/jh981017/myubai/NaverNews/economy/economy9.jpg', 5],
 [13, '/home2/jh981017/myubai/NaverNews/economy/economy13.jpg', 5],
 [16, '/home2/jh981017/myubai/NaverNews/economy/economy16.jpg', 5],
 [17, '/home2/jh981017/myubai/NaverNews/economy/economy17.jpg', 5],
 [18, '/home2/jh981017/myubai/NaverNews/economy/economy18.jpg', 5],
 [22, '/home2/jh981017/myubai/NaverNews/economy/economy22.jpg', 5],
 [23, '/home2/jh981017/myubai/NaverNews/economy/economy23.jpg', 5],
 [25, '/home2/jh981017/myubai/NaverNews/economy/economy25.jpg', 5]]

In [37]:
len(cv_data)

7200

In [38]:
len(new_data)

5720

In [39]:
# 이미지 데이터셋 만들기

class RoBaMFImageDataset(Dataset):
  def __init__(self, dataset, img_idx = 1, label_idx = 2, transform = None):
    super(RoBaMFImageDataset, self).__init__()

    self.imgpaths = [i[img_idx] for i in dataset]
    self.y = [np.int32(i[label_idx]) for i in dataset]
    self.transform = transform


  def __len__(self):

    return len(self.imgpaths)


  def __getitem__(self, idx):
    imgpath = self.imgpaths[idx]

    img = Image.open(imgpath).convert('RGB')
    img = self.transform(img)

    target = self.y[idx]

    return img, target

In [21]:
# 사용할 모형에 맞게 이미지를 transform & normalize

train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    #transforms.Resize(256),
    #transforms.CenterCrop((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    #transforms.Resize(256),
    #transforms.CenterCrop((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [32]:
class ImageModel(nn.Module):
  def __init__(self, mobilenetv2):
    super(ImageModel, self).__init__()

    self.mobilenetv2 = mobilenetv2

    self.fc = nn.Sequential(
              nn.Linear(1000, 1024),
              nn.ReLU(),
              nn.Linear(1024, 1024),
              nn.ReLU(),
              nn.Linear(1024, 6),
              nn.Softmax(1)
              )


  def forward(self, img):
    x = self.mobilenetv2(img)
    x = self.fc(x)

    return x

In [23]:
# 모형의 가중치 업데이트를 위한 모듈

def model_train(model, data_loader, loss_fn, optimizer, device):

  model.train()
  size = len(data_loader.dataset)

  progress_bar = tqdm(data_loader)

  corr = 0
  running_loss = 0

  for X, y in progress_bar:
    X, y = X.to(device), y.long().to(device)

    # 예측하고 크로스엔트로피 계산
    pred = model(X)
    loss = loss_fn(pred, y)

    # 그래티언트 초기화
    optimizer.zero_grad()

    # 역전파 알고리즘에 의한 그래디언트 계산
    loss.backward()

    # 그래디언트를 이용한 업데이트
    optimizer.step()

    # accuracy 계산을 위한 정답 개수 계산
    corr += (pred.argmax(1) == y).type(torch.float).sum().item()

    # 평균 크로스엔트로피 계산을 위한 합
    running_loss += loss.item() * X.size(0)


  # accuracy
  accuracy = corr / size
  running_loss = running_loss / size

  return accuracy, running_loss

In [24]:
# 모형 평가를 위한 모듈

def model_evaluate(model, data_loader, loss_fn, device):

  size = len(data_loader.dataset)
  model.eval()

  with torch.no_grad():
    corr = 0
    running_loss = 0

    for X, y in data_loader:
      X, y = X.to(device), y.long().to(device)


      # 예측 확률 계산
      pred = model(X)
      loss = loss_fn(pred, y)

      # accuracy 계산을 위한 정답 개수 계산
      corr += (pred.argmax(1) == y).type(torch.float).sum().item()

      # 평균 크로스엔트로피 계산을 위한 합
      running_loss += loss.item() * X.size(0)

  # accuracy
  accuracy = corr / size
  running_loss = running_loss / size

  return accuracy, running_loss

Stratified K-fold CV

In [None]:
#lr=0.2, patence=3, factor=0.3/ adamw, rmsprop/ lr한번 조절해보자!/loss fuction도..? 0.25앵간했음
#RBGA로 안나타나는 png파일을
#from PIL import Image

#image = Image.open('path_to_your_image.png')
#image = image.convert('RGBA')
#위 방식처럼 rgba방식으로 표현가능

In [25]:
from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 602)

In [26]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [27]:
df_cv_data = pd.DataFrame(cv_data)
df_new_data = pd.DataFrame(new_data)

In [28]:
list_train_idx = []
list_test_idx = []

for train_idx, test_idx in cv.split(df_cv_data[1], df_cv_data[2]):
  list_train_idx.append(train_idx)
  list_test_idx.append(test_idx)

In [None]:
max_epoch = 20   ### 에포크 수정 ###

fold = 0

list_test_history = []

for train_idx, test_idx in zip(list_train_idx, list_test_idx):
  fold += 1


  # 모델 초기화
  mobilenetv2 = models.mobilenet_v2(pretrained = 'IMAGENET1K_V2')
  model = ImageModel(mobilenetv2)

  if torch.cuda.is_available():
    model.cuda()


  train_data = [cv_data[i] for i in train_idx]
  test_data = [cv_data[i] for i in test_idx]

  train_dataset = RoBaMFImageDataset(dataset = train_data, img_idx = 1, label_idx = 2, transform = train_transform)
  test_dataset = RoBaMFImageDataset(dataset = test_data, img_idx = 1, label_idx = 2, transform = test_transform)

  train_loader = DataLoader(train_dataset, batch_size = 16, shuffle = True)
  test_loader = DataLoader(test_dataset, batch_size = 16, shuffle = True)


  optimizer = optim.SGD(model.parameters(), lr = 1.66 * 1e-3)
  scheduler = ReduceLROnPlateau(optimizer, mode = 'min', patience = 3, factor = 0.3)
  loss_fn = nn.CrossEntropyLoss()


  test_history = []
  for epoch in range(max_epoch):

    # 각 에포크별 모형 훈련 -> train accuracy와 손실함수 반환
    train_accuracy, train_loss = model_train(model, train_loader, loss_fn, optimizer, device)

    # 시험 데이터에 모형 적합 -> valiation accuray와 손실함수 반환
    val_accuracy, val_loss = model_evaluate(model, test_loader, loss_fn, device)

    scheduler.step(train_loss)

    test_history.append(val_accuracy)

    print(f'''fold {fold:d},  epoch {epoch + 1:02d} -------------------------------------------------- \n
            train_accuracy: {train_accuracy:.5f}, train_loss: {train_loss:.5f}, val_accuracy: {val_accuracy:.5f}, val_loss: {val_loss:.5f} \n\n''')

  print(f'''=========================================================================== \n
              fold {fold:d}  Ended.  \n
              =========================================================================== \n ''')


  # 모형 가중치 저장
  torch.save(model.state_dict(), f'Machine Learning/Model Weights/ImageWeight{fold}.pth')

  list_test_history.append(test_history)


100%|██████████| 360/360 [01:51<00:00,  3.23it/s]


fold 1,  epoch 01 -------------------------------------------------- 

            train_accuracy: 0.20885, train_loss: 1.78785, val_accuracy: 0.27153, val_loss: 1.78062 




100%|██████████| 360/360 [01:56<00:00,  3.08it/s]


fold 1,  epoch 02 -------------------------------------------------- 

            train_accuracy: 0.28611, train_loss: 1.77270, val_accuracy: 0.26875, val_loss: 1.76238 




100%|██████████| 360/360 [01:57<00:00,  3.08it/s]


fold 1,  epoch 03 -------------------------------------------------- 

            train_accuracy: 0.29288, train_loss: 1.74775, val_accuracy: 0.26944, val_loss: 1.73940 




100%|██████████| 360/360 [01:35<00:00,  3.75it/s]


fold 1,  epoch 04 -------------------------------------------------- 

            train_accuracy: 0.29896, train_loss: 1.72441, val_accuracy: 0.28958, val_loss: 1.72306 




100%|██████████| 360/360 [01:28<00:00,  4.05it/s]


fold 1,  epoch 05 -------------------------------------------------- 

            train_accuracy: 0.32674, train_loss: 1.70813, val_accuracy: 0.30764, val_loss: 1.70959 




100%|██████████| 360/360 [01:29<00:00,  4.03it/s]


fold 1,  epoch 06 -------------------------------------------------- 

            train_accuracy: 0.33802, train_loss: 1.69198, val_accuracy: 0.31806, val_loss: 1.69901 




100%|██████████| 360/360 [01:31<00:00,  3.94it/s]


fold 1,  epoch 07 -------------------------------------------------- 

            train_accuracy: 0.36354, train_loss: 1.67356, val_accuracy: 0.33958, val_loss: 1.68526 




100%|██████████| 360/360 [01:35<00:00,  3.75it/s]


fold 1,  epoch 08 -------------------------------------------------- 

            train_accuracy: 0.37760, train_loss: 1.66141, val_accuracy: 0.35764, val_loss: 1.67047 




 94%|█████████▎| 337/360 [02:01<00:09,  2.42it/s]

In [None]:
df_test_history = pd.DataFrame(list_test_history)
df_test_history.to_csv('Machine Learning/Baseline CVs/Image Baseline.csv')

New Data Prediction

In [55]:
new_data[1]

['/content/drive/MyDrive/NaverNews/politics/politics9.jpg', 0]

In [65]:
max_epoch = 5   ### 에포크 수정 ###

# 모델 초기화
mobilenetv2 = models.mobilenet_v2(weights = 'DEFAULT')
model = ImageModel(mobilenetv2)

for i, (name, param) in enumerate(model.named_parameters()):
  param.requires_grad = False
  if i == 155:
    break

if torch.cuda.is_available():
  model.cuda()


train_data = cv_data
test_data = new_data

train_dataset = RoBaMFImageDataset(dataset = train_data, img_idx = 1, label_idx = 2, transform = train_transform)
test_dataset = RoBaMFImageDataset(dataset = test_data, img_idx = 1, label_idx = 2, transform = test_transform)

train_loader = DataLoader(train_dataset, batch_size = 16, shuffle = True)
test_loader = DataLoader(test_dataset, batch_size = 16, shuffle = True)


optimizer = optim.Adam(model.parameters(), lr = 0.2)
scheduler = ReduceLROnPlateau(optimizer, mode = 'min', patience = 3, factor = 0.3)
loss_fn = nn.CrossEntropyLoss()


test_history = []
for epoch in range(max_epoch):

  # 각 에포크별 모형 훈련 -> train accuracy와 손실함수 반환
  train_accuracy, train_loss = model_train(model, train_loader, loss_fn, optimizer, device)

  # 시험 데이터에 모형 적합 -> valiation accuray와 손실함수 반환
  val_accuracy, val_loss = model_evaluate(model, test_loader, loss_fn, device)

  scheduler.step(train_loss)

  test_history.append(val_accuracy)

  print(f'''epoch {epoch + 1:02d} -------------------------------------------------- \n
          train_accuracy: {train_accuracy:.5f}, train_loss: {train_loss:.5f}, val_accuracy: {val_accuracy:.5f}, val_loss: {val_loss:.5f} \n\n''')



  0%|          | 0/450 [00:02<?, ?it/s]


FileNotFoundError: ignored