In [None]:
from __future__ import print_function
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np 
import torchvision.transforms as transforms
import copy
import librosa
import soundfile as sf

In [None]:
# Hyperparameter: FFT size handed to librosa.stft wherever this notebook computes an STFT.
N_FFT = 2048

In [None]:

def read_audio_spectum(filename):
    """Load an audio file and return its log-magnitude STFT and sample rate.

    Args:
        filename: Path of the audio file; passed straight to ``librosa.load``.

    Returns:
        A ``(S, sr)`` tuple. ``S`` is ``log1p(|STFT|)`` computed with
        ``n_fft=N_FFT``; ``sr`` is the sample rate returned by ``librosa.load``.
    """
    # At most 58.05 s are read; the original author chose this value
    # "so as to make sizes convenient".
    x, sr = librosa.load(filename, duration=58.05)
    # n_fft is passed by keyword: recent librosa releases accept it only that way.
    S = librosa.stft(x, n_fft=N_FFT)
    # Only the log-compressed magnitude is kept; the phase is not returned.
    return np.log1p(np.abs(S)), sr

In [None]:
# Load the style and content clips as log-magnitude spectrograms.
style_audio, style_sr = read_audio_spectum('./alpha.wav')
content_audio, content_sr = read_audio_spectum('./beta.wav')

# Report whether both spectrograms have the same shape.
# NOTE(review): the original comment said an error occurs when a clip is shorter
# than 50 s; nothing here enforces that -- confirm the intended requirement.
if style_audio.shape != content_audio.shape:
    print("Not same!")
else:
    print('Sample Size Same!')
    print('Shape : ', style_audio.shape)

Sample Size Same!
Shape :  (1025, 256)


In [None]:
# Prepend a batch axis of size 1: (1025, T) -> (1, 1025, T).
# (If these were tensors instead of NumPy arrays, unsqueeze(dim=0) would do the same.)
style_audio = np.reshape(style_audio, (1, 1025, -1))
content_audio = np.reshape(content_audio, (1, 1025, -1))
print(style_audio.shape)
print(content_audio.shape)

(1, 1025, 256)
(1, 1025, 256)


In [None]:
# Class declarations

class CNN(nn.Module):
    """One 1-D convolution taking 1025 input channels and producing 32 output channels."""

    def __init__(self):
        super().__init__()
        # Original author's note: a 1-D convolution is used "because of noise".
        self.cnn1 = nn.Conv1d(1025, 32, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        """Apply ``cnn1`` and flatten everything after the first (batch) axis."""
        features = self.cnn1(x)
        batch = features.size(0)
        return features.view(batch, -1)
		
class GramMatrix(nn.Module):
    """Gram matrix of a 3-D tensor, divided by the tensor's element count."""

    def forward(self, input):
        """Return ``F @ F.T / (a*b*c)`` where ``F`` is ``input`` viewed as ``(a*b, c)``.

        ``input`` must be 3-dimensional with size ``(a, b, c)``.
        """
        d0, d1, d2 = input.size()
        flat = input.view(d0 * d1, d2)
        # Matrix product of the flattened features with their own transpose.
        gram = flat @ flat.t()
        return gram / (d0 * d1 * d2)
	
class StyleLoss(nn.Module):
    """Pass-through module that records the MSE between weighted Gram matrices."""

    def __init__(self, target, weight):
        """Store the detached, weight-scaled ``target`` Gram matrix and the ``weight``."""
        super().__init__()
        self.weight = weight
        # The target is a constant of the optimisation: detach it and pre-scale it.
        self.target = target.detach() * weight
        self.gram = GramMatrix()
        self.criterion = nn.MSELoss()

    def forward(self, input):
        """Record the weighted Gram matrix and loss of ``input``; return a copy of ``input``."""
        self.output = input.clone()
        self.G = self.gram(input) * self.weight
        self.loss = self.criterion(self.G, self.target)
        return self.output

    def backward(self, retain_graph=True):
        """Back-propagate the loss stored by the last ``forward`` call and return it."""
        self.loss.backward(retain_graph=retain_graph)
        return self.loss


In [None]:
import copy
# Default arguments for get_style_model_and_losses.
style_weight = 2500  # forwarded to StyleLoss as its weight
style_layer_default = 'conv_1'  # layer name that gets a style loss attached
def get_style_model_and_losses(cnn, style_float, style_weight=style_weight, style_layer=style_layer_default):  # STYLE WEIGHT
    """Build the model used for style optimisation and collect its style-loss modules.

    Args:
        cnn: Network exposing a ``cnn1`` layer; it is deep-copied, not modified.
        style_float: Style tensor fed through the model to compute the style target.
        style_weight: Weight forwarded to ``StyleLoss``.
        style_layer: Layer name(s) that receive a style loss. Membership is tested
            with ``in``, so a plain string is matched as a substring.

    Returns:
        ``(model, style_losses)``: an ``nn.Sequential`` containing ``conv_1`` (and
        ``style_loss_1`` when selected) and the list of attached ``StyleLoss`` modules.
    """
    # Work on a private copy so the returned model does not share its layer
    # object with the network the caller passed in.
    cnn = copy.deepcopy(cnn)
    style_losses = []
    model = nn.Sequential()  # the new Sequential module network
    gram = GramMatrix()  # needed to compute the style target

    name = 'conv_1'
    model.add_module(name, cnn.cnn1)
    # Move the layer to wherever the style tensor lives. The earlier version called
    # .cuda() on the still-empty Sequential, which moved no parameters at all.
    model.to(style_float.device)

    if name in style_layer:
        # The target is a constant (StyleLoss detaches it), so no graph is needed.
        with torch.no_grad():
            target_feature_gram = gram(model(style_float))
        style_loss = StyleLoss(target_feature_gram, style_weight)
        model.add_module("style_loss_1", style_loss)
        style_losses.append(style_loss)
    return model, style_losses

In [None]:
# Optimisation settings read by get_input_param_optimizer and run_style_transfer.
learning_rate_initial = 0.03
num_steps = 100

# Feature extractor; moved to the GPU when one is available.
cnn = CNN()
if torch.cuda.is_available():
    cnn = cnn.cuda()

In [None]:
# Uses the Adam optimizer.
def get_input_param_optimizer(input_float):
    """Wrap ``input_float`` in a trainable parameter and build an Adam optimizer for it.

    Returns:
        ``(input_param, optimizer)`` where ``optimizer`` updates only ``input_param``
        with learning rate ``learning_rate_initial``.
    """
    input_param = nn.Parameter(input_float.data)
    # (An Adagrad variant with lr_decay=0.0001 was left commented out by the original author.)
    adam_settings = dict(lr=learning_rate_initial, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
    optimizer = optim.Adam([input_param], **adam_settings)
    return input_param, optimizer

In [None]:
def run_style_transfer(cnn, style_float, input_float, num_steps=num_steps, style_weight=style_weight):  # STYLE WEIGHT, NUM_STEPS
    """Optimise ``input_float`` so its Gram matrix matches that of ``style_float``.

    Args:
        cnn: Network passed to ``get_style_model_and_losses``.
        style_float: Style tensor used to compute the style target.
        input_float: Starting point of the optimisation.
        num_steps: The optimiser keeps stepping while the evaluation counter
            is ``<= num_steps``.
        style_weight: Weight forwarded to ``get_style_model_and_losses``.

    Returns:
        The optimised input as a tensor detached from the graph, clamped to ``[0, 1]``.
    """
    print('Building the style transfer model..')
    # Build the model around the style target and collect its loss modules.
    model, style_losses = get_style_model_and_losses(cnn, style_float, style_weight)
    input_param, optimizer = get_input_param_optimizer(input_float)
    print('Optimizing..')

    run = [0]  # mutable evaluation counter shared with the closure

    def closure():
        # Keep the optimised input inside [0, 1] before every evaluation.
        # NOTE(review): the inputs are log1p magnitudes, which may exceed 1 -- confirm
        # that clamping to [0, 1] is intended for spectrograms.
        with torch.no_grad():
            input_param.clamp_(0, 1)

        optimizer.zero_grad()
        model(input_param)  # the forward pass stores a loss on every StyleLoss module
        style_score = 0
        for sl in style_losses:
            style_score += sl.backward()

        run[0] += 1
        if run[0] % 100 == 0:
            # Print the counter value itself, not the one-element list holding it.
            print("run {}:".format(run[0]))
            print('Style Loss : {:8f}'.format(float(style_score)))
            print()

        return style_score

    # The step call must sit inside the loop: it is what invokes the closure and
    # advances run[0]. With the call outside the loop, the loop never terminated.
    while run[0] <= num_steps:
        optimizer.step(closure)

    with torch.no_grad():
        input_param.clamp_(0, 1)
    return input_param.detach()

In [None]:
from torch.autograd import Variable
import torch.optim as optim
# Put both spectrograms on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
style_float = torch.from_numpy(style_audio).to(device)
content_float = torch.from_numpy(content_audio).to(device)
input_float = content_float.clone()

# Run the optimisation, starting from a copy of the content spectrogram.
output = run_style_transfer(cnn, style_float, input_float)
# The optimisation ran in torch; bring the result back to host memory, drop the
# leading batch axis (squeeze removes it, it does not add one) and convert to NumPy.
output = output.cpu().squeeze(0).numpy()

# Undo the log1p applied in read_audio_spectum to get linear magnitudes back.
a = np.expm1(output)

# Iterative phase reconstruction (Griffin-Lim style): start from a random phase and
# alternate istft/stft 500 times, keeping the magnitude `a` fixed each round.
p = 2 * np.pi * np.random.random_sample(a.shape) - np.pi
for i in range(500):  # plain range: tqdm was used here without ever being imported
    S = a * np.exp(1j * p)
    x = librosa.istft(S)
    # n_fft is passed by keyword: recent librosa releases accept it only that way.
    p = np.angle(librosa.stft(x, n_fft=N_FFT))

# Write the reconstructed waveform using the style clip's sample rate.
OUTPUT_FILENAME = 'output.wav'
sf.write(OUTPUT_FILENAME, x, style_sr)
print('DONE...')

Building the style transfer model..
Optimizing..
1


KeyboardInterrupt: ignored