In [1]:
!pip install unidecode

Collecting unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |█▍                              | 10kB 17.8MB/s eta 0:00:01[K     |██▊                             | 20kB 2.2MB/s eta 0:00:01[K     |████▏                           | 30kB 3.2MB/s eta 0:00:01[K     |█████▌                          | 40kB 2.1MB/s eta 0:00:01[K     |██████▉                         | 51kB 2.6MB/s eta 0:00:01[K     |████████▎                       | 61kB 3.1MB/s eta 0:00:01[K     |█████████▋                      | 71kB 3.5MB/s eta 0:00:01[K     |███████████                     | 81kB 4.0MB/s eta 0:00:01[K     |████████████▍                   | 92kB 4.4MB/s eta 0:00:01[K     |█████████████▊                  | 102kB 3.5MB/s eta 0:00:01[K     |███████████████▏                | 112kB 3.5MB/s eta 0:00:01[K     |████████████████▌               | 122kB 3.5MB/

# Libraries

In [0]:
import pandas as pd

import datetime

import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
from collections import Counter
import os
from argparse import Namespace

import unidecode
import random

from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder

# Data preprocessing

In [0]:
reviews = pd.read_table('data/reviews.tsv')

In [0]:
reviews.dropna(subset=['content'], inplace=True)

In [0]:
def get_data_from_dataframe(df, batch_size, seq_size):
    
    text = " ".join(df.content.apply(unidecode.unidecode).values.flatten())
    
    text = text.split()

    word_counts = Counter(text)
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    int_to_vocab = {k: w for k, w in enumerate(sorted_vocab)}
    vocab_to_int = {w: k for k, w in int_to_vocab.items()}
    n_vocab = len(int_to_vocab)

    print('Vocabulary size', n_vocab)

    int_text = [vocab_to_int[w] for w in text]
    num_batches = int(len(int_text) / (seq_size * batch_size))
    in_text = int_text[:num_batches * batch_size * seq_size]
    out_text = np.zeros_like(in_text)
    out_text[:-1] = in_text[1:]
    out_text[-1] = in_text[0]
    in_text = np.reshape(in_text, (batch_size, -1))
    out_text = np.reshape(out_text, (batch_size, -1))
    return int_to_vocab, vocab_to_int, n_vocab, in_text, out_text

In [0]:
def get_batches(in_text, out_text, batch_size, seq_size):
    num_batches = np.prod(in_text.shape) // (seq_size * batch_size)
    for i in range(0, num_batches * seq_size, seq_size):
        yield in_text[:, i:i+seq_size], out_text[:, i:i+seq_size]

## KNN

In [0]:
def column_label_encoding(df, le_colname):
  le = LabelEncoder()
  df[le_colname] = le.fit_transform(df[le_colname].to_list()) 
  return df

def mean_score_encoding(df, grouping_columns, target_columns):
  for target_column in target_columns:
    mean_group = df.groupby(grouping_columns)[target_column].mean().reset_index()
    mean_group.columns = grouping_columns + ['_'.join(grouping_columns) + '_mean_' + target_column]
    df = df.merge(mean_group, on=grouping_columns)
  return df

def convert_string_to_date(date_time_str):
  conversion = datetime.datetime.strptime(date_time_str, "%Y-%m-%d")
  return conversion

def change_date_for_column(df, column):
    return df[column].apply(convert_string_to_date)

def preprocess_knn(df):

  df = df[["artist", 
           "score", 
           "pub_date", 
           "best_new_music", 
           "genre", 
           "label", 
           "acousticness",
           "danceability",
           "energy",
           "instrumental",
           "liveness",
           "loudness",
           "speechiness",
           "tempo",
           "valence",
           "popularity"
  ]]
  df = column_label_encoding(df, 'label')
  df = mean_score_encoding(df, ['artist'], ['score'])
  df['pub_date'] = change_date_for_column(df, 'pub_date')
  df['pub_date'] = pd.to_numeric(df['pub_date'], errors='coerce')
  df = pd.get_dummies(df.drop(["artist"], axis=1))

  return df

In [0]:
def perform_KNN(df, n_neighbors):
    neighs = NearestNeighbors(n_neighbors=n_neighbors)
    neigs = neighs.fit(df)
    _, indices = neighs.kneighbors(reviews_knn)
        
    return indices

In [52]:
# Get nearest neighbors for each album
reviews_knn = preprocess_knn(reviews)
neighborhoods = perform_KNN(reviews_knn, 16)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [0]:
# Selecting a random album to generate a review
random_review_index = random.randint(0, reviews.shape[0])
test_album = reviews.iloc[random_review_index]
reviews = reviews.drop([random_review_index])

In [0]:
# Selecting reviews from nearest neighbors
reviews_from_cluster = reviews.iloc[neighborhoods[random_review_index]]

## RNN

In [0]:
class RNNModule(nn.Module):
    def __init__(self, n_vocab, seq_size, embedding_size, lstm_size):
        super(RNNModule, self).__init__()
        self.seq_size = seq_size
        self.lstm_size = lstm_size
        self.embedding = nn.Embedding(n_vocab, embedding_size)
        self.lstm = nn.LSTM(embedding_size,
                            lstm_size,
                            batch_first=True)
        self.dense = nn.Linear(lstm_size, n_vocab)
        
    def forward(self, x, prev_state):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.dense(output)

        return logits, state
        
    def zero_state(self, batch_size):
        return (torch.zeros(1, batch_size, self.lstm_size),
                torch.zeros(1, batch_size, self.lstm_size))

In [0]:
def get_loss_and_train_op(net, lr=0.001):
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(net.parameters(), lr=lr)

        return criterion, optimizer

In [0]:
flags = Namespace(
    seq_size=32,
    batch_size=32,
    embedding_size=64,
    lstm_size=64,
    gradients_norm=5,
    initial_words=['This', 'album'],
    predict_top_k=5,
    checkpoint_path='checkpoint',
)

In [84]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [85]:
reviews_index_to_train = reviews_from_cluster.index.tolist() + random.sample(range(reviews.shape[0]), 256)

int_to_vocab, vocab_to_int, n_vocab, in_text, out_text = get_data_from_dataframe(
    reviews.iloc[reviews_index_to_train], 
    flags.batch_size, 
    flags.seq_size
)

net = RNNModule(n_vocab, flags.seq_size, flags.embedding_size, flags.lstm_size)
net = net.to(device)

criterion, optimizer = get_loss_and_train_op(net, 0.05)

iteration = 0

Vocabulary size 36482


# Training

In [0]:
def predict(device, net, words, n_vocab, vocab_to_int, int_to_vocab, top_k=5):
    net.eval()

    state_h, state_c = net.zero_state(1)
    state_h = state_h.to(device)
    state_c = state_c.to(device)
    for w in words:
        ix = torch.tensor([[vocab_to_int[w]]]).to(device)
        output, (state_h, state_c) = net(ix, (state_h, state_c))
    
    _, top_ix = torch.topk(output[0], k=top_k)
    choices = top_ix.tolist()
    choice = np.random.choice(choices[0])

    words.append(int_to_vocab[choice])
    for _ in range(100):
        ix = torch.tensor([[choice]]).to(device)
        output, (state_h, state_c) = net(ix, (state_h, state_c))

        _, top_ix = torch.topk(output[0], k=top_k)
        choices = top_ix.tolist()
        choice = np.random.choice(choices[0])
        words.append(int_to_vocab[choice])

    print(' '.join(words))

In [0]:
n_epochs = 20
for e in range(n_epochs):
    batches = get_batches(in_text, out_text, flags.batch_size, flags.seq_size)
    state_h, state_c = net.zero_state(flags.batch_size)

    # Transfer data to GPU
    state_h = state_h.to(device)
    state_c = state_c.to(device)
    for x, y in batches:
        iteration += 1

        # Tell it we are in training mode
        net.train()

        # Reset all gradients
        optimizer.zero_grad()

        # Transfer data to GPU
        x = torch.tensor(x).to(device)
        y = torch.tensor(y).to(device)

        logits, (state_h, state_c) = net(x, (state_h, state_c))
        loss = criterion(logits.transpose(1, 2), y)

        state_h = state_h.detach()
        state_c = state_c.detach()

        loss_value = loss.item()

        # Perform back-propagation
        loss.backward()
        
        _ = torch.nn.utils.clip_grad_norm_(net.parameters(), flags.gradients_norm)

        # Update the network's parameters
        optimizer.step()

        if iteration % 100 == 0:
            print('Epoch: {}/{}'.format(e, n_epochs),
                  'Iteration: {}'.format(iteration),
                  'Loss: {}'.format(loss_value))

    predict(device, net, flags.initial_words, n_vocab,
                    vocab_to_int, int_to_vocab, top_k=5)

Epoch: 0/20 Iteration: 100 Loss: 8.372857093811035
This album that and a way of her first place. that and his first and a bit and and his own and the band to make its most hardcore first place. the band is a bit is a good that a lot in his music of a lot to a few of her and the same more personal Mind is an album and that a few and in the album the band and and and the way that that and and his music and and that the band that it to be the band of his own and his most Slugs in its
Epoch: 1/20 Iteration: 200 Loss: 7.616166591644287
