In [53]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import collections
# import csv

In [2]:
# Load the training file: tab-separated, no header row.
data = pd.read_csv('./snips/snips_train_actual.csv', sep='\t', header=None)

In [3]:
# The file has no header, so name the two positional columns explicitly:
# first column -> 'labels', second column -> 'texts'.
data.columns = ['labels', 'texts']

In [4]:
# List the distinct label values present in the training data.
data['labels'].unique()

array(['AddToPlaylist', 'BookRestaurant', 'GetWeather', 'RateBook',
       'SearchCreativeWork', 'SearchScreeningEvent', 'PlayMusic'],
      dtype=object)

In [5]:

# Peek at the label stored in the row whose index is 0.
data.loc[0, 'labels']

'AddToPlaylist'

In [6]:
# Give every distinct label an integer id, numbered in the order in which
# the labels are returned by unique().
label_map = {
    label_text: class_id
    for class_id, label_text in enumerate(data['labels'].unique())
}

label_map

{'AddToPlaylist': 0,
 'BookRestaurant': 1,
 'GetWeather': 2,
 'RateBook': 3,
 'SearchCreativeWork': 4,
 'SearchScreeningEvent': 5,
 'PlayMusic': 6}

In [7]:
# Add an integer 'classes' column by looking every label up in label_map
# (a label missing from label_map raises KeyError).
data['classes'] = data['labels'].map(label_map.__getitem__)

In [8]:
# Display the frame to check the newly added 'classes' column next to the labels.
data

Unnamed: 0,labels,texts,classes
0,AddToPlaylist,add another song to the cita romántica playlist,0
1,AddToPlaylist,add clem burke in my playlist pre-party r&b jams,0
2,AddToPlaylist,add live from aragon ballroom to trapeo,0
3,AddToPlaylist,add unite and win to my night out,0
4,AddToPlaylist,add track to my digster future hits,0
...,...,...,...
13779,PlayMusic,play the most popular track from valery alexan...,6
13780,PlayMusic,play some good movement music by brian littrel...,6
13781,PlayMusic,play 2007 tunes by bunny berigan,6
13782,PlayMusic,play a ballad form 2014 by double on google music,6


In [9]:
# Count how many examples each label has, to check class balance.
data['labels'].value_counts()

PlayMusic               2000
GetWeather              2000
BookRestaurant          1973
SearchScreeningEvent    1959
RateBook                1956
SearchCreativeWork      1954
AddToPlaylist           1942
Name: labels, dtype: int64

In [10]:
# Scratch check: draw a single integer from [0, 2000) using NumPy's global RNG.
# No seed is set in the visible cells, so the value differs between runs.
np.random.choice(range(2000), replace=False)

1965

In [12]:
# Features are the raw texts; targets are the integer class ids.
X, y = data['texts'], data['classes']

In [13]:
# Hold out 5% of the rows; stratifying on y keeps the class proportions
# the same in both parts, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    stratify=y,
    random_state=0,
)

In [14]:
# Per-class counts in the training part of the split.
y_train.value_counts()

6    1900
2    1900
1    1874
5    1861
3    1858
4    1856
0    1845
Name: classes, dtype: int64

In [15]:
# Per-class counts in the held-out part of the split.
y_test.value_counts()

6    100
2    100
1     99
5     98
4     98
3     98
0     97
Name: classes, dtype: int64

In [21]:
# Check what the targets look like once converted from a Series to a NumPy array.
y_train.to_numpy()

array([0, 4, 2, ..., 2, 0, 0])

In [22]:
# list(X_train)

In [45]:
class SnipsDataLoader():
    """Load tab-separated (label, text) files and encode labels as integers.

    Every file is read as a headerless, tab-delimited table whose first
    column is the label and whose second column is the text.  The
    label <-> index mapping is built from the training labels only and is
    then applied to every split that was loaded.

    Attributes:
        train_data, valid_data, test_data: ``{'X': texts, 'y': labels}``
            dicts of pandas Series, or ``None`` for a split with no path.
        text_to_index_label_mapping: label string -> integer class index.
        index_to_text_label_mapping: integer class index -> label string.
    """

    def __init__(self, train_path, valid_path=None, test_path=None):
        """Read the given files and integer-encode their labels.

        Args:
            train_path: path of the training file (required).
            valid_path: optional path of a validation file.
            test_path: optional path of a test file.
        """
        self.train_data = self.load_dataset(train_path)
        self.valid_data = self.load_dataset(valid_path) if valid_path is not None else None
        self.test_data = self.load_dataset(test_path) if test_path is not None else None

        self.create_label_mapping()

    def load_dataset(self, data_path):
        """Read one file and return ``{'X': texts Series, 'y': labels Series}``."""
        data = pd.read_csv(data_path, header=None, delimiter='\t')
        data.columns = ['labels', 'texts']
        return {'X': data['texts'], 'y': data['labels']}

    def create_label_mapping(self):
        """Build both label mappings from the training labels and encode all splits.

        Indices follow the order in which labels first appear in the training
        data.  The ``'y'`` entry of every loaded split is replaced by its
        integer-encoded version.

        Raises:
            KeyError: if a split contains a label absent from the training data.
        """
        train_labels = self.train_data['y'].unique()
        self.text_to_index_label_mapping = {}
        self.index_to_text_label_mapping = {}
        for index, label in enumerate(train_labels):
            self.text_to_index_label_mapping[label] = index
            self.index_to_text_label_mapping[index] = label

        splits = (
            ('train', self.train_data),
            ('valid', self.valid_data),
            ('test', self.test_data),
        )
        for split_name, split in splits:
            # Test for presence explicitly instead of relying on dict truthiness.
            if split is not None:
                split['y'] = self._encode_labels(split['y'], split_name)

    def _encode_labels(self, labels, split_name):
        """Map a Series of label strings to class indices, naming the split on failure."""
        mapping = self.text_to_index_label_mapping
        try:
            return labels.map(lambda label: mapping[label])
        except KeyError as err:
            # Same exception type as a bare dict lookup, but say where it came from.
            raise KeyError(
                "label {!r} in the {} split does not occur in the training labels".format(
                    err.args[0], split_name
                )
            ) from err

    def split_train_valid(self, valid_size, keep_class_ratios=True, random_state=0):
        """Carve a validation split out of the current training data.

        Replaces ``train_data`` with the remaining rows and ``valid_data`` with
        the held-out rows; a validation split loaded earlier is overwritten.

        Args:
            valid_size: forwarded to ``train_test_split`` as ``test_size``.
            keep_class_ratios: when true, stratify the split on the labels.
            random_state: seed forwarded to ``train_test_split``.
        """
        X, y = self.train_data['X'], self.train_data['y']
        X_train, X_valid, y_train, y_valid = train_test_split(
            X,
            y,
            test_size=valid_size,
            random_state=random_state,
            # stratify=None is train_test_split's default, i.e. a plain split.
            stratify=y if keep_class_ratios else None,
        )

        self.train_data = {'X': X_train, 'y': y_train}
        self.valid_data = {'X': X_valid, 'y': y_valid}

    def get_train_data(self):
        """Return ``(texts as list, labels as NumPy array)`` for the training split."""
        return list(self.train_data['X']), self.train_data['y'].to_numpy()

    def get_valid_data(self):
        """Return ``(texts as list, labels as NumPy array)`` for the validation split."""
        return list(self.valid_data['X']), self.valid_data['y'].to_numpy()

    def get_test_data(self):
        """Return ``(texts as list, labels as NumPy array)`` for the test split."""
        return list(self.test_data['X']), self.test_data['y'].to_numpy()

In [46]:
# Locations of the tab-separated SNIPS files read by SnipsDataLoader.
TRAIN_PATH = './snips/snips_train_actual.csv'
# NOTE(review): despite the VALID_ prefix, this path points at the *test* file.
VALID_PATH = './snips/snips_test_actual.csv'

In [47]:
# VALID_PATH points at the test file, so load it as the test split.
# split_train_valid() replaces valid_data with rows carved out of the training
# data, so anything passed as valid_path would be loaded and then discarded.
data_loader = SnipsDataLoader(TRAIN_PATH, test_path=VALID_PATH)
data_loader.split_train_valid(valid_size=0.05, keep_class_ratios=True)

In [48]:
# Sanity check: the loader's training split has as many rows as the manual split above.
len(X_train) == len(data_loader.train_data['X'])

True

In [49]:
# Per-class counts of the loader's validation split, for comparison with the manual split.
data_loader.valid_data['y'].value_counts()

6    100
2    100
1     99
5     98
4     98
3     98
0     97
Name: labels, dtype: int64

In [56]:
# Fetch the loader's splits: texts come back as Python lists, labels as NumPy arrays.
# Note: this rebinds X_train / y_train, replacing the Series from the manual split.
X_train, y_train = data_loader.get_train_data()
X_valid, y_valid = data_loader.get_valid_data()

In [61]:
class FeatureExtractor():
    """Turn raw text examples into bag-of-words count matrices.

    Texts are lower-cased and split on whitespace.  The vocabulary is built
    from the training texts only and keeps every word that occurs in at least
    ``keep_words_threshold`` training examples.  Each split is then encoded as
    a ``(num_examples, vocab_size)`` array of per-example word counts; words
    outside the vocabulary are ignored.

    ``extract_features`` replaces ``X_train`` / ``X_valid`` / ``X_test`` with
    their encodings, so it is meant to be called once per instance.
    """

    def __init__(self, X_train, X_valid=None, X_test=None):
        """Store the raw texts of each split.

        Args:
            X_train: iterable of training text strings.
            X_valid: optional iterable of validation text strings.
            X_test: optional iterable of test text strings.
        """
        self.X_train = X_train
        self.X_valid = X_valid
        self.X_test = X_test

    def extract_features(self, keep_words_threshold=5):
        """Tokenize all splits, build the vocabulary and encode every split.

        Args:
            keep_words_threshold: minimum number of training examples a word
                must appear in to be kept in the vocabulary.
        """
        self.keep_words_threshold = keep_words_threshold

        # Compare against None rather than using truthiness: `if array:` raises
        # for multi-element NumPy arrays / pandas Series, and an empty list
        # would otherwise be left unencoded.
        has_valid = self.X_valid is not None
        has_test = self.X_test is not None

        self.X_train = self.preprocess_data(self.X_train)
        if has_valid:
            self.X_valid = self.preprocess_data(self.X_valid)
        if has_test:
            self.X_test = self.preprocess_data(self.X_test)

        # The vocabulary comes from the training split only.
        self.create_vocab(self.X_train)

        self.X_train = self.create_encodings(self.X_train)
        if has_valid:
            self.X_valid = self.create_encodings(self.X_valid)
        if has_test:
            self.X_test = self.create_encodings(self.X_test)

    def preprocess_data(self, text_data):
        """Return one list of lower-cased, whitespace-split words per example."""
        return [[word.lower() for word in example.split()] for example in text_data]

    def create_vocab(self, text_data):
        """Build ``self.vocab`` (word -> column index) and ``self.vocab_size``.

        A word is counted once per example it appears in; words reaching
        ``self.keep_words_threshold`` are kept and indexed in sorted order.
        """
        word_occurences = collections.Counter()
        for example in text_data:
            # set() so repeated words within one example count only once.
            word_occurences.update(set(example))

        vocab_words = [word for word in sorted(word_occurences)
                       if word_occurences[word] >= self.keep_words_threshold]
        self.vocab = {word: index for index, word in enumerate(vocab_words)}
        self.vocab_size = len(self.vocab)

    def create_encodings(self, text_data):
        """Encode tokenized examples as a ``(len(text_data), vocab_size)`` count array."""
        encodings = np.zeros((len(text_data), self.vocab_size))

        for row, example in enumerate(text_data):
            for word, count in self.get_word_counts(example).items():
                col = self.vocab.get(word)
                # Out-of-vocabulary words have no column and are skipped.
                if col is not None:
                    encodings[row, col] = count

        return encodings

    def get_word_counts(self, word_list):
        """Return a dict-like mapping of each word in ``word_list`` to its count."""
        return collections.Counter(word_list)

    def get_train_encodings(self):
        """Return the training split (encodings once extract_features has run)."""
        return self.X_train

    def get_valid_encodings(self):
        """Return the validation split (encodings once extract_features has run)."""
        return self.X_valid

    def get_test_encodings(self):
        """Return the test split (encodings once extract_features has run)."""
        return self.X_test

In [62]:
# Read the raw texts from data_loader rather than from X_train / X_valid:
# those names are rebound to the encoded arrays below, so feeding them back in
# would make this cell fail when it is run a second time.
raw_train_texts, _ = data_loader.get_train_data()
raw_valid_texts, _ = data_loader.get_valid_data()

feature_extractor = FeatureExtractor(raw_train_texts, raw_valid_texts)
feature_extractor.extract_features(keep_words_threshold=5)
X_train = feature_extractor.get_train_encodings()
X_valid = feature_extractor.get_valid_encodings()

In [66]:
# Sanity check: one encoded row per validation label.
len(X_valid) == len(y_valid)

True

In [70]:
# Shape of the encoded training matrix: (number of examples, vocabulary size).
print(X_train.shape)

(13094, 1641)
