In [1]:
import numpy as np
import pandas as pd
from functools import reduce, partial
from pathlib import Path

In [60]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

class TextSequence:
    '''
    Character-level text sequence dataset.

    Splits each line of text into characters and encodes it as a vector of
    character IDs. Also provides CSV input/output helpers.

    Attributes
    ----------
    char_to_id : dict
        Maps a character to its integer ID. The blank ' ' is always ID 0,
        which is also the value used to pad short sequences.
    id_to_char : dict
        Inverse mapping of ``char_to_id``.
    vec_x, vec_t : np.ndarray or None
        Encoded input / teacher data. None until ``read_csv`` is called.
    '''
    def __init__(self, t_prefix='_'):
        '''
        Parameters
        ----------
        t_prefix : str
            Start character prepended to every teacher string.
            At inference time decoding starts from this character, and each
            emitted character is fed back in as the next input.
            Anything other than a single-character string is replaced by
            '_' (a notice is printed).
        '''
        self._init_vocab()
        self.vec_x = None
        self.vec_t = None
        if not isinstance(t_prefix, str) or len(t_prefix) != 1:
            print("Argument 't_prefix' is unexpected, so using '_' as 't_prefix'.")
            t_prefix = '_'
        self.__t_prefix = t_prefix

    @property
    def t_prefix(self):
        '''Start character prepended to teacher strings (read-only).'''
        return self.__t_prefix

    @property
    def vocab(self):
        '''
        The vocabulary as a ``(char_to_id, id_to_char)`` tuple.

        The dictionaries are replaced by new objects on every ``read_csv``
        call, so references obtained earlier do not follow later loads.
        '''
        return self.char_to_id, self.id_to_char

    @property
    def raw_x(self):
        '''
        Input strings decoded from ``vec_x``.

        Strings are not stored; they are rebuilt from the ID vectors on
        every access (see ``vector_to_sequence`` for what is dropped).
        '''
        return self.vector_to_sequence(self.vec_x)

    @property
    def raw_t(self):
        '''
        Teacher strings decoded from ``vec_t``.

        Strings are not stored; they are rebuilt from the ID vectors on
        every access (see ``vector_to_sequence`` for what is dropped).
        '''
        return self.vector_to_sequence(self.vec_t)

    def _init_vocab(self):
        '''Reset the vocabulary so that it contains only the blank (ID 0).'''
        self.char_to_id = {}
        self.id_to_char = {}
        self._update_vocab(' ')  # register the blank as id=0 (padding value)

    def _update_vocab(self, text):
        '''
        Add every unseen character of ``text`` to the vocabulary.

        Parameters
        ----------
        text : str
            String whose characters are registered, in order of appearance.
        '''
        for char in text:
            if char not in self.char_to_id:
                i = len(self.char_to_id)
                self.char_to_id[char] = i
                self.id_to_char[i] = char

    def _resolve_dataset(self, x, t):
        '''
        Return ``(x, t)``, substituting the stored vectors for any argument
        that is not a 2-D np.ndarray.

        Raises
        ------
        ValueError
            If a fallback is needed but no dataset has been loaded yet.
        '''
        if not isinstance(x, np.ndarray) or x.ndim != 2:
            x = self.vec_x
        if not isinstance(t, np.ndarray) or t.ndim != 2:
            t = self.vec_t
        if x is None or t is None:
            raise ValueError(
                'No dataset is available: pass 2-D arrays or call read_csv() first.')
        return x, t

    def read_csv(self, source_csv, col_x='x', col_t='y'):
        '''
        Read a CSV, encode its strings as character-ID vectors and keep them
        as the current dataset. The vocabulary is rebuilt from scratch.

        CSV format:
            x : input string
            y : teacher string

        Parameters
        ----------
        source_csv : str or pathlib.Path
            Path of the CSV file to read (passed to ``pd.read_csv``).
        col_x : str, optional
            Column name of the input strings.
        col_t : str, optional
            Column name of the teacher strings.

        Returns
        -------
        vec_x, vec_t : np.ndarray
            Character-ID matrices, one row per CSV row, right-padded with
            ID 0 (the blank) up to the longest string of each column.
            Also stored as ``self.vec_x`` / ``self.vec_t``.
        '''
        # Read the CSV and split it into inputs and teachers.
        # Teacher strings get the start character t_prefix prepended.
        df = pd.read_csv(source_csv)
        raw_x = [str(x) for x in df[col_x].values]
        raw_t = [self.t_prefix + str(t) for t in df[col_t].values]

        # Build the vocabulary. x and t are registered alternately, row by
        # row, because that order determines the ID of each character.
        self._init_vocab()
        for x, t in zip(raw_x, raw_t):
            self._update_vocab(x)
            self._update_vocab(t)

        # Vector width = length of the longest string (0 for an empty CSV).
        dim_x = max(map(len, raw_x), default=0)
        dim_t = max(map(len, raw_t), default=0)

        # Convert the strings to vectors. The builtin int is used as dtype
        # because the np.int alias no longer exists in current NumPy.
        nb_data = len(df)
        vec_x = np.zeros((nb_data, dim_x), dtype=int)
        vec_t = np.zeros((nb_data, dim_t), dtype=int)
        for i, (x, t) in enumerate(zip(raw_x, raw_t)):
            vec_x[i] = [self.char_to_id[char] for char in x.ljust(dim_x, ' ')]
            vec_t[i] = [self.char_to_id[char] for char in t.ljust(dim_t, ' ')]

        self.vec_x = vec_x
        self.vec_t = vec_t

        return vec_x, vec_t

    def to_csv(self, target_csv, x=None, t=None, col_x='x', col_t='y'):
        '''
        Save a dataset of ID vectors as CSV.

        CSV format:
            x__i : vec_x[:, i]
            y__i : vec_t[:, i]

        Parameters
        ----------
        target_csv : str or pathlib.Path
            Path of the CSV file to write.
        x : np.ndarray, optional
            Input data. ``self.vec_x`` is used when this is not a 2-D array.
        t : np.ndarray, optional
            Teacher data. ``self.vec_t`` is used when this is not a 2-D array.
        col_x : str, optional
            Column-name prefix of the input data.
        col_t : str, optional
            Column-name prefix of the teacher data.

        Returns
        -------
        pd.DataFrame
            DataFrame with the same layout as the written CSV.

        Raises
        ------
        ValueError
            If a stored vector is needed but ``read_csv`` has not been called.
        '''
        x, t = self._resolve_dataset(x, t)

        columns_x = [col_x + '__' + str(i) for i in range(x.shape[1])]
        columns_t = [col_t + '__' + str(i) for i in range(t.shape[1])]

        df = pd.DataFrame(x, columns=columns_x).join(pd.DataFrame(t, columns=columns_t))
        df.to_csv(target_csv, index=False)

        return df

    def split_data(self, x=None, t=None, test_size=0.2, shuffle=True, seed=None):
        '''
        Split a dataset into training and test parts by calling
        sklearn.model_selection.train_test_split().

        Parameters
        ----------
        x : np.ndarray, optional
            Input data. ``self.vec_x`` is used when this is not a 2-D array.
        t : np.ndarray, optional
            Teacher data. ``self.vec_t`` is used when this is not a 2-D array.
        test_size, shuffle : optional
            Forwarded unchanged to ``train_test_split``.
        seed : optional
            Forwarded to ``train_test_split`` as ``random_state``.

        Returns
        -------
        (np.ndarray, np.ndarray, np.ndarray, np.ndarray)
            x_train, x_test, t_train, t_test

        Raises
        ------
        ValueError
            If a stored vector is needed but ``read_csv`` has not been called.
        '''
        x, t = self._resolve_dataset(x, t)

        return train_test_split(x, t, test_size=test_size, random_state=seed, shuffle=shuffle)

    def result_to_csv(self, target_csv, vec_x, vec_t, vec_guess):
        '''
        Decode inputs, teachers and guesses, mark each guess as correct or
        not, print the accuracy and save the table as CSV.

        A guess is correct when its decoded string equals the decoded
        teacher string.

        Parameters
        ----------
        target_csv : str or pathlib.Path
            Path of the CSV file to write.
        vec_x, vec_t, vec_guess : np.ndarray  (ndim == 2)
            Character-ID matrices of the inputs, teachers and guesses.

        Returns
        -------
        pd.DataFrame
            Columns 'x', 'y', 'guess' (decoded strings) and 'correct' (1/0).

        Raises
        ------
        ValueError
            If any of the three arrays is not a 2-D np.ndarray.
        '''
        # Convert to strings.
        xs = self.vector_to_sequence(vec_x)
        ts = self.vector_to_sequence(vec_t)
        gs = self.vector_to_sequence(vec_guess)
        # Judge correctness.
        correct = [1 if t == guess else 0 for t, guess in zip(ts, gs)]

        # Accuracy is undefined for an empty result set; report NaN rather
        # than failing with a division by zero.
        accuracy = sum(correct) / len(correct) if correct else float('nan')
        print("Accuracy:", accuracy)

        df = pd.DataFrame({'x': xs, 'y': ts, 'guess': gs, 'correct': correct})
        df.to_csv(target_csv, index=False)

        return df

    def vector_to_sequence(self, xs):
        '''
        Decode character-ID vectors into strings.

        Every blank and every occurrence of the start character is dropped,
        wherever it appears in the row — not only padding and the leading
        marker — so blanks inside a sentence are removed as well.

        Parameters
        ----------
        xs : np.ndarray  (ndim == 2)
            String data expressed as character-ID vectors.

        Returns
        -------
        list
            List of decoded strings, one per row.

        Raises
        ------
        ValueError
            If ``xs`` is not a 2-D np.ndarray.
        '''
        if not isinstance(xs, np.ndarray) or xs.ndim != 2:
            raise ValueError('Argument "xs" is not word vector.')
        # Use this instance's vocabulary, not a module-level name that may be
        # missing or left over from a previously loaded dataset.
        id_to_char = self.id_to_char
        skipped = {' ', self.__t_prefix}
        sequences = []
        for row in xs:
            chars = (id_to_char[i] for i in row)
            sequences.append(''.join(char for char in chars if char not in skipped))
        return sequences

In [4]:
# Directory (relative to the working directory) that holds the dataset CSVs read and written by the cells below.
dataset_dir = Path("./dataset")

In [5]:
# Path of the CSV that a later cell loads with seq.read_csv(source_csv).
source_csv = dataset_dir / 'addition-test.csv'

In [61]:
# Create the encoder with the default start character '_' (t_prefix).
seq = TextSequence()

In [70]:
# Load the CSV: x and t are the character-ID matrices of the input and teacher columns
# (the same arrays are also stored on seq as vec_x / vec_t).
x, t = seq.read_csv(source_csv)

In [71]:
# Inspect the (char_to_id, id_to_char) dictionaries; the blank ' ' is always ID 0.
seq.vocab

({' ': 0,
  '1': 1,
  '6': 2,
  '+': 3,
  '7': 4,
  '5': 5,
  '_': 6,
  '9': 7,
  '2': 8,
  '0': 9,
  '3': 10,
  '8': 11,
  '4': 12},
 {0: ' ',
  1: '1',
  2: '6',
  3: '+',
  4: '7',
  5: '5',
  6: '_',
  7: '9',
  8: '2',
  9: '0',
  10: '3',
  11: '8',
  12: '4'})

In [64]:
# Decode the stored input vectors (seq.vec_x) back into strings.
seq.raw_x

['16+75',
 '52+607',
 '75+22',
 '63+22',
 '795+3',
 '706+796',
 '8+4',
 '84+317',
 '9+3',
 '6+2']

In [58]:
# Split the vectors stored on seq into train/test parts (default test_size=0.2); seed=1 fixes the shuffle.
x_train, x_test, t_train, t_test = seq.split_data(seed=1)

In [59]:
# Save the training split as CSV; to_csv also returns the written DataFrame, which the notebook displays.
seq.to_csv(dataset_dir / "train.csv", x_train, t_train)

Unnamed: 0,x__0,x__1,x__2,x__3,x__4,x__5,x__6,y__0,y__1,y__2,y__3,y__4
0,11,3,12,0,0,0,0,6,1,8,0,0
1,4,7,5,3,10,0,0,6,4,7,11,0
2,1,2,3,4,5,0,0,6,7,1,0,0
3,2,10,3,8,8,0,0,6,11,5,0,0
4,5,8,3,2,9,4,0,6,2,5,7,0
5,11,12,3,10,1,4,0,6,12,9,1,0
6,7,3,10,0,0,0,0,6,1,8,0,0
7,4,9,2,3,4,7,2,6,1,5,9,8


In [68]:
# Load another dataset into the same object.
# NOTE: read_csv rebuilds the vocabulary and replaces seq.vec_x / seq.vec_t.
seq.read_csv(dataset_dir / "date-test.csv")

(array([[ 1,  2,  3,  4,  2,  5,  6,  2,  7,  0,  8,  9, 10,  0, 11, 12,
         12, 13,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [17, 18, 19, 18,  1,  4,  0, 11, 12, 10,  0,  8, 16, 16, 20,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 8, 22, 11, 16, 22, 12, 20,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [11, 16, 22, 20, 11, 22, 12, 16,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [23, 24, 25, 26, 27, 17, 28, 10,  0, 26, 25, 29, 23, 25, 30, 31,
         25, 32,  0,  8, 33, 10,  0, 11, 12, 21, 13],
        [34, 24, 35,  0, 11,  9, 10,  0,  8, 16, 11, 20,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [37,  3,  7, 38, 39,  0, 20, 10,  0, 11, 12, 12, 36,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [40, 41,  4, 42,  6,  2,  7,  0,  8, 13, 10,  0, 11, 12,  9, 13,
          0,  0,  0,  0,  0,  0,  0,  

In [69]:
# Show the input ID matrix currently stored on seq (rows are right-padded with 0).
seq.vec_x

array([[ 1,  2,  3,  4,  2,  5,  6,  2,  7,  0,  8,  9, 10,  0, 11, 12,
        12, 13,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [17, 18, 19, 18,  1,  4,  0, 11, 12, 10,  0,  8, 16, 16, 20,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 8, 22, 11, 16, 22, 12, 20,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [11, 16, 22, 20, 11, 22, 12, 16,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [23, 24, 25, 26, 27, 17, 28, 10,  0, 26, 25, 29, 23, 25, 30, 31,
        25, 32,  0,  8, 33, 10,  0, 11, 12, 21, 13],
       [34, 24, 35,  0, 11,  9, 10,  0,  8, 16, 11, 20,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [37,  3,  7, 38, 39,  0, 20, 10,  0, 11, 12, 12, 36,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [40, 41,  4, 42,  6,  2,  7,  0,  8, 13, 10,  0, 11, 12,  9, 13,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],


In [63]:
# Decode the stored input vectors (seq.vec_x) back into strings.
seq.raw_x

['september 27, 1994',
 'August 19, 2003',
 '2/10/93',
 '10/31/90',
 'TUESDAY, SEPTEMBER 25, 1984',
 'JUN 17, 2013',
 'april 3, 1996',
 'October 24, 1974',
 'AUGUST 11, 1986',
 'February 16, 2015']

In [9]:
# Keep module-level references to the vocabulary dictionaries.
# NOTE: read_csv creates new dict objects, so these references go stale after another read_csv call.
char_to_id, id_to_char = seq.vocab

In [32]:
# Wrap the teacher ID matrix in a DataFrame for inspection.
df = pd.DataFrame(t)

In [33]:
# Map every character ID back to its character, one column at a time.
# Series.map is used because DataFrame.applymap is deprecated since pandas 2.1.
df.apply(lambda col: col.map(id_to_char))

Unnamed: 0,0,1,2,3,4
0,_,9,1.0,,
1,_,6,5.0,9.0,
2,_,9,7.0,,
3,_,8,5.0,,
4,_,7,9.0,8.0,
5,_,1,5.0,0.0,2.0
6,_,1,2.0,,
7,_,4,0.0,1.0,
8,_,1,2.0,,
9,_,8,,,


In [37]:
# Decode every row of t, keeping only characters that are neither the padding blank nor the '_' start marker.
[''.join(ch for ch in map(id_to_char.__getitem__, line) if ch not in ' _') for line in t]

['91', '659', '97', '85', '798', '1502', '12', '401', '12', '8']

In [65]:
# Output path for the result table written by seq.result_to_csv in the next cell.
csv_path= dataset_dir / 'test_result.csv'

In [66]:
# Pass the teacher vectors as the guess too, so every row is counted as correct.
seq.result_to_csv(csv_path, x, t, t)

Unnamed: 0,x,y,guess,correct
0,16+75,91,91,1
1,52+607,659,659,1
2,75+22,97,97,1
3,63+22,85,85,1
4,795+3,798,798,1
5,706+796,1502,1502,1
6,8+4,12,12,1
7,84+317,401,401,1
8,9+3,12,12,1
9,6+2,8,8,1


In [77]:
# Work on a copy so that the edits below do not modify t itself.
guess = t.copy()

In [78]:
# Overwrite three cells — (row 1, col 4), (row 2, col 0), (row 3, col 1) — with ID 0, the blank.
guess[[1,2,3], [4, 0, 1]] = 0

In [79]:
# Show the modified guess matrix.
guess

array([[ 6,  7,  1,  0,  0],
       [ 6,  2,  5,  7,  0],
       [ 0,  7,  4,  0,  0],
       [ 6,  0,  5,  0,  0],
       [ 6,  4,  7, 11,  0],
       [ 6,  1,  5,  9,  8],
       [ 6,  1,  8,  0,  0],
       [ 6, 12,  9,  1,  0],
       [ 6,  1,  8,  0,  0],
       [ 6, 11,  0,  0,  0]])

In [85]:
# Fraction of rows in which every position of guess equals t (exact-match accuracy on the ID vectors).
(t == guess).all(axis=1).mean()

0.8

In [None]:
# NOTE(review): unexecuted scratch cell; no name `a` is defined in the visible notebook — confirm it can be dropped.
a