In [1]:
import csv as csv
import math
import re
from random import randint

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split

In [2]:
def csv2data(file_str):
    """Read a text file and return its non-empty lines as a list of strings.

    The file is decoded as ISO-8859-1 and every line becomes one entry, with
    the trailing newline removed.  Blank lines are skipped.

    Lines are returned verbatim: no CSV quote processing is applied.  The
    previous implementation went through ``csv.reader`` with a newline as the
    field delimiter, which only ever produced one field per line, crashed on
    blank lines, and which recent Python releases reject with ValueError.

    Args:
        file_str: path of the file to read.

    Returns:
        list[str]: one entry per non-empty line, in file order.
    """
    data = []
    # The context manager closes the handle even if decoding fails part-way.
    with open(file_str, encoding="ISO-8859-1") as handle:
        for line in handle:
            entry = line.rstrip('\n')
            if entry:
                data.append(entry)
    return data

# Load the raw word list: csv2data returns one string per row of the file.
corpus = csv2data('liste_francais.csv')

def clean_dict(dictionary):
    """Return the entries of ``dictionary`` that qualify as words, in order.

    An entry is kept only when it passes every one of these checks:

    * it is at least four characters long;
    * it contains no space character;
    * it matches ``^[a-zA-Z]*$`` (ASCII letters);
    * it matches ``^[^j|J|z|Z]*$`` -- no ``j`` or ``z`` in either case (the
      ``|`` characters inside the brackets are literal, not alternation);
    * its first character is lowercase.

    ``j`` and ``z`` are also missing from the replacement alphabet used by
    ``gen_misspellings``.

    Args:
        dictionary: iterable of strings.

    Returns:
        list[str]: the entries that passed all checks.
    """
    letters_only = re.compile("^[a-zA-Z]*$")
    without_j_or_z = re.compile("^[^j|J|z|Z]*$")

    def _is_usable(candidate):
        # Length is checked first so that candidate[0] below is always valid.
        if len(candidate) < 4 or ' ' in candidate:
            return False
        if not letters_only.match(candidate):
            return False
        if not without_j_or_z.match(candidate):
            return False
        return candidate[0].islower()

    return [candidate for candidate in dictionary if _is_usable(candidate)]

# Keep only the entries accepted by clean_dict (this rebinds `corpus`).
corpus = clean_dict(corpus)

In [3]:
# Number of words left after filtering.
len(corpus)

13489

In [17]:
def csv2images(file_str):
    """Parse a comma-separated file of integers into a list of integer rows.

    The first record is treated as a header and dropped.  Every following
    non-empty record is converted cell by cell with ``int``.  Blank lines are
    skipped, and an empty file yields an empty list.

    Args:
        file_str: path of the CSV file.

    Returns:
        list[list[int]]: one list of integers per data row, in file order.

    Raises:
        ValueError: if a cell cannot be converted to ``int``.
    """
    data = []
    # newline='' hands line splitting to the csv module, as its docs advise;
    # the context manager guarantees the handle is closed.
    with open(file_str, newline='') as handle:
        # Split on the default comma instead of splitting on newlines and
        # re-splitting each line by hand; recent Python releases reject a
        # newline used as the csv delimiter.
        rows = csv.reader(handle, quotechar='|')
        next(rows, None)  # drop the header; the default tolerates an empty file
        for row in rows:
            if not row:
                continue
            data.append([int(cell) for cell in row])

    return data

# Parse both splits into lists of integer rows (csv2images skips the header line).
train_images = csv2images('sign_mnist_train.csv')
test_images = csv2images('sign_mnist_test.csv')

In [25]:
def get_words_lists(dictionary,
                    num_of_features=200,
                    num_of_mutations=1,
                    test_percentage=0.1,
                    true_percentage=0.8):
    """Build a train row and a test row of spellings for every word.

    For each word, a share of ``1 - true_percentage`` of ``num_of_features``
    samples are random misspellings from ``gen_misspellings``; the remainder
    are verbatim copies of the word.  The misspellings are divided with
    ``train_test_split`` and the copies by ``test_percentage``.  Copies are
    appended after the misspellings, so a row ends with the correct word
    whenever it holds at least one copy (``generate_LSTM_data`` reads the
    last element of a row as its label).

    Both copy counts are rounded up, so a word can receive one sample more
    than ``num_of_features`` in total.

    Args:
        dictionary: iterable of words.
        num_of_features: target number of samples per word.
        num_of_mutations: substitutions applied to each misspelling.
        test_percentage: fraction of samples routed to the test row; also
            passed to ``train_test_split`` as ``test_size``.
        true_percentage: fraction of samples that are correct copies.

    Returns:
        tuple[list[list[str]], list[list[str]]]: ``(train_word_list,
        test_word_list)``, one row per word in the order of ``dictionary``.
    """
    def _steady(value):
        # Remove binary floating-point noise before truncating: in floats
        # (1 - 0.8) * 200 is 39.99999999999999, which floor() would turn
        # into 39 instead of the intended 40.
        return round(value, 9)

    # The per-word counts do not depend on the word, so compute them once.
    n_of_misspellings = math.floor(_steady((1 - true_percentage) * num_of_features))
    n_of_true = num_of_features - n_of_misspellings
    n_true_train = math.ceil(_steady(n_of_true * (1 - test_percentage)))
    n_true_test = math.ceil(_steady(n_of_true * test_percentage))

    train_word_list = []
    test_word_list = []

    for word in dictionary:
        misspellings = [gen_misspellings(word, num_of_mutations)
                        for _ in range(n_of_misspellings)]

        if misspellings:
            train_row, test_row = train_test_split(misspellings, test_size=test_percentage)
        else:
            # train_test_split rejects an empty sample list
            # (e.g. true_percentage=1), so build the rows from copies only.
            train_row, test_row = [], []

        train_row.extend([word] * n_true_train)
        test_row.extend([word] * n_true_test)

        train_word_list.append(train_row)
        test_word_list.append(test_row)

    return train_word_list, test_word_list


def gen_misspellings(word, mut=1):
    """Return a copy of ``word`` after ``mut`` random letter substitutions.

    Each substitution picks a position with ``randint`` and overwrites it
    with a letter drawn from a 24-letter pool (``a`` to ``y`` without ``j``).
    Positions may repeat and the drawn letter may equal the one already
    there, so the result can differ from ``word`` in fewer than ``mut``
    places, or not at all.

    Args:
        word: the string to mutate (not modified).
        mut: number of substitutions to perform.

    Returns:
        str: the mutated string, same length as ``word``.
    """
    replacement_pool = "abcdefghiklmnopqrstuvwxy"
    letters = list(word)
    last_position = len(word) - 1

    for _ in range(mut):
        # The position is drawn before the letter.
        target = randint(0, last_position)
        letters[target] = replacement_pool[randint(0, 23)]

    return "".join(letters)

# Per word: 200 samples, one substitution per misspelling, 10% routed to the
# test row, and 20% verbatim copies of the word (so 80% misspellings).
train_words, test_words = get_words_lists(
    corpus,
    num_of_features=200,
    num_of_mutations=1,
    test_percentage=0.1,
    true_percentage=0.2,
)

In [26]:
def create_buckets(images):
    """Group row positions of ``images`` by the integer in each row's first cell.

    Args:
        images: sequence of rows; ``int(row[0])`` selects the bucket and must
            be a valid index into a list of 25 buckets.

    Returns:
        list[list[int]]: 25 lists; list ``k`` holds, in ascending order, the
        positions of the rows whose first cell equals ``k``.
    """
    # Each bucket must be its own list object, hence the comprehension.
    rows_by_label = [[] for _ in range(25)]

    for position, row in enumerate(images):
        label = int(row[0])
        rows_by_label[label].append(position)

    return rows_by_label

# Index the rows of each split by the integer in their first column.
train_buckets = create_buckets(train_images)
test_buckets = create_buckets(test_images)

In [27]:
def generate_LSTM_data(words, buckets, use_image_indices=False):
    """Flatten per-word rows into ``(word, sequence, label, label_arr)`` tuples.

    Every row of ``words`` is a list of spellings whose last element is taken
    as the label for the whole row.  Letters are encoded as ``ord(c) - 96``
    after lowercasing, i.e. ``'a' -> 1``, ``'b' -> 2``, and so on.

    By default ``sequence`` is the letter-code list of the spelling, and
    ``buckets`` is not read.  With ``use_image_indices=True`` it is instead
    one randomly drawn entry of ``buckets[code - 1]`` per letter.

    NOTE(review): the sampling is opt-in because the tuples have always
    stored the letter codes, not the drawn indices.  Confirm with the
    notebook author which of the two the downstream model expects.

    Args:
        words: iterable of rows (lists of str); empty rows are skipped.
        buckets: ``buckets[k]`` lists the candidates for letter code
            ``k + 1``; only used when ``use_image_indices`` is true.
        use_image_indices: replace the letter codes in field 1 with entries
            drawn from ``buckets``.

    Returns:
        list[tuple]: one ``(word, sequence, label, label_arr)`` tuple per
        spelling, rows in input order.

    Raises:
        ValueError, IndexError: only with ``use_image_indices=True``, when a
            needed bucket is empty or missing.
    """
    def _letter_codes(text):
        # ord('a') is 97, so subtracting 96 maps 'a' to 1.
        return [ord(char) - 96 for char in text.lower()]

    def _draw_from_buckets(codes):
        # One random pick per letter from that letter's bucket.
        picked = []
        for code in codes:
            pool = buckets[code - 1]
            picked.append(pool[randint(0, len(pool) - 1)])
        return picked

    data = []
    for sample in words:
        if not sample:
            # No spellings means no label and nothing to emit.
            continue
        label = sample[-1]
        label_arr = _letter_codes(label)
        for word in sample:
            reference = _letter_codes(word)
            sequence = _draw_from_buckets(reference) if use_image_indices else reference
            data.append((word, sequence, label, label_arr))

    return data

# Flatten the per-word rows of each split into one list of sample tuples.
train_data = generate_LSTM_data(train_words, train_buckets)
test_data = generate_LSTM_data(test_words, test_buckets)

In [28]:
def split_indexes(full_data):
    """Convert fields 1 and 3 of every sample tuple into int64 tensors.

    Args:
        full_data: iterable of tuples whose element 1 and element 3 are
            lists of integers (the layout produced by ``generate_LSTM_data``).

    Returns:
        tuple[list[torch.Tensor], list[torch.Tensor]]: ``(idxs, labels)``,
        built from element 1 and element 3 respectively, in input order.
    """
    idxs = []
    labels = []
    for item in full_data:
        # torch.tensor with an explicit dtype is the documented way to build
        # a tensor from existing data, replacing the legacy LongTensor call.
        idxs.append(torch.tensor(item[1], dtype=torch.long))
        labels.append(torch.tensor(item[3], dtype=torch.long))

    return idxs, labels

# Turn tuple fields 1 and 3 of every sample into tensors, for each split.
train_index, train_labels =split_indexes(train_data)
test_index, test_labels=split_indexes(test_data)
