In [44]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing import sequence
# Load the input table; later cells read its `name`, `length`, `fl_is_v` and `ll_is_v` columns.
df = pd.read_csv('features_data.csv')

In [45]:
# Character n-gram sizes to extract (bigrams and trigrams).
NGRAMS = (2, 3)

# Case-sensitive character n-grams over the `name` column; `min_df=10` and
# `max_df=0.3` drop n-grams that occur in too few or too many names.
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=10, ngram_range=NGRAMS, lowercase=False)
a = vect.fit_transform(df.name)  # sparse (names x n-grams) count matrix
vocab = vect.vocabulary_         # n-gram -> column index in `a`

# Total count of every n-gram in a single pass over the matrix, instead of
# slicing one sparse column per vocabulary entry inside a Python loop.
# np.asarray(...).ravel() flattens the 1 x V result into a plain 1-D array.
column_totals = np.asarray(a.sum(axis=0)).ravel()

# (count, n-gram) pairs, most frequent first; equal counts fall back to
# reverse lexicographic order of the n-gram because the tuples are compared whole.
words = sorted(((column_totals[col], gram) for gram, col in vocab.items()), reverse=True)

# Frequency-ordered vocabulary: position in this list is the n-gram's feature index.
words_list = [gram for _, gram in words]

In [46]:
import pickle
# Persist the frequency-ordered n-gram vocabulary to disk; a later cell reloads it from this file.
with open('words_list.pkl', 'wb') as f:
    pickle.dump(words_list, f)

In [47]:
def n_grams(tokens, n=1):
    """
    Slide a window of width `n` over `tokens` and collect every window as a tuple.

    Args:
        tokens (sequence): Any sliceable sequence (list, string, ...) to window over.
        n (int): Width of the window, i.e. the size of each n-gram.

    Returns:
        list: The n-grams as tuples, in left-to-right order. Empty when `n`
            exceeds `len(tokens)`.
    """
    # There are len(tokens) - n + 1 window start positions; a non-positive
    # count makes range() empty, so short inputs simply yield no n-grams.
    window_count = len(tokens) - n + 1
    return [tuple(tokens[start:start + n]) for start in range(window_count)]

def range_ngrams(tokens, ngram_range=(1, 2)):
    """
    Collect the n-grams of `tokens` for every size in an inclusive size range.

    Args:
        tokens (sequence): The sequence of tokens to generate n-grams from.
        ngram_range (tuple): `(smallest, largest)` n-gram sizes; both ends are included.

    Returns:
        list: All n-grams as tuples, grouped by size in increasing order
            (every n-gram of the smallest size first, then the next size, ...).
    """
    smallest = ngram_range[0]
    largest = ngram_range[1]
    return [
        gram
        for size in range(smallest, largest + 1)
        for gram in n_grams(tokens, size)
    ]


# Memo for the {n-gram: position} lookup derived from the global `words_list`.
# It is rebuilt whenever `words_list` is rebound to a different list object or
# its length changes (an in-place edit that keeps the length is not detected).
_NGRAM_LOOKUP_CACHE = {'source': None, 'size': -1, 'index': {}}


def _ngram_lookup(vocabulary):
    """Return a dict mapping each entry of `vocabulary` to its first position."""
    cache = _NGRAM_LOOKUP_CACHE
    if cache['source'] is not vocabulary or cache['size'] != len(vocabulary):
        index = {}
        for position, gram in enumerate(vocabulary):
            index.setdefault(gram, position)  # first occurrence wins, like list.index
        cache['source'] = vocabulary
        cache['size'] = len(vocabulary)
        cache['index'] = index
    return cache['index']


def find_ngrams(text, ngrams_range):
    """
    Translate the n-grams of `text` into their positions in the global `words_list`.

    Args:
        text (sequence of str): Tokens to build n-grams from; passing a plain
            string yields character n-grams.
        ngrams_range (tuple): `(smallest, largest)` n-gram sizes, both inclusive.

    Returns:
        list: One integer per generated n-gram, in generation order: its position
            in `words_list`, or 0 when the n-gram is not in the vocabulary.

    Raises:
        NameError: If `words_list` has not been defined yet. (A bare `except`
            used to hide this and return all zeros.)

    Note:
        0 is ambiguous: it is returned both for the n-gram stored at
        `words_list[0]` and for unknown n-grams. This is kept as-is so existing
        feature files stay comparable; shifting indices by one would remove the
        ambiguity but changes every emitted value.
    """
    # Dict lookup replaces a linear `list.index` scan per n-gram.
    lookup = _ngram_lookup(words_list)
    return [lookup.get(''.join(gram), 0) for gram in range_ngrams(text, ngrams_range)]


def is_vowel(c):
    """
    Report whether a character is one of the vowels a, e, i, o, u (case-insensitive).

    Args:
        c (str): The character to test.

    Returns:
        int: 1 if the lower-cased `c` equals one of the five vowels, otherwise 0.
    """
    # Tuple membership compares whole strings, so '' and multi-character
    # input are never treated as vowels.
    vowels = ('a', 'e', 'i', 'o', 'u')
    return 1 if c.lower() in vowels else 0


In [48]:
# Reload the n-gram vocabulary from disk into the module-level `words_list`
# that find_ngrams() reads. NOTE(review): pickle.load executes arbitrary code
# from the file — only load a words_list.pkl that this notebook produced.
with open('words_list.pkl', 'rb') as f:
    words_list = pickle.load(f)

In [49]:
# Convert every name into its list of n-gram vocabulary indices (one list per row).
# The lists differ in length from name to name; they are padded to a fixed width in a later cell.
# NOTE(review): assumes every value in `df.name` is a string — a missing (NaN) name would fail inside find_ngrams.
X_ngram_data = np.array(df.name.apply(lambda c: find_ngrams(c, NGRAMS)))

In [50]:
# Pre-computed scalar features taken unchanged from the input table
# (presumably name length and first/last-letter-is-vowel flags — confirm against whatever writes features_data.csv).
X_feature = df[['length', 'fl_is_v', 'll_is_v']]

In [51]:
# Force every n-gram index list to exactly 25 entries; shorter lists are zero-padded
# on the left (see the leading zeros in the output below).
# NOTE(review): lists longer than 25 are truncated — presumably from the front with Keras defaults; confirm.
# The pad value 0 is also what find_ngrams emits for unknown n-grams.
X_ngram_data = sequence.pad_sequences(X_ngram_data, maxlen=25)
# Turn the three scalar feature columns into a plain ndarray (via a Python list round-trip).
X_feature_list = X_feature.values.tolist()
X_feature_list = np.array(X_feature_list)
# Sanity checks before stacking: both operands are ndarrays with the same number of rows.
print(type(X_ngram_data))
print(type(X_feature_list))
print(X_feature_list.shape)
print(X_ngram_data.shape)
# Column-wise concatenation: the 25 n-gram index columns first, then the 3 scalar features.
final_feature = np.hstack((X_ngram_data, X_feature_list))
print(final_feature.shape)
final_feature

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(41376, 3)
(41376, 25)
(41376, 28)


array([[ 40, 142,  89, ...,  14,   0,   0],
       [  0,   0,   0, ...,   8,   0,   0],
       [  0,   0,   0, ...,  10,   0,   1],
       ...,
       [ 10, 193,  93, ...,  15,   0,   0],
       [  0,   0,   0, ...,  11,   0,   1],
       [  0,   0,   0, ...,  12,   0,   0]])

In [52]:
# Wrap the feature matrices in DataFrames for CSV export.
# NOTE(review): this rebinds `df`, which earlier cells use as the input table
# (`df.name`, `df[['length', ...]]`); re-running those cells after this one will fail
# until the CSV is loaded again. Consider distinct names if nothing downstream relies on `df`/`df2`.
df = pd.DataFrame(final_feature)   # n-gram indices + scalar features
df2 = pd.DataFrame(X_ngram_data)   # padded n-gram indices only

# Written without the row index; column headers are the default integer positions.
df.to_csv('deeplearning_features.csv', index=False)
df2.to_csv('deeplearning_ngram_features.csv', index=False)


In [53]:
# Display the (rows, columns) of the combined feature matrix as a final check.
final_feature.shape

(41376, 28)