## Convert Dataset to Vec TF-IDF

In [13]:
import pandas as pd
import os
import json
from datetime import datetime as dt
from multiprocessing import Pool

from sklearn.utils import shuffle
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

import tensorflow as tf 
from tensorflow.python.lib.io import file_io
import _pickle as pickle

In [3]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/home/jovyan/work/google_key.json'

In [4]:
basepath = 'gs://bitservices-bigdata/skyhub/minos/dataset/20182208-202242/'

In [5]:
with file_io.FileIO('{}dataset.csv'.format(basepath), 'r') as f:
    dataset_df = pd.read_csv(f)
dataset_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,cat1,cat2,cat3,cat4,name
0,0,0,58,2293,7861,1,garf sobrem turim
1,1,1,71,3014,2,1,enfeit mes natalin cas led orb christm
2,2,2,36,2783,5,1,livr malet ades profisso
3,3,3,27,3688,16,1,bol pes medic medicin ball liveup
4,4,4,46,3725,12,10,gel limp fac microesfoliaca jon isotonic fac s...


## Normalize dataframe name

In [6]:
dataset_df['name'] = dataset_df['name'].fillna(' ')

## Shuffle dataset

In [7]:
dataset_df = shuffle(dataset_df)

## Split Dataset in training and test

In [8]:
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(dataset_df.name, dataset_df.cat1, test_size=0.1, random_state=1001)

## Vectorize

### Dimenstion Reduction

In [15]:
x_to_reduction = TfidfVectorizer().fit_transform(X_train_raw)

In [18]:
svd = TruncatedSVD(n_components=50000, random_state=1001)
svd.fit(x_to_reduction)

MemoryError: 

## Vectorize

In [9]:
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train_raw)
X_train = vectorizer.transform(X_train_raw)
X_test  = vectorizer.transform(X_test_raw)

with file_io.FileIO('{}tfidf/vectorizer.pk'.format(basepath), 'w+') as f:
    pickle.dump(vectorizer, f)

## LabelBinarizer

In [8]:
laberizer = LabelBinarizer()
laberizer.fit(dataset_df.cat1)
y_train = laberizer.transform(y_train_raw)
y_test  = laberizer.transform(y_test_raw)

with file_io.FileIO('{}tfidf/laberizer.pk'.format(basepath), 'w+') as f:
    pickle.dump(laberizer, f)

## Persist Trainingset

In [10]:
def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

def create_record(data):
    features = tf.train.Features(feature={
        'label': tf.train.Feature(int64_list=tf.train.Int64List(value=data[1])),
        'features': tf.train.Feature(float_list=tf.train.FloatList(value=data[0].toarray()[0]))
    })
    
    return tf.train.Example(features=features)

In [None]:
index = 1
chunck_size = 5000
for i in range(0, len(y_train), chunck_size):
    start = dt.now()
    filename = "{}tfidf/trainset/{}.tfrecord".format(basepath, str(index).zfill(5))
    writer = tf.python_io.TFRecordWriter(filename)
    
    with Pool(10) as pool:
        for example in pool.map(create_record, zip(X_train[i:i+chunck_size], y_train[i:i+chunck_size])):
            writer.write(example.SerializeToString())
    
    writer.close()
    print("diff %ds: %s" % ((dt.now() - start).total_seconds(), filename))
    index += 1

diff 129s: gs://bitservices-bigdata/skyhub/minos/dataset/20182208-202242/tfidf/trainset/00001.tfrecord
diff 116s: gs://bitservices-bigdata/skyhub/minos/dataset/20182208-202242/tfidf/trainset/00002.tfrecord
diff 120s: gs://bitservices-bigdata/skyhub/minos/dataset/20182208-202242/tfidf/trainset/00003.tfrecord
diff 139s: gs://bitservices-bigdata/skyhub/minos/dataset/20182208-202242/tfidf/trainset/00004.tfrecord
diff 131s: gs://bitservices-bigdata/skyhub/minos/dataset/20182208-202242/tfidf/trainset/00005.tfrecord
diff 120s: gs://bitservices-bigdata/skyhub/minos/dataset/20182208-202242/tfidf/trainset/00006.tfrecord
diff 128s: gs://bitservices-bigdata/skyhub/minos/dataset/20182208-202242/tfidf/trainset/00007.tfrecord
diff 118s: gs://bitservices-bigdata/skyhub/minos/dataset/20182208-202242/tfidf/trainset/00008.tfrecord
diff 121s: gs://bitservices-bigdata/skyhub/minos/dataset/20182208-202242/tfidf/trainset/00009.tfrecord
diff 118s: gs://bitservices-bigdata/skyhub/minos/dataset/20182208-202242/

## Persist Testset

In [None]:
index = 1
chunck_size = 10000
for i in range(0, len(y_train), chunck_size):
    start = dt.now()
    filename = "{}tfidf/testset/{}.tfrecord".format(basepath, str(index).zfill(5))
    writer = tf.python_io.TFRecordWriter(filename)
    
    with Pool(10) as pool:
        for example in pool.map(create_record, zip(X_test[i:i+chunck_size], y_test[i:i+chunck_size])):
            writer.write(example.SerializeToString())
    
    writer.close()
    print("diff %ds: %s" % ((dt.now() - start).total_seconds(), filename))
    index += 1

In [None]:
y_test

## Persit Metadata

In [None]:
metadata = {
    'train_samples_count': len(y_train),
    'test_samples_count': len(y_test),
    'features_count': len(vectorizer.get_feature_names()),
    'features_names': vectorizer.get_feature_names(),
    'lavels_count': len(laberizer.classes_),
    'labels_names': laberizer.classes_.tolist()
}

with file_io.FileIO('{}tfidf/metadata.json'.format(basepath), 'w+') as f:
    f.write(json.dumps(metadata))

In [None]:
metadata