In [None]:
!pip install langdetect

In [None]:
import pandas as pd
from google.cloud import bigquery
from google.colab import auth, drive
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

In [None]:
# Mount Google Drive inside the Colab filesystem at the path below.
drive_mount_point = '/content/drive/'
drive.mount(drive_mount_point)

In [None]:
# Run Colab's user authentication flow. Presumably this supplies the
# credentials that the BigQuery client created in the next cell relies on
# (no explicit credentials are passed there) -- TODO confirm.
auth.authenticate_user()

In [None]:
# Load every column of the train_categories table from BigQuery into a
# pandas DataFrame named `df`, which the following cells clean and split.
project_id = 'tlac-vision'
client = bigquery.Client(project=project_id)

# The query text lives in its own variable so it is easy to inspect or tweak.
train_categories_sql = '''
  SELECT
    *
  FROM
    `tlac-vision.book_backend.train_categories`
'''
query_job = client.query(train_categories_sql)
df = query_job.to_dataframe()

In [None]:
# Order the books alphabetically by category (A to Z), then keep one row per
# title -- the first one found in that order -- and renumber the index from 0.
df = (
    df.sort_values(by='category')
      .drop_duplicates(subset='title')
      .reset_index(drop=True)
)

In [None]:
# Keep only the books whose description is detected as Spanish ('es').

# langdetect is non-deterministic on short or ambiguous text unless it is
# seeded; fix the seed so the filtered set is the same on every run.
DetectorFactory.seed = 0


def _detect_language(text):
    """Return the language code langdetect assigns to `text`.

    Returns None when `text` is not a string (e.g. a missing description) or
    when langdetect cannot find any usable features in it, so that such rows
    are simply dropped by the Spanish filter instead of aborting the cell.
    """
    if not isinstance(text, str):
        return None
    try:
        return detect(text)
    except LangDetectException:
        return None


# Assign the whole column at once: writing to the `row` objects yielded by
# DataFrame.iterrows() only modifies copies and never reaches the DataFrame,
# which would leave `language` empty and make the filter below drop every row.
df['language'] = df['description'].apply(_detect_language)
df = df[df['language'] == 'es']

# Number of records per category. size() counts rows regardless of null
# titles; these counts are used by the following cells as positional row
# offsets into `df`, so they must match the real number of rows per category.
cat_count = df.groupby('category').size().to_frame('count')

In [None]:
# Share of each category's records that goes to the training set (75%).
train_fraction = 0.75

# Derive, per category, the positional boundaries used to slice `df`:
#   training  - number of training rows (fraction of `count`, rounded)
#   acum      - running total of `count`, i.e. one past the category's last row
#   init_idx  - position of the category's first row
#   train_idx - position where the training rows end and validation rows begin
# The column order matters: later cells read these columns by position.
cat_count = cat_count.assign(
    training=lambda c: (train_fraction * c['count']).round(0),
    acum=lambda c: c['count'].cumsum(),
    init_idx=lambda c: c['acum'] - c['count'],
    train_idx=lambda c: c['init_idx'] + c['training'],
)

In [None]:
# Build the training set: for every category take the rows from its first
# position (init_idx) up to, but not including, its train_idx position.
# NOTE: these offsets are positional, so `df` must still be ordered by
# category exactly as it was when `cat_count` was computed.
train_parts = [
    df.iloc[int(start):int(stop), :]
    for start, stop in zip(cat_count['init_idx'], cat_count['train_idx'])
]
# DataFrame.append was removed in pandas 2.0 (and re-copied the frame on every
# iteration); concatenate all slices in one call instead. pd.concat rejects an
# empty list, so fall back to an empty slice of `df` when there are no categories.
train_set = pd.concat(train_parts) if train_parts else df.iloc[0:0]

# Keep only the category and description columns and flag every row as
# not belonging to the validation split. copy() makes the new frame independent
# of `train_set` before a column is added to it.
train_set_f = train_set.loc[:, ['category', 'description']].copy()
train_set_f['is_valid'] = False
train_set_f.head()

In [None]:
# Build the validation set: for every category take the rows from its
# train_idx position (where the training rows end) up to, but not including,
# its acum position (one past the category's last row).
# NOTE: these offsets are positional, so `df` must still be ordered by
# category exactly as it was when `cat_count` was computed.
val_parts = [
    df.iloc[int(start):int(stop), :]
    for start, stop in zip(cat_count['train_idx'], cat_count['acum'])
]
# DataFrame.append was removed in pandas 2.0 (and re-copied the frame on every
# iteration); concatenate all slices in one call instead. pd.concat rejects an
# empty list, so fall back to an empty slice of `df` when there are no categories.
val_set = pd.concat(val_parts) if val_parts else df.iloc[0:0]

# Keep only the category and description columns and flag every row as
# belonging to the validation split. copy() makes the new frame independent
# of `val_set` before a column is added to it.
val_set_f = val_set.loc[:, ['category', 'description']].copy()
val_set_f['is_valid'] = True
val_set_f.head()

In [None]:
# Stack the training rows on top of the validation rows and renumber the
# index so it runs from 0 across the combined frame.
dff = pd.concat([train_set_f, val_set_f], ignore_index=True)