# Criação do modelo

## Buscando os dados e preparando para treino do modelo

In [43]:
import pandas as pd
import numpy as np

In [2]:
df1 = pd.read_excel('jobs_usa_data_scientist.xlsx')
df2 = pd.read_excel('jobs_usa_data_analyst.xlsx')
df3 = pd.read_excel('jobs_usa_data_engineer.xlsx')

In [3]:
df1['target'] = 0
df2['target'] = 1
df3['target'] = 2

In [4]:
df = (df1.append(df2, ignore_index=True)).append(df3, ignore_index=True)

In [7]:
from collections import Counter
import itertools
import nltk
nltk.download('stopwords')
stops = nltk.corpus.stopwords.words('english')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')


def tokenizar(str_texto):
    return word_tokenize(str_texto)

def limpar(lista):
    return [i.lower() for i in lista if i.isalpha()]

def sem_stops(lista):
    return [i for i in lista if i not in stops]

def stemizar(lista):
    return [stemmer.stem(i) for i in lista]

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/brunorosilva/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/brunorosilva/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
df = df.sample(frac=1., random_state=42) # randomizando

In [9]:
df['JobDescription'] = df['JobDescription'].apply(lambda x:stemizar(sem_stops(limpar(tokenizar(x)))))

In [10]:
jobs_desc = df['JobDescription'].values

In [12]:
vocab = set([p for sent in jobs_desc for p in sent])

indices_de_palavras = {palavra: e+1 for e, palavra in enumerate(vocab)}  # e+1 para que o primeiro índice não seja 0, que é um pad

vetores_msg = np.array([[indices_de_palavras[p] for p in d] for d in jobs_desc], dtype=object)
vetores_msg

array([list([7891, 4144, 920, 3768, 10018, 9550, 6343, 7210, 822, 6686, 2659, 6197, 3768, 3084, 6064, 7917, 9081, 2752, 9099, 9866, 920, 3768, 10018, 3856, 8434, 9550, 6343, 7210, 822, 6686, 2659, 6197, 3768, 3084, 6064, 7917, 9081, 920, 3768, 10018, 4348, 4922, 8751, 1395, 1294, 87, 6709, 3435, 5853, 5457, 5069, 87, 6709, 3435, 5519, 4447, 9431, 1872, 5804, 7468, 3491, 9211, 7917, 1674, 3768, 87, 3640, 1794, 3112, 5051, 3768, 8714, 8450, 5516, 2905, 9477, 6931, 1180, 3698, 3854, 8410, 9484, 3680, 7372, 3134, 6709, 9093, 4729, 5605, 9136, 8845, 3467, 2659, 822, 9196, 8670, 5195, 1794, 5551, 2498, 1395, 5519, 3640, 6923, 4975, 3112, 606, 5772, 4464, 2408, 2411, 10350, 1794, 9744, 1988, 9093, 87, 8881, 1794, 1110, 6686, 7958, 2657, 5814, 9498, 2657, 8163, 3112, 6686, 7958, 2657, 3640, 7801, 4474, 5345, 5814, 87, 2882, 443, 87, 1551, 1794, 2659, 87, 5051, 10350, 1794, 5335, 7146, 9196, 1794, 5345, 2657, 5814, 5306, 3854, 5216, 87, 8410, 9431, 4557, 3854, 87, 6337, 5528, 9744, 8014, 4484, 

In [15]:
def binarizar(matriz_int, dim=len(vocab)+1):
    binarizado = np.zeros((len(matriz_int), dim))

    for e, vetor in enumerate(matriz_int):
        binarizado[e, vetor] = 1.

    return binarizado

vetores_msg_bin = binarizar(vetores_msg)

In [17]:
etiquetas = df['target'].values

In [18]:
etiquetas_bin = binarizar(etiquetas, dim=3)

In [19]:

from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(vetores_msg_bin, etiquetas_bin, train_size=0.7, random_state=42)


## Tuning e criação do Relatório

In [21]:
from plotly import graph_objects as go
import pandas as pd
from tqdm.notebook import tqdm
from plotly.subplots import make_subplots
from plotly.io import to_html
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras import callbacks

In [40]:
def build_model(units, activation, optimizer, vocab):
    modelo = models.Sequential()
    for i in range(len(units)):
        modelo.add(layers.Dense(units[i], activation=activation, input_shape=(len(vocab)+1,)))
    
    modelo.add(layers.Dense(3, activation='softmax'))
    modelo.compile(optimizer=optimizer, 
            loss="categorical_crossentropy", 
            metrics='accuracy')
    return modelo
def train_model(modelo, train_x, train_y, test_x, test_y, batch):
    aprendeu_parou = callbacks.EarlyStopping(
        min_delta=0.001,  
        patience=5,  
        restore_best_weights=True,
    )

    historia = modelo.fit(
        train_x, 
        train_y, 
        epochs=200, 
        batch_size=batch, 
        validation_data=(test_x, test_y),
        callbacks=[aprendeu_parou],
        verbose=0
    )
    return historia
    
def plot_results(historia):

    dic_historia = historia.history  # dict_keys(['loss', 'acc', 'val_loss', 'val_acc'])
    perda_treino = dic_historia['loss']
    perda_valid = dic_historia['val_loss']

    acuracia_treino = dic_historia['accuracy'] 
    acuracia_valid = dic_historia['val_accuracy']
    epocas = range(1, len(acuracia_treino) + 1)
    tmp = pd.DataFrame({
        "Epocas":epocas,
        "Acurácia_Treino":acuracia_treino,
        "Acurácia_Validação":acuracia_valid,
        "Custo_Treino":perda_treino,
        "Custo_Validação":perda_valid,
    })

    fig = make_subplots(rows=2, cols=1,
        subplot_titles=("Acurácia por Épocas", "Custo por Épocas"))

    fig.add_trace(
        go.Scatter(
            x=tmp['Epocas'],
            y=tmp["Acurácia_Validação"],
            name="Acurácia Validação",
            mode='lines+markers'
        ),
        row=1,
        col=1
    )
    fig.add_trace(
        go.Scatter(
            x=tmp['Epocas'],
            y=tmp["Acurácia_Treino"],
            name="Acurácia Treino",
            mode='lines+markers'
        ),
        row=1,
        col=1
    )
    fig.add_trace(
        go.Scatter(
            x=tmp['Epocas'],
            y=tmp["Custo_Validação"],
            name="Custo Validação",
            mode='lines+markers'
        ),
        row=2,
        col=1
    )
    fig.add_trace(
        go.Scatter(
            x=tmp['Epocas'],
            y=tmp["Custo_Treino"],
            name="Custo Treino",
            mode='lines+markers'
        ),
        row=2,
        col=1
    )
    fig.update_xaxes(title_text="Épocas", row=1, col=1)
    fig.update_xaxes(title_text="Épocas", row=2, col=1)
    fig.update_yaxes(title_text="Acurácia", row=1, col=1, range=[0, 1])
    fig.update_yaxes(title_text="Custo", row=2, col=1, range=[0, 1])
    fig.update_layout(height=1200, width=800)
    fig_html = to_html(fig, full_html=False)
    return fig_html

def build_report(historia, hiperparams, modelo, teste_x, teste_y, first_run=False, last_run=False):
    if first_run:
        fl = open("tuning-process-report.html", 'w')
        fl.write("""
        <html>
            <head>
                <meta charset="UTF-8">
                <style>
                .plotly-graph-div {
                    margin: 0 auto;
                }
                </style>
            </head>
            <body>
                <h1 align=\'center\'>Report do processo de busca do melhor modelo</h1>
                <hr>
        """)
    else:
        fl = open('tuning-process-report.html', 'a')
        fl.write("<hr>")

    fl.write("""
    <h2 align=\'center\'>Hiper parâmetros do modelo</h2>
    <div align=\'center\'>
    """)
    for hiperparam in hiperparams:
        fl.write(str(hiperparam) + ": " + str(hiperparams[hiperparam]) + "<br>")
    
    avaliacao = modelo.evaluate(teste_x, teste_y)
    fl.write(f'''
    Acurácia na avaliação: {str(avaliacao[1])} <br>
    Perda: {str(avaliacao[0])}
    </div>
    ''')
    fig_html = plot_results(historia)
    fl.write(fig_html)
    if last_run:
        fl.write("""
        </body>
        </html>        
        """)
    fl.close()
    return avaliacao[1]
    

In [41]:
hiperparams_grid = {
    "units":[[1], [8], [16, 16], [8, 8, 8, 8, 8, 8, 8, 8], [32, 64, 8], [128, 64], [128, 64, 8]],
    "activations":["tanh", "relu"],
    "batches":[16, 64, 512],
    "optimizers":["SGD", "rmsprop", "adam"]
}

In [42]:
first_run = True
last_run = False
max_runs = 1
best_acc = 0

for f in hiperparams_grid:
    max_runs *= len(hiperparams_grid[f])

for h_units in hiperparams_grid["units"]:
    for h_activation in hiperparams_grid["activations"]:
        for h_batch in hiperparams_grid['batches']:
            for h_optimizer in hiperparams_grid['optimizers']:
                if max_runs == 1:
                    last_run=True
                modelo = build_model(h_units, h_activation, h_optimizer, vocab)
                historia = train_model(modelo, train_x, train_y, test_x, test_y, h_batch)
                current_acc = build_report(historia,
                             {"units":h_units, "activation":h_activation, "batch":h_batch, "optimizer":h_optimizer}, 
                             modelo, test_x, test_y, first_run, last_run)
                if current_acc > best_acc:
                    best_acc = current_acc
                    print(f"New best ACC {round(best_acc*100, 2)} using {h_units, h_activation, h_batch, h_optimizer}")
                first_run=False
                max_runs-=1

New best ACC 85.31 using ([16], 'tanh', 16, 'SGD')
New best ACC 86.49 using ([8, 8, 8], 'tanh', 16, 'SGD')


## Salvando o melhor modelo
Foram feitos diversos testes e o melhor modelo encontrado foi o definido a seguir

In [44]:
rna = models.Sequential([
    layers.Dense(8, activation='tanh', input_shape=(len(vocab)+1,)),
    layers.Dense(8, activation='tanh'),
    layers.Dense(8, activation='tanh'),
    layers.Dense(8, activation='tanh'),
    layers.Dense(8, activation='tanh'),
    layers.Dense(8, activation='tanh'),
    layers.Dense(8, activation='tanh'),
    layers.Dense(8, activation='tanh'),
    layers.Dense(3, activation='softmax')
])

In [45]:
rna.compile(optimizer='SGD', loss='categorical_crossentropy', metrics='acc')

In [46]:
hist = rna.fit(train_x, train_y, epochs=21, validation_data=(test_x, test_y), batch_size=64, verbose=0)

In [49]:
rna.save('model')

INFO:tensorflow:Assets written to: model/assets


In [48]:
# salvando o vocab e os índices
fl = open("vocab.txt", 'w')
fl.write(str(vocab))
fl.close()
fl = open("indice_de_palavras.txt", "w")
fl.write(str(indices_de_palavras))
fl.close()