In [None]:
%run "Funciones.py"
%matplotlib inline

## Entrenamiento con redes neuronales

---

### Con limpieza específica basada en el análisis preliminar

In [None]:
# Load both corpora as pandas DataFrames: the genuine articles and the fabricated ones.
real, fake = (pd.read_csv(path) for path in ("Data/True.csv", "Data/Fake.csv"))

In [None]:
# Attach the target label in place: 0 for real articles, 1 for fake ones.
for frame, label in ((real, 0), (fake, 1)):
    frame['fake?'] = label

In [None]:
# Preview the first rows of the fake-news frame, including the new 'fake?' label column.
fake.head()

### Aplicamos alguna limpieza previa

Difference in Text

Real news articles seem to include the source of publication, which is not present in the fake news set.

Looking at the data:

* Most of the texts contain Reuters information such as "WASHINGTON (Reuters)".

* Some texts are tweets from Twitter.

* A few texts do not contain any publication info.

In [None]:
# Collect the positional indices of the real articles that do NOT start with
# a "<publisher> -" prefix (for example "WASHINGTON (Reuters) -").
unknown_publishers = []
for index, row in enumerate(real.text.values):
    # A non-string cell cannot carry a publisher prefix, so it is treated
    # as "unknown" rather than letting .split() raise.
    if not isinstance(row, str):
        unknown_publishers.append(index)
        continue
    record = row.split(" -", maxsplit=1)
    # A usable prefix needs (a) some text after the separator and (b) a prefix
    # shorter than 260 characters, so that a " -" that merely appears somewhere
    # inside the article body is not mistaken for the publisher separator.
    # These are explicit checks instead of the previous assert + bare except,
    # which swallowed every exception and would stop working under `python -O`.
    if len(record) < 2 or len(record[0]) >= 260:
        unknown_publishers.append(index)

In [None]:
# unknown_publishers now holds the positions of the rows without a publisher prefix.
# Display their text to inspect them.
real.iloc[unknown_publishers].text
# Author's observation: these rows do not start with text like "WASHINGTON (Reuters)".

In [None]:
# Separate the publisher prefix from the article body.
# Rows listed in unknown_publishers keep their full text and get the
# placeholder publisher "Unknown"; every other row is split on the first " -".
unknown_lookup = set(unknown_publishers)  # set membership is O(1); the list lookup was O(n) per row
publisher = []
tmp_text = []
for index, row in enumerate(real.text.values):
    if index in unknown_lookup:
        publisher.append("Unknown")
        tmp_text.append(row)
        continue
    record = row.split(" -", maxsplit=1)
    publisher.append(record[0])
    tmp_text.append(record[1])

In [None]:
# Replace the existing text column with the publisher-free text.
# Storing the publisher in its own column is currently disabled:
#real["publisher"] = publisher
real["text"] = tmp_text

# Drop the temporaries. After this, the cell cannot be re-run on its own:
# the deleted names must first be rebuilt by the cells that define them.
del publisher, tmp_text, record, unknown_publishers

In [None]:
# Preview the first rows of the real-news frame after the text column was replaced.
real.head()

In [None]:
# Drop the record with index label 8970.
# NOTE(review): the reason is not recorded here — presumably a row whose text
# is empty after the publisher split; confirm and document.
# errors="ignore" keeps the cell idempotent: re-running it after the row is
# already gone no longer raises KeyError.
real = real.drop(index=8970, errors="ignore")

#### Hasta acá se eliminó la agencia, que solo está presente en las noticias reales

In [None]:
# Stack the two frames and randomise the row order.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
news = pd.concat([real, fake], ignore_index=True)
# A seeded shuffle makes the row order, and therefore the seeded
# train/validation/test splits that follow, reproducible across runs.
news = news.sample(frac=1, random_state=42)
news.head()

In [None]:
# Show ten random rows of the combined frame.
news.sample(10)

In [None]:
#news_mask = news['text'] == ' '
#news = news.drop(news[news_mask].index)
#news

In [None]:
# Merge headline and body into a single text field, then discard the
# columns that are not used from here on.
news['text'] = news['title'].add(" ").add(news['text'])
news = news.drop(columns=['title', 'subject', 'date'])
news.sample(5)

In [None]:
# (rows, columns) of the combined frame.
news.shape

In [None]:
# Number of rows per label value (0 = real, 1 = fake).
news['fake?'].value_counts()

### Continuamos proceso de limpieza

In [None]:
# Apply denoise_text to every document. According to the original note, the helper
# lower-cases the text, parses HTML, and removes brackets, URLs and stopwords
# (the helper is defined outside this notebook — verify against Funciones.py).
news['text'] = news['text'].apply(denoise_text)

In [None]:
# Show three random rows to eyeball the result of the cleaning step.
news.sample(3)

Hasta aquí se hizo la limpieza general de formato

---

### Eliminación de palabras muy relevantes que surgen de los análisis previos

In [None]:
# Read the three feature-importance tables as pandas DataFrames.
# Per the original note, these files were generated in the FakeNews02 notebook.
importancia1, importancia2, importancia3 = map(
    pd.read_csv,
    ("Data/Importancia1.csv", "Data/Importancia2.csv", "Data/Importancia3.csv"),
)

In [None]:
# Start the combined importance table from a copy of the first one,
# so that importancia1 itself is left untouched, and display it.
importancias = importancia1.copy()
importancias

In [None]:
# Stack the second importance table under the first.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
importancias = pd.concat([importancias, importancia2])

In [None]:
# Stack the third importance table and keep a single row per 'atributo'.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# drop_duplicates keeps the first occurrence of each attribute.
importancias = pd.concat([importancias, importancia3])
importancias = importancias.drop_duplicates(['atributo'])

In [None]:
# Keep only the 'atributo' column, as a NumPy array.
# Gotcha: the name is rebound from a DataFrame to an ndarray, so this cell
# cannot be re-run without first re-running the cells that build the table.
importancias = np.array(importancias['atributo'])
type(importancias)

In [None]:
# Display the array of high-importance attributes.
importancias

In [None]:
# Build one alternation pattern out of the high-importance terms.
# - Each term is escaped so that any regex metacharacter in it is matched literally.
# - The alternation is wrapped in a non-capturing group so that the trailing "+"
#   repeats the whole group; without the group it bound only to the last
#   character of the last term.
# NOTE(review): terms are still matched as substrings (no word boundaries),
# as before — confirm whether whole-word matching is wanted.
separador = '|'
alternativas = separador.join(re.escape(palabra) for palabra in importancias)
# "(?!)" never matches, so an empty term list removes nothing instead of
# producing the invalid pattern "+".
re_pattern = ('(?:' + alternativas + ')+') if alternativas else '(?!)'

In [None]:
# Display the assembled regular expression.
re_pattern

In [None]:
def remove_importancias(text):
    """Return `text` with every match of the module-level `re_pattern` removed.

    The pattern is read from the global `re_pattern` at call time.
    """
    limpio = re.compile(re_pattern).sub('', text)
    return limpio

In [None]:
# Strip the high-importance terms from every document.
# The result must be assigned back: Series.apply returns a new Series, so
# without the assignment the cleaned text was computed and then discarded.
news['text'] = news['text'].apply(remove_importancias)

Hasta aquí se hizo la limpieza de palabras importantes

---

### Separación de sets para entrenamiento, validación y testeo

In [None]:
# Hold out 30% of the rows as the test set, then carve 20% of the remainder
# out as the validation set. The split is done before tokenization.
train_full, test = train_test_split(news, test_size=0.3, random_state=42)
train, validation = train_test_split(train_full, test_size=0.2, random_state=42)
print(len(train), len(validation), len(test))

---

In [None]:
# Fit the tokenizer on the training texts only.
vocabulario_max = 20000
# Characters stripped before tokenizing (includes the apostrophe, tab and newline).
filtrar = "!#$%&()*+,-./':;<=>?@[\\]^_`{|}~\t\n"
tokenizer = Tokenizer(
    num_words=vocabulario_max,
    filters=filtrar,
    lower=True,
    split=" ",
)
tokenizer.fit_on_texts(train.text)
word_index = tokenizer.word_index

In [None]:
# summarize what was learned
#print(tokenizer.word_counts)
#print(tokenizer.document_count)
#print(tokenizer.word_index)
#print(tokenizer.word_docs)

In [None]:
# Encode each split with the fitted tokenizer, one matrix row per document.
train_matrix, val_matrix, test_matrix = (
    tokenizer.texts_to_matrix(np.array(subset.text))
    for subset in (train, validation, test)
)

In [None]:
# Check the container type returned by texts_to_matrix.
type(train_matrix)

In [None]:
# Shape of the encoded training matrix.
train_matrix.shape

In [None]:
# Peek at the first ten columns of the first two rows.
train_matrix[0:2,:10]

In [None]:
# Shape of the encoded validation matrix.
val_matrix.shape

In [None]:
# Shape of the encoded test matrix.
test_matrix.shape

In [None]:
# Model inputs: independent copies of the encoded matrices.
# NOTE(review): the copies duplicate every matrix in memory; if nothing
# mutates x_* separately from *_matrix they could be dropped — confirm.
x_train, x_val, x_test = (
    np.array(matrix, copy=True)
    for matrix in (train_matrix, val_matrix, test_matrix)
)
# Targets: the 'fake?' label column of each split.
y_train, y_val, y_test = (
    subset['fake?'].values for subset in (train, validation, test)
)


### Aquí comienza la configuración y el entrenamiento de la red base

In [None]:
# Baseline model: two hidden layers of 16 ReLU units, each followed by
# 50% dropout, and a single sigmoid output unit for the binary label.
model = Sequential(name="modelo_base")
# The input width follows the feature matrix instead of a hard-coded 20000,
# so changing the tokenizer vocabulary size cannot silently mismatch the model.
model.add(Dense(16, activation='relu', input_shape=(x_train.shape[1],)))
model.add(Dropout(0.5))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.summary()

In [None]:
# Early stopping on validation loss: stop after 4 epochs without an
# improvement of at least 0.01 and restore the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0.01,
    patience=4,
    restore_best_weights=True,
    verbose=1,
)
callbacks_list = [early_stopping]

In [None]:
# Compile the model: binary cross-entropy loss, RMSprop optimizer, accuracy metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [None]:
# Train for up to 20 epochs in batches of 512, evaluating on the validation
# split after each epoch and applying the configured callbacks.
history = model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_val, y_val),
    epochs=20,
    batch_size=512,
    callbacks=callbacks_list,
)

In [None]:
# `history.history` is a dictionary of per-epoch metric values.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)

# Loss curves. Both figures now carry a title and both axis labels so that
# each saved image can be read on its own.
plt.figure(figsize=(12,8))
plt.title('Modelo de base')
plt.plot(epochs, loss)
plt.plot(epochs, val_loss)
plt.xticks(ticks=list(epochs))
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(['Training loss', 'Validation loss'])
# The 'Graficos' directory must already exist for savefig to succeed.
plt.savefig('Graficos/10_Loss.png')

# Accuracy curves.
plt.figure(figsize=(12,8))
plt.title('Modelo de base')
plt.plot(epochs, acc)
plt.plot(epochs, val_acc)
plt.xticks(ticks=list(epochs))
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(['Training accuracy', 'Validation accuracy'])
plt.savefig('Graficos/11_Accuracy.png')

### Testeo

In [None]:
# Sequential.predict_classes was removed in TensorFlow 2.6. For a single
# sigmoid output the equivalent is thresholding the predicted probability at 0.5.
y_pred = (model.predict(x_test) > 0.5).astype("int32")

In [None]:
# Shape of the prediction array.
y_pred.shape

In [None]:
# Flatten the predictions to one dimension and keep the result; previously
# the reshaped array was only displayed and y_pred itself stayed unchanged.
# The operation is idempotent, so re-running the cell is safe.
y_pred = np.reshape(y_pred, y_pred.shape[0])
y_pred

In [None]:
# First five true labels of the test set.
y_test[0:5]

In [None]:
# Labels were assigned as 0 = real and 1 = fake, and target_names follow the
# label order, so the names must read ['Not Fake', 'Fake']; the previous order
# reported each class's metrics under the other class's name.
# labels=[0, 1] pins the order explicitly.
print(classification_report(y_test, y_pred, labels=[0, 1], target_names=['Not Fake', 'Fake']))

In [None]:
# Confusion matrix with rows/columns ordered as label 0, then label 1.
cm = confusion_matrix(y_test, y_pred , labels=[0, 1])

In [None]:
# Draw the confusion matrix as an annotated heatmap and save it.
plt.figure(figsize=(10, 10))
# Class names in the same order as labels=[0, 1] used to build `cm`
# (0 = real, 1 = fake), so the axes are readable without the code.
etiquetas = ['Not Fake', 'Fake']
hm = sns.heatmap(cm, annot=True, fmt='.0f', xticklabels=etiquetas, yticklabels=etiquetas)
plt.ylabel('Verdaderos')
plt.title('Confusion Matrix - Base Model')  # fixed typo: was 'Cofusion'
plt.xlabel('Predichos');

# The 'Graficos' directory must already exist for savefig to succeed.
fig = hm.get_figure()
fig.savefig('Graficos/12_HashMap.png')