In [1]:
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'

In [2]:
import shutil

# Copy the project helper modules from the Kaggle input dataset into the
# working directory (presumably so the later `from helper import ...` style
# imports can resolve them -- confirm the working dir is on sys.path).
helper_modules = ['helper.py', 'toxic_helper.py', 'multiclass_helper.py', 'fnn_helper.py']
for module_name in helper_modules:
    original = f'/kaggle/input/helper/{module_name}'
    target = f'/kaggle/working/{module_name}'
    shutil.copyfile(original, target)

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np



# Importo dataset

In [4]:
folder = './'

In [5]:
# Load the labelled training set, the unlabelled test set and the sample
# submission template, in that order.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/toxic-comments-challenge/train.csv',
        '../input/toxic-comments-challenge/test.csv',
        '../input/toxic-comments-challenge/sample_submission.csv',
    )
)

In [6]:
print("A quick view of training set")
train.head(10)

In [7]:
print("A quick view of testing set")
test.head(10)

In [8]:
train[:10]

In [9]:
# cantidad de observaciones
train.shape

In [10]:
# Model targets: `list_classes` names the categories a comment may belong to;
# `y` is the matching label matrix (one column per category).
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train.loc[:, list_classes].to_numpy()

In [11]:
y[:10]
# veo que palabra de train pertenece a cual de las clases (0 ->  no pertenece, 1 -> pertenece)

In [12]:
# Fraction of comments that carry at least one label.
has_any_label = y.sum(axis = 1) > 0
toxic_ratio = has_any_label.sum()/y.shape[0]
print('Porcentaje de toxic comments:', toxic_ratio)

In [13]:
print(train[list_classes].sum())
print('Se ve que una gran cantidad de comentarios son toxicos')

# sumo las columnas de y para ver la cantidad de palabras en cada categoria

In [14]:
# Multilabel check: for every label other than 'toxic', count the comments
# that carry that label while NOT being flagged as 'toxic'.
not_toxic = train['toxic'] == 0
for cl in list_classes[1:]:
    has_label = train[cl] == 1
    N = (not_toxic & has_label).sum()
    print(f'Es {cl} pero no es toxic:', N)
print()
print('Se ve que si es severe_toxic => es toxic, pero ser identity_hate o insult no implica ser toxic')

# Divido entre train y validacion

In [15]:
# Hold out 10% of the labelled comments for validation. train_test_split
# shuffles the rows, so a fixed random_state is required for the split (and
# every metric computed from it) to be reproducible after a kernel restart.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    train, y, test_size = 0.1, random_state = 42
)

print(X_train.shape, X_valid.shape)
print(Y_train.shape, Y_valid.shape)

# Analisis rapido

In [16]:
# Fraction of training comments that carry at least one label.
has_label_train = Y_train.sum(axis = 1) > 0
toxic_ratio = has_label_train.sum()/Y_train.shape[0]
print('El porcentaje de comentarios toxicos es:')
print(toxic_ratio)


In [17]:
0.96-toxic_ratio + toxic_ratio*0.873
# esto no entiendo que hace

In [18]:
X_train.shape

In [19]:
# Lowercase every comment before vectorizing (the original note says this is
# done to make processing faster).

raw_text_train, raw_text_valid, raw_text_test = (
    frame["comment_text"].str.lower()
    for frame in (X_train, X_valid, test)
)

In [20]:
print(raw_text_train[0:10]) # Recordar que train_test_split hace shuffle 

In [21]:
Y_train[:10]

# Stemming o Lemmatizer

Se puede probar quitando el stemming o el Lemmatizer y ver cómo se ve afectado el resultado.

In [22]:
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer:
    """Callable that splits a document with `word_tokenize` and lemmatizes each token."""

    def __init__(self):
        # One lemmatizer instance is kept for the lifetime of the tokenizer.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatized tokens of `doc`, in document order."""
        lemmas = []
        for token in word_tokenize(doc):
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas

In [23]:
from nltk.stem.porter import PorterStemmer
import re
# Tokens are runs of two or more word characters.
token_pattern=r"(?u)\b\w\w+\b"
compiled_reg_exp = re.compile(token_pattern)

# Shared stemmer, created lazily on first use so that a new PorterStemmer is
# not constructed for every single token.
_default_stemmer = None


def tokenize(text, stemmer=None):
    """Split `text` into tokens and return their stems.

    Tokens are the matches of `compiled_reg_exp`. Any token longer than 100
    characters is replaced by the placeholder 'tooLongWord' before stemming.

    Args:
        text: The string to tokenize.
        stemmer: Optional object exposing `stem(word)`. When omitted, a single
            module-level `PorterStemmer` is created once and reused.

    Returns:
        A list with one stem per token, in order of appearance.
    """
    global _default_stemmer
    if stemmer is None:
        if _default_stemmer is None:
            _default_stemmer = PorterStemmer()
        stemmer = _default_stemmer

    stems = []
    for item in compiled_reg_exp.findall(text):
        if len(item)>100:
            item = 'tooLongWord'
        stems.append(stemmer.stem(item))
    return stems

# Armo matriz de features

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

# vectorizer le da pesos a cada una de las palabras del vocabulario
# tf idf vectorizer transforma texto a vectores de features que pueden ser usadas como entrada al estimador

max_features = 10000

tfidf_vectorizer = TfidfVectorizer(max_df=0.11, min_df=1,
                                   max_features=max_features,
                                   tokenizer=tokenize,
                                   stop_words='english')

%time tfidf_matrix_train = tfidf_vectorizer.fit_transform(raw_text_train)

In [25]:
%time tfidf_matrix_valid = tfidf_vectorizer.transform(raw_text_valid)

In [26]:
%time tfidf_matrix_test = tfidf_vectorizer.transform(raw_text_test)

## Sparsity

In [27]:
# Sparsity is the fraction of zero entries in the matrix
# (sparsity = 0.99 means 99% of the matrix is zero).
n_rows, n_cols = tfidf_matrix_train.shape
positive_entries = (tfidf_matrix_train>0).sum()
sparsity = 1 - positive_entries/(n_rows*n_cols)
print(sparsity)

## TFIDF Results

In [28]:
tfidf_matrix_train.shape

sin lemma
- 177719 con 0.95, 1
- 177712 con 0.11, 1


In [29]:
# Rank vocabulary terms by their summed TF-IDF weight over the training set
# and show the ten heaviest ones.
column_totals = np.asarray(tfidf_matrix_train.sum(axis=0)).ravel()
top_10 = np.argsort(column_totals)[::-1][:10].tolist()
feature_names = np.array(tfidf_vectorizer.get_feature_names())
print(feature_names[np.array(top_10)])

## Sin reducción de dimensionalidad

In [30]:
# dense_matrix_train = tfidf_matrix_train.todense()

In [31]:
# dense_matrix_train.shape, Y_train.shape

In [32]:
# dense_matrix_valid = tfidf_matrix_valid.todense()

## Reducimos dimensionalidad

In [33]:
from sklearn.decomposition import TruncatedSVD

In [34]:
trunSVD = TruncatedSVD(n_components=300)
%time dense_matrix_train = trunSVD.fit_transform(tfidf_matrix_train)

In [35]:
%time dense_matrix_valid = trunSVD.transform(tfidf_matrix_valid)

In [36]:
dense_matrix_train.shape, dense_matrix_valid.shape

In [37]:
%time dense_matrix_test = trunSVD.transform(tfidf_matrix_test)

In [38]:
# matriz esparsa -> densa. Por eso uso truncated SVD, porque hace cuenta rapido con matrices esparsas
tfidf_matrix_train[0].todense()
# es cuestion de como esta representada internamente

# Modelo de 1 capa densa
Armar el modelo

In [39]:
# !pip install toxic_helper
# !pip install detoxify
from toxic_helper import auc

In [40]:
# a mano
# from keras.models import Sequential
# from keras.layers import Dense


In [41]:
# model= Sequential()

In [42]:
# model.add(Dense(6, activation='sigmoid',input_shape=(300,)))

In [43]:
# model.summary() 
# numero de parametros es 1806 = 6 * 300 (entradas) + 6 de cada suma

In [44]:
# debo definir la funcion de costo y el optimizador, cuando compilo el modelo
# model.compile(loss="binary_crossentropy", optimizer = "sgd")
# en vez de sgd en optimizer se puede poner "adam" y probar. Adam tiene optimizaciones que vamos a ver

In [45]:
# fit manda datos y anotaciones (y_train)
# model.fit(dense_matrix_train, Y_train, batch_size= 128, epochs= 10)

In [46]:
# pred_test= model.predict(dense_matrix_test)

In [47]:
from tensorflow import keras

from keras.models import Sequential
from keras import optimizers
from keras.layers.core import Dense, Activation
from helper import PlotLosses
from keras.callbacks import ModelCheckpoint
from keras.initializers import RandomNormal
from keras import regularizers
from keras import initializers

In [48]:
# Initial kernel weights: zero-mean Gaussian with a very small spread.
# `RandomNormal` (already imported above) is used instead of the lowercase
# `initializers.normal` alias, which newer Keras releases do not appear to
# export -- NOTE(review): confirm against the installed Keras version.
default_initializer = RandomNormal(mean=0, stddev=0.001)
# default_initializer = 'zeros'

In [49]:
# Network dimensions: one input per column of the reduced feature matrix and
# one output per label column.
input_features = dense_matrix_train.shape[1]
output_size = Y_train.shape[1]
hidden_units = 400
lambd = 0 #0.001  (L2 penalty coefficient passed to every Dense layer)
model_sig_nn = Sequential()
# Only the first layer declares the input size.
model_sig_nn.add(Dense(hidden_units,
                       input_dim=input_features,
                       kernel_regularizer=regularizers.l2(lambd),
                       kernel_initializer=default_initializer,
                       name="Capa_Oculta_1"))
model_sig_nn.add(Activation('sigmoid'))
# The second hidden layer is fed by the first hidden layer (hidden_units
# values), so it must not declare input_dim=input_features as it used to.
model_sig_nn.add(Dense(hidden_units,
                       kernel_regularizer=regularizers.l2(lambd),
                       kernel_initializer=default_initializer,
                       name="Capa_Oculta_2"))
model_sig_nn.add(Activation('sigmoid'))
# Independent sigmoid per label: a comment may belong to several classes.
model_sig_nn.add(Dense(output_size,
                       kernel_regularizer=regularizers.l2(lambd),
                       kernel_initializer=default_initializer,
                       name="Capa_Salida"))
model_sig_nn.add(Activation('sigmoid', name="output"))
model_sig_nn.summary()


# Training hyper-parameters (batch_size and epochs are consumed by fit()).
lr = 0.0001
batch_size = 256
epochs = 30

#selectedOptimizer = keras.optimizers.SGD(lr=lr)
#selectedOptimizer = optimizers.adam(lr=lr, decay=0.001)
selectedOptimizer = keras.optimizers.Adam(learning_rate=lr)
model_sig_nn.compile(loss = 'binary_crossentropy', optimizer=selectedOptimizer,
                     metrics=['accuracy']) #auc

In [50]:
model_sig_nn.evaluate(dense_matrix_valid, Y_valid)

In [51]:
from keras.callbacks import ModelCheckpoint
# Checkpoint callback writing to the working directory (save_best_only=True).
checkpointer = ModelCheckpoint(
    filepath='/kaggle/working/basic_model_best.hdf5',
    verbose=1,
    save_best_only=True,
)

# PlotLosses comes from the project's helper module; it is handed the
# validation split plus its plotting/evaluation intervals.
plot_losses = PlotLosses(
    plot_interval=1,
    evaluate_interval=5,
    x_val=dense_matrix_valid,
    y_val_categorical=Y_valid,
)

fit_options = dict(
    batch_size = batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(dense_matrix_valid, Y_valid),
    callbacks=[plot_losses, checkpointer],
)
history = model_sig_nn.fit(dense_matrix_train, Y_train, **fit_options)

# Evaluo valid

In [52]:
model_sig_nn.load_weights('/kaggle/working/basic_model_best.hdf5')

In [53]:
tfidf_matrix_valid.shape, Y_valid.shape

In [54]:
# Score every split with the current model weights, in the order
# validation, train, test.
pred_valid, pred_train, pred_test = (
    model_sig_nn.predict(features, verbose = 1)
    for features in (dense_matrix_valid, dense_matrix_train, dense_matrix_test)
)

In [55]:
model_sig_nn.evaluate(dense_matrix_valid, Y_valid)

[0.073181636426992352, 0.97782721309109666]

# ROC Curve

In [56]:
from sklearn.metrics import roc_auc_score
# NOTE: this `auc` shadows the `auc` imported earlier from toxic_helper.
from sklearn.metrics import roc_curve, auc
# `scipy.interp` was a deprecated re-export of `numpy.interp`; importing it
# from NumPy keeps the same name and behaviour without relying on the alias.
from numpy import interp
from itertools import cycle

# Macro-averaged ROC AUC on the training and validation predictions.
print(roc_auc_score(Y_train, pred_train, average='macro'))
print(roc_auc_score(Y_valid, pred_valid, average='macro'))

0.930758815168  
0.920441587397

In [57]:
def _roc_summary(y_true, y_score):
    """Return (false-positive rates, true-positive rates, area under the curve)."""
    fp_rate, tp_rate, _thresholds = roc_curve(y_true, y_score)
    return fp_rate, tp_rate, auc(fp_rate, tp_rate)


# Keys are the class column index (0..n_classes-1) plus "micro".
fpr, tpr, roc_auc = {}, {}, {}
n_classes = Y_valid.shape[1]
for i in range(n_classes):
    fpr[i], tpr[i], roc_auc[i] = _roc_summary(Y_valid[:, i], pred_valid[:, i])

# Micro-average: pool every (sample, class) pair into a single curve.
fpr["micro"], tpr["micro"], roc_auc["micro"] = _roc_summary(
    Y_valid.ravel(), pred_valid.ravel()
)

In [58]:
from matplotlib import pyplot as plt
# Macro-average ROC curve and area, followed by the combined plot.
lw = 2

# Common grid: every false-positive rate observed in any per-class curve.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

# Interpolate each class curve on that grid, add them up, then average.
mean_tpr = sum(interp(all_fpr, fpr[i], tpr[i]) for i in range(n_classes))
mean_tpr = mean_tpr / n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

plt.figure()

# The two averaged curves share the same dotted, thick line style.
average_curves = (
    ("micro", 'micro-average ROC curve (area = {0:0.2f})', 'deeppink'),
    ("macro", 'macro-average ROC curve (area = {0:0.2f})', 'navy'),
)
for average_key, label_template, line_color in average_curves:
    plt.plot(fpr[average_key], tpr[average_key],
             label=label_template.format(roc_auc[average_key]),
             color=line_color, linestyle=':', linewidth=4)

# One solid curve per class; the three colors repeat cyclically.
colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
for i, color in zip(range(n_classes), colors):
    class_label = 'ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i])
    plt.plot(fpr[i], tpr[i], color=color, lw=lw, label=class_label)

# Diagonal reference line.
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
plt.show()

http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html

**True Positive Rate**:   
The number of times your system was able to classify the positives as positives. 

True positive rate = Correctly Classified Positives/(Correctly Classified as Positives+ Falsely Classified as Negatives)

**False Positive Rate**:  
The number of times your system classified a negative as a positive divided by the total  actual negative instances.


False positive rate = Incorrectly Classified as Positives/(Incorrectly Classified as Positives+ Correctly classified as Negatives )

https://en.wikipedia.org/wiki/Receiver_operating_characteristic

https://datascience.stackexchange.com/questions/15989/micro-average-vs-macro-average-performance-in-a-multiclass-classification-settin/16001

Macro-average: Calcula el score de cada clase y luego promedia  
Micro-average: Suma y luego calcula el score

Micro-average se considera mejor cuando hay desbalance en las clases.

# Interpretación

- toxic
- severe_toxic
- obscene
- threat
- insult
- identity_hate

In [59]:
(model_sig_nn.get_weights()[0]).shape

In [60]:
# Index of the output class to inspect (position in list_classes).
salida = 2

# The first kernel has shape (SVD components, hidden units), so its rows are
# NOT vocabulary terms and its columns are NOT output classes. To rank terms
# for one output, chain the SVD loadings with the three Dense kernels.
# NOTE(review): this ignores biases and the sigmoid activations, so it is only
# a rough linear approximation of each term's influence on the output.
layer_weights = model_sig_nn.get_weights()
# get_weights() alternates kernel, bias for each Dense layer.
kernel_hidden_1, kernel_hidden_2, kernel_output = (
    layer_weights[0], layer_weights[2], layer_weights[4]
)
term_to_output = trunSVD.components_.T @ kernel_hidden_1 @ kernel_hidden_2 @ kernel_output

sorted_indexes = np.argsort(term_to_output[:, salida])[::-1]
np.array(tfidf_vectorizer.get_feature_names())[sorted_indexes][:20]

# Predict for test

In [61]:
tfidf_matrix_test = tfidf_vectorizer.transform(raw_text_test)

In [62]:
# dense_matrix_test = tfidf_matrix_test.todense()

In [63]:
pred = model_sig_nn.predict(dense_matrix_test, verbose=1)

In [64]:
pred.shape

In [65]:
1*(pred[0:10]>0.5)

In [66]:
submission[list_classes] = pred_test
submission.to_csv("prueba15.csv", index = False)

Submit1 ROC AUC en valid = 0.731196488262  
Submit2 ROC AUC en valid = 0.963410980044  
Submit3 ROC AUC en valid = 0.974042855266

In [67]:
# ~/.local/bin/kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f submission_early_stop_2_epochs.csv -m "Early stop 2 epochs"