# Proceso de clasificación de comentarios

Un procesamiento alternativo al caso anterior para poder obtener la probabilidad de ocurrencia sin necesidad de calcular la clase general (omitir columna binary) 

In [24]:
import pandas as pd
import numpy as np
import re, string

df_train = pd.read_csv('./train.csv')
df_test = pd.read_csv('./comments.csv',encoding='utf-16', sep=',',  error_bad_lines=False)

cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

def clean_text_round1(text):
    '''make text lowercase, remove punctuation.'''
    text = text.lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('[\n]', ' ', text)
    text = re.sub('@\w+', '', text)
    text = re.sub('rt', '', text)
    text = re.sub("(http://.*?\s)|(http://.*)",'',text)
    return text

df_train.comment_text = df_train.comment_text.apply(lambda x: clean_text_round1(x))
df_test.rename(columns={'comment':'comment_text'}, inplace=True)
df_test.comment_text = df_test.comment_text.apply(lambda x: clean_text_round1(x))
df_train.head(10)


b'Skipping line 41116: expected 7 fields, saw 11\nSkipping line 41247: expected 7 fields, saw 11\nSkipping line 67200: expected 7 fields, saw 11\nSkipping line 69160: expected 7 fields, saw 11\n'


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,daww he matches this background colour im seem...,0,0,0,0,0,0
2,000113f07ec002fd,hey man im really not trying to edit war its j...,0,0,0,0,0,0
3,0001b41b1c6bb37e,more i cant make any real suggestions on impr...,0,0,0,0,0,0
4,0001d958c54c6e35,you sir are my hero any chance you remember wh...,0,0,0,0,0,0
5,00025465d4725e87,congratulations from me as well use the tool...,0,0,0,0,0,0
6,0002bcb3da6cb337,cocksucker before you piss around on my work,1,1,1,0,1,0
7,00031b1e95af7921,your vandalism to the matt shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,sorry if the word nonsense was offensive to yo...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [25]:
df_train_subset = df_train.loc[:,cols]
df_train_text = df_train.loc[:, 'comment_text']
df_train_subset.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0


## Vectorizar los datos de texto
Utilizando un modelo de tf-idf podemos capturar el vocabulario a analizar y poder vectorizarlo de manera numérica la configuración del tf-idf que utilizamos fue la siguiente:

* stop_words: en inglés
* sublinear_tf: le asigna un peso a cada token palabra
* lowercase: pasar toda la info a minúscula
* strip_accents: Quita los caracteres de acentuación
* analyzer: analizamos siempre a nevel de palabras
* token pattern: palabras de 2 o mas caracteres
* ngram_range: permitir grupos de 1, 2 o 3 palabras
* max_features: cantidad máximas de features a capturar 50000


In [26]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test, y_train, y_test = train_test_split(df_train_text, df_train_subset, test_size= 0.3, random_state=13)

# Instantiate the vectorizer
word_vectorizer = TfidfVectorizer(
    stop_words='english',
    sublinear_tf=True,
    lowercase = True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{2,}',  #vectorize 2-character words or more
    ngram_range=(1, 3), #unigrams and trigrams
    max_features=50000)

# fit and transform on it the training features
word_vectorizer.fit(X_train)
X_train_word_features = word_vectorizer.transform(X_train)

#transform the test features to sparse matrix
test_features = word_vectorizer.transform(X_test)

word_features = word_vectorizer.transform(df_test['comment_text'])


In [27]:
feature_names = np.array(word_vectorizer.get_feature_names())
sorted_by_idf = np.argsort(word_vectorizer.idf_)
print("Features with lowest idf:\n{}".format(
       feature_names[sorted_by_idf[:4]]))
print("\nFeatures with highest idf:\n{}".format(
       feature_names[sorted_by_idf[-4:]]))

Features with lowest idf:
['article' 'page' 'talk' 'wikipedia']

Features with highest idf:
['faggotjeske couriano' 'faggotjeske couriano stupid' 'faggot gay'
 'criminalwar']


## Implementar modelos

En esta sección se implementan los modelos `Naive Bayes Multinomial` y `Naive Bayes Bernoulli` la literatura recomienda Bernoulli para analizar datos que tienen comportamientos binarios de verdadero o falso, lo cual se ajustan a los datos que presentamos, en un ciclo para clase de comentario tóxico entrenamos el modelo, lo exportamos y lo evaluamospara seleccionar el mejor

In [28]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report, precision_score
import sklearn.metrics as metrics
from tqdm import tqdm
import pickle

df_classification_report = pd.DataFrame(columns=['Class_Name','Log_loss_MN', 'Accuracy_MN', 'Accuracy_MN_vs_Test','Log_loss_BN', 'Accuracy_BN', 'Accuracy_BN_vs_Test'])
losses = []
auc = []
auctest= []
lossesBN = []
aucBN = []
aucBNtest = []
dict_confussion_matrix = {} # TODO ADD ELEMENTS TO DICT


for class_name in tqdm(cols):
    train_target = y_train[class_name]
    test_target  = y_test[class_name]
    
    #modelo multinomial naive bayes
    clf = MultinomialNB()
    
    cv_loss = np.mean(cross_val_score(clf, X_train_word_features, train_target,  cv=3, scoring='neg_log_loss'))
    losses.append(cv_loss)
    clf.fit(X_train_word_features, train_target)
    
    cv_score = precision_score(train_target, clf.predict(X_train_word_features))
    auc.append(cv_score)
    
    y_pred = clf.predict(test_features)
    y_pred_prob = clf.predict_proba(test_features)[:, 1]
    auc_score = metrics.roc_auc_score(test_target, y_pred)
    auctest.append(auc_score) 
    
    # TODO EXPORT CLASSIFIER AS PICKLE OBJECT ALSO THE TFIDF
    pickle.dump( clf, open( "multinomial_"+class_name+".pkl", "wb" ) )
    #plot confusion matrix
    confusion_matrix(test_target, y_pred)
    
    #modelo bayes bernoulli teoricamente trabaja mejor con variables binarias
    clf2 = BernoulliNB()
    cv_loss = np.mean(cross_val_score(clf2, X_train_word_features, train_target, cv=3, scoring='neg_log_loss'))
    lossesBN.append(cv_loss)
    clf2.fit(X_train_word_features, train_target)
    
    cv_score = precision_score(train_target, clf2.predict(X_train_word_features))
    aucBN.append(cv_score)
    y_pred = clf2.predict(test_features)
    y_pred_prob = clf2.predict_proba(test_features)[:, 1]
    auc_score = metrics.roc_auc_score(test_target, y_pred)
    aucBNtest.append(auc_score)
    pickle.dump( clf2, open( "bernoulli_"+class_name+".pkl", "wb" ) )

df_classification_report['Class_Name'] = cols
df_classification_report['Log_loss_MN'] = losses
df_classification_report['Accuracy_MN'] = auc
df_classification_report['Accuracy_MN_vs_Test'] = auctest
df_classification_report['Log_loss_BN'] = lossesBN
df_classification_report['Accuracy_BN'] = aucBN
df_classification_report['Accuracy_BN_vs_Test'] = aucBNtest


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:03<00:00,  1.62it/s]


In [29]:
df_classification_report

Unnamed: 0,Class_Name,Log_loss_MN,Accuracy_MN,Accuracy_MN_vs_Test,Log_loss_BN,Accuracy_BN,Accuracy_BN_vs_Test
0,toxic,-0.177276,0.948737,0.700372,-1.287204,0.211122,0.765914
1,severe_toxic,-0.05069,0.420455,0.507131,-0.451709,0.27654,0.548347
2,obscene,-0.122314,0.901398,0.68053,-0.333266,0.187398,0.836873
3,threat,-0.025641,0.086207,0.506554,-0.416821,0.004228,0.500865
4,insult,-0.131988,0.830539,0.630679,-0.34452,0.182151,0.833506
5,identity_hate,-0.056889,0.186275,0.500863,-0.566038,0.109932,0.520863


In [30]:
submission_multinomial_nb = pd.DataFrame.from_dict({'id': df_test['id']}) #DATAFRAME DE SALIDA
submission_bernoulli_nb = pd.DataFrame.from_dict({'id': df_test['id']}) #DATAFRAME DE SALIDA
for class_name in tqdm(cols):
    clf  = pickle.load( open( "multinomial_"+class_name+".pkl", "rb" ) )
    clf2 = pickle.load( open( "bernoulli_" +class_name+".pkl", "rb" ) )
    
    y_pred_prob = clf.predict_proba(word_features)[:, 1]
    submission_multinomial_nb[class_name] = y_pred_prob
    
    y_pred_prob = clf2.predict_proba(word_features)[:, 1]
    submission_bernoulli_nb[class_name] = y_pred_prob
    
submission_multinomial_nb = pd.merge(submission_multinomial_nb, df_test, on='id')
submission_bernoulli_nb = pd.merge(submission_bernoulli_nb, df_test, on='id')

    

100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 25.60it/s]


## Exportar resultados
Esta sección se encarga de analizar y exportar los resultados de los dataframes de salida al bucket en s3, se observa que los resultados de ernoulli tienen a ser mas extremos o muy cercanos a cero o muy cercanos a 1 pero no existen valores intermedios.

In [31]:
submission_multinomial_nb.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text,creation_time,source,tweet_id,user,user_id
0,0,0.105708,0.004472,0.041888,0.001209,0.042474,0.004801,rt nosurrenderhk guardiannews the truth is hk ...,2020-06-13 16:07:02,Twitter for Android,1271836487663808513,currentecalamo,1005852785609326592
1,1,0.036546,0.00058,0.011783,0.000135,0.011103,0.000622,rt marisakabas content warning police brutalit...,2020-06-13 16:07:02,Twitter for Android,1271836487663775744,Donald Dire,1088300096666591232
2,2,0.295848,0.000986,0.04582,0.000123,0.044702,0.001886,rt shahmiruk it is absolutely unfair amp disin...,2020-06-13 16:07:02,Twitter for iPhone,1271836487693275144,tanya cochrane 🕷#FBPE,25872176
3,3,0.065238,0.002006,0.024562,0.000536,0.022834,0.003012,rt autotheoryqueen terfs police the boundaries...,2020-06-13 16:07:02,Twitter for iPhone,1271836487747862529,Michael Bermingham,59031350
4,4,0.115526,0.001275,0.023836,0.000428,0.020619,0.001658,looks like a scary demon witch monster to me,2020-06-13 16:07:02,Twitter for iPhone,1271836487676579841,untossable chum,2771192143


In [33]:
submission_bernoulli_nb.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text,creation_time,source,tweet_id,user,user_id
0,0,0.997282,1.092819e-12,0.779152,1.607791e-56,0.853161,3.243856e-15,rt nosurrenderhk guardiannews the truth is hk ...,2020-06-13 16:07:02,Twitter for Android,1271836487663808513,currentecalamo,1005852785609326592
1,1,0.427635,4.902138e-15,0.018746,1.567621e-56,0.040786,8.615133000000001e-17,rt marisakabas content warning police brutalit...,2020-06-13 16:07:02,Twitter for Android,1271836487663775744,Donald Dire,1088300096666591232
2,2,0.999996,5.829787e-13,0.994642,6.330744e-57,0.994836,7.841235e-14,rt shahmiruk it is absolutely unfair amp disin...,2020-06-13 16:07:02,Twitter for iPhone,1271836487693275144,tanya cochrane 🕷#FBPE,25872176
3,3,0.994582,2.830223e-14,0.756531,7.192166000000001e-60,0.779185,2.081745e-16,rt autotheoryqueen terfs police the boundaries...,2020-06-13 16:07:02,Twitter for iPhone,1271836487747862529,Michael Bermingham,59031350
4,4,0.999635,1.908485e-14,0.920415,3.875811e-59,0.958583,9.044567000000001e-17,looks like a scary demon witch monster to me,2020-06-13 16:07:02,Twitter for iPhone,1271836487676579841,untossable chum,2771192143
