# Proceso de clasificación de comentarios

Un procesamiento alternativo al caso anterior para poder obtener la probabilidad de ocurrencia sin necesidad de calcular la clase general (omitir columna binary) 

In [2]:
import pandas as pd
import numpy as np
import re, string

df_train = pd.read_csv('./train.csv')
df_test = pd.read_csv('./test.csv')

cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

def clean_text_round1(text):
    '''make text lowercase, remove punctuation.'''
    text = text.lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('[\n]', ' ', text)
    text = re.sub('@\w+', '', text)
    text = re.sub('RT', '', text)
    text = re.sub("(http://.*?\s)|(http://.*)",'',text)
    return text

df_train.comment_text = df_train.comment_text.apply(lambda x: clean_text_round1(x))
df_test.comment_text = df_test.comment_text.apply(lambda x: clean_text_round1(x))
df_train.head(10)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,daww he matches this background colour im seem...,0,0,0,0,0,0
2,000113f07ec002fd,hey man im really not trying to edit war its j...,0,0,0,0,0,0
3,0001b41b1c6bb37e,more i cant make any real suggestions on impr...,0,0,0,0,0,0
4,0001d958c54c6e35,you sir are my hero any chance you remember wh...,0,0,0,0,0,0
5,00025465d4725e87,congratulations from me as well use the tool...,0,0,0,0,0,0
6,0002bcb3da6cb337,cocksucker before you piss around on my work,1,1,1,0,1,0
7,00031b1e95af7921,your vandalism to the matt shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,sorry if the word nonsense was offensive to yo...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [5]:
df_train_subset = df_train.loc[:,cols]
df_train_text = df_train.loc[:, 'comment_text']
df_train_subset.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test, y_train, y_test = train_test_split(df_train_text, df_train_subset, test_size= 0.3, random_state=13)

# Instantiate the vectorizer
word_vectorizer = TfidfVectorizer(
    stop_words='english',
    sublinear_tf=True,
    lowercase = True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{2,}',  #vectorize 2-character words or more
    ngram_range=(1, 3), #bigrams and trigrams
    max_features=50000)

# fit and transform on it the training features
word_vectorizer.fit(X_train)
X_train_word_features = word_vectorizer.transform(X_train)

#transform the test features to sparse matrix
test_features = word_vectorizer.transform(X_test)

word_features = word_vectorizer.transform(df_test['comment_text'])


In [14]:
feature_names = np.array(word_vectorizer.get_feature_names())
sorted_by_idf = np.argsort(word_vectorizer.idf_)
print("Features with lowest idf:\n{}".format(
       feature_names[sorted_by_idf[:4]]))
print("\nFeatures with highest idf:\n{}".format(
       feature_names[sorted_by_idf[-4:]]))

Features with lowest idf:
['article' 'page' 'talk' 'wikipedia']

Features with highest idf:
['faggotjeske couriano' 'faggotjeske couriano stupid' 'faggot gay'
 'criminalwar']


In [42]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report
import sklearn.metrics as metrics
from tqdm import tqdm
import pickle

df_classification_report = pd.DataFrame(columns=['Class_Name','Log_loss_MN', 'Accuracy_MN', 'Accuracy_MN_vs_Test','Log_loss_BN', 'Accuracy_BN', 'Accuracy_BN_vs_Test'])
losses = []
auc = []
auctest= []
lossesBN = []
aucBN = []
aucBNtest = []
dict_confussion_matrix = {} # TODO ADD ELEMENTS TO DICT


for class_name in tqdm(cols):
    train_target = y_train[class_name]
    test_target  = y_test[class_name]
    
    #modelo multinomial naive bayes
    clf = MultinomialNB()
    
    cv_loss = np.mean(cross_val_score(clf, X_train_word_features, train_target,  scoring='neg_log_loss'))
    losses.append(cv_loss)

    cv_score = np.mean(cross_val_score(clf, X_train_word_features, train_target, scoring='accuracy')) 
    auc.append(cv_score)
    
    clf.fit(X_train_word_features, train_target)
    y_pred = clf.predict(test_features)
    y_pred_prob = clf.predict_proba(test_features)[:, 1]
    auc_score = metrics.roc_auc_score(test_target, y_pred_prob)
    auctest.append(auc_score) 
    
    # TODO EXPORT CLASSIFIER AS PICKLE OBJECT ALSO THE TFIDF
    pickle.dump( clf, open( "multinomial_"+class_name+".pkl", "wb" ) )
    #plot confusion matrix
    confusion_matrix(test_target, y_pred)
    
    #modelo bayes bernoulli teoricamente trabaja mejor con variables binarias
    clf2 = BernoulliNB()
    cv_loss = np.mean(cross_val_score(clf2, X_train_word_features, train_target,  scoring='neg_log_loss'))
    lossesBN.append(cv_loss)
    
    cv_score = np.mean(cross_val_score(clf2, X_train_word_features, train_target, scoring='accuracy'))
    aucBN.append(cv_score)
    
    clf2.fit(X_train_word_features, train_target)
    y_pred_prob = clf2.predict_proba(test_features)[:, 1]
    auc_score = metrics.roc_auc_score(test_target, y_pred_prob)
    aucBNtest.append(auc_score)
    pickle.dump( clf2, open( "bernoulli_"+class_name+".pkl", "wb" ) )

df_classification_report['Class_Name'] = cols
df_classification_report['Log_loss_MN'] = losses
df_classification_report['Accuracy_MN'] = auc
df_classification_report['Accuracy_MN_vs_Test'] = auctest
df_classification_report['Log_loss_BN'] = lossesBN
df_classification_report['Accuracy_BN'] = aucBN
df_classification_report['Accuracy_BN_vs_Test'] = aucBNtest


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:07<00:00,  1.25s/it]


In [30]:
df_classification_report

Unnamed: 0,Class_Name,Log_loss_MN,Accuracy_MN,Accuracy_MN_vs_Test,Log_loss_BN,Accuracy_BN,Accuracy_BN_vs_Test
0,toxic,-0.168507,0.938316,0.938062,-1.53545,0.661626,0.91396
1,severe_toxic,-0.04931,0.989105,0.921206,-0.374949,0.985076,0.96253
2,obscene,-0.115896,0.962685,0.936481,-0.522364,0.845469,0.9389
3,threat,-0.025703,0.996195,0.828433,-0.446806,0.984816,0.763169
4,insult,-0.125314,0.959257,0.928853,-0.533785,0.852085,0.931289
5,identity_hate,-0.056012,0.98983,0.84865,-0.503205,0.98094,0.911599


In [43]:
submission_multinomial_nb = pd.DataFrame.from_dict({'id': df_test['id']}) #DATAFRAME DE SALIDA
submission_bernoulli_nb = pd.DataFrame.from_dict({'id': df_test['id']}) #DATAFRAME DE SALIDA
for class_name in tqdm(cols):
    clf  = pickle.load( open( "multinomial_"+class_name+".pkl", "rb" ) )
    clf2 = pickle.load( open( "bernoulli_" +class_name+".pkl", "rb" ) )
    
    y_pred_prob = clf.predict_proba(word_features)[:, 1]
    submission_multinomial_nb[class_name] = y_pred_prob
    
    y_pred_prob = clf2.predict_proba(word_features)[:, 1]
    submission_bernoulli_nb[class_name] = y_pred_prob
    
submission_multinomial_nb = pd.merge(submission_multinomial_nb, df_test, on='id')
submission_bernoulli_nb = pd.merge(submission_bernoulli_nb, df_test, on='id')

    

100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00,  9.66it/s]


In [44]:
submission_multinomial_nb.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text
0,00001cee341fdb12,0.956822,0.002098,0.742024,5.3e-05,0.522567,0.001949,yo bitch ja rule is more succesful then youll ...
1,0000247867823ef7,0.007265,0.000167,0.002499,5.1e-05,0.002293,0.000152,from rfc the title is fine as it is imo
2,00013b17ad220c46,0.016521,0.000218,0.005404,5.6e-05,0.004855,0.000317,sources zawe ashton on lapland —
3,00017563c3f7919a,0.003667,3e-05,0.001193,5e-06,0.00102,2.4e-05,if you have a look back at the source the info...
4,00017695ad8997eb,0.031284,0.000727,0.013144,0.000116,0.012405,0.000563,i dont anonymously edit articles at all


In [45]:
submission_bernoulli_nb.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text
0,00001cee341fdb12,1.0,0.9598616,1.0,1.28096e-40,1.0,0.9553691,yo bitch ja rule is more succesful then youll ...
1,0000247867823ef7,0.927365,1.809172e-16,0.093966,4.6389199999999997e-63,0.129329,2.948856e-20,from rfc the title is fine as it is imo
2,00013b17ad220c46,0.997806,1.385914e-15,0.865698,1.164451e-63,0.886537,5.172023999999999e-19,sources zawe ashton on lapland —
3,00017563c3f7919a,0.000533,1.332273e-19,4.9e-05,3.0391740000000003e-62,3.2e-05,6.400431e-22,if you have a look back at the source the info...
4,00017695ad8997eb,0.982876,1.846849e-15,0.626982,1.02431e-62,0.687772,6.166766999999999e-19,i dont anonymously edit articles at all
