In [0]:
'''
from google.colab import drive
drive.mount('/content/drive')
'''

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import copy
import pickle
import re, string

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
# disable warnings
import warnings


def warn(*args, **kwargs):
  """No-op replacement for ``warnings.warn``: accepts anything, does nothing."""
  return None


# Rebind the module-level function so every later warnings.warn(...) call
# lands in the no-op above.
warnings.warn = warn
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack
from sklearn.preprocessing import MinMaxScaler
from sklearn.multiclass import OneVsRestClassifier
from scipy.sparse import hstack


In [0]:
# read preprocessed data
train = pd.read_csv("train_train_final_processed.csv")
val = pd.read_csv("train_val_final_processed.csv")

# Any comment_text entry that is not a str is replaced by the placeholder
# "something", so the column holds only strings in both splits.
for split in (train, val):
    split['comment_text'] = split['comment_text'].apply(
        lambda text: text if isinstance(text, str) else "something")

In [0]:
# read original data (training split first, validation split second)
train_orig, val_orig = (
    pd.read_csv(csv_path)
    for csv_path in ("train_train_final.csv", "train_val_final.csv")
)

In [0]:
# read feature engineered data (training split first, validation split second)
df_train_fe, df_val_fe = (
    pd.read_csv(csv_path)
    for csv_path in ("train_train_final_fe.csv", "train_val_final_fe.csv")
)

In [0]:
# The comment text column of each split is the model input.
X_train = train['comment_text']
X_val = val['comment_text']

# Report shape and container type of both inputs.
for tag, texts in (('x_train : ', X_train), ('x_val : ', X_val)):
    print(tag, texts.shape, type(texts))

x_train :  (143614,) <class 'pandas.core.series.Series'>
x_val :  (15957,) <class 'pandas.core.series.Series'>


In [0]:
# Label columns; the model cells below fit and score one classifier per entry,
# reading the targets from train[category] / val[category].
categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [0]:
# Wider list of engineered feature names. NOTE(review): nothing in the visible
# cells reads fe_list; the column selection below uses new_fe_list instead.
# Presumably kept as a record of an earlier candidate set; confirm before removing.
fe_list = ['afinn_min', 'sentiment', 'personal_pronoun_per_word', 'syllable_std', 'SMOG',
           'num_big_words', 'capital_per_char',
           'big_words_per_word', 'num_preposition', 'syllable_max',
           'lowercase_per_char', 'GFI', 'sentence_std', 'FRE',
           'has_you_then_verb', 'num_words_upper',
           'num_words_title']

In [0]:
# Engineered feature columns actually kept: df_train_fe / df_val_fe are
# narrowed to exactly these columns before scaling.
new_fe_list = ['num_big_words', 'SMOG', 'GFI', 'num_words_title']

In [0]:
# add features: keep only the columns named in new_fe_list, in that order
df_train_fe = df_train_fe.loc[:, new_fe_list]
df_val_fe = df_val_fe.loc[:, new_fe_list]

In [0]:
def add_features(df):
    """Derive two ratio features from the ``comment_text`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``comment_text`` column. Every value is passed through
        ``str()`` first. The input frame is never modified.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df`` with two columns:

        * ``caps_vs_length``  - upper-case characters / total characters
        * ``words_vs_unique`` - distinct whitespace-separated tokens / tokens

        A ratio whose denominator is zero (empty comment) is reported as 0.

    Raises
    ------
    KeyError
        If ``df`` has no ``comment_text`` column.
    """
    text = df['comment_text'].apply(str)

    total_length = text.apply(len)
    capitals = text.apply(lambda comment: sum(1 for c in comment if c.isupper()))
    # Raw string: '\S' is not a valid escape in a plain string literal.
    num_words = text.str.count(r'\S+')
    num_unique_words = text.apply(lambda comment: len(set(comment.split())))

    # Series division turns 0/0 into NaN instead of raising ZeroDivisionError
    # (which the previous per-row float division did on an empty comment);
    # fillna(0) then maps those undefined ratios to 0.
    selected_fe = pd.DataFrame(
        {
            'caps_vs_length': capitals / total_length,
            'words_vs_unique': num_unique_words / num_words,
        },
        index=df.index,
    )
    return selected_fe.fillna(0)

In [0]:
# df_train_fe = add_features(train_orig)
# df_val_fe = add_features(val_orig)

In [0]:
# perform minmax scaler on added features
# The scaler is fitted on the training features only and then applied to both
# splits; each name is rebound to the transformed output.
mm = MinMaxScaler().fit(df_train_fe)
df_train_fe, df_val_fe = mm.transform(df_train_fe), mm.transform(df_val_fe)

In [0]:
# Tokenize the text with tfidf_vectorizer (default settings), fitted on the
# training comments only
count_vect = TfidfVectorizer()
count_vect.fit(X_train)

# transform the training, validation data using the fitted vectorizer object
xtrain_count, xval_count = (
    count_vect.transform(texts) for texts in (X_train, X_val)
)

In [0]:
'''
# this vectorization is specific for improved logistic regression
import re
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): 
  return re_tok.sub(r' \1 ', s).split()
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
                      min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
                      smooth_idf=1, sublinear_tf=1 )

xtrain_count = vec.fit_transform(X_train)
xval_count = vec.transform(X_val)
'''

### OneVsRestClassifier

- LogisticRegression
- MultinomialNB
- LGBMClassifier
- DecisionTreeClassifier

### Logistic Regression

In [0]:
# One-vs-rest logistic regression on the TF-IDF matrix stacked with the scaled
# engineered features. One model is fitted, pickled and scored per label.
# random_state pins the stochastic 'sag' solver so reruns are repeatable
# (same seed value as the other seeded models in this notebook).
clf_logreg = OneVsRestClassifier(
    LogisticRegression(solver='sag', C=4, random_state=7), n_jobs=1)
result = []

# The stacked design matrices do not depend on the label: build them once
# instead of on every fit / predict call inside the loop.
train_features = hstack([xtrain_count, df_train_fe])
val_features = hstack([xval_count, df_val_fe])

for category in categories:
    print('... Processing {}'.format(category))

    pkl_filename = 'logreg_'+category+'_model.pkl'

    # fit on the training split for this label
    clf_logreg.fit(train_features, train[category])

    # persist the fitted model; the context manager closes the file handle
    with open(pkl_filename, "wb") as model_file:
        pickle.dump(clf_logreg, model_file)

    # hard labels and class probabilities on the validation split
    logreg_pred = clf_logreg.predict(val_features)
    prediction = clf_logreg.predict_proba(val_features)

    # validation ROC AUC, scored from column 1 of predict_proba
    result.append(roc_auc_score(val[category], prediction[:,1]))

# mean ROC AUC over all labels
print('Average : ', np.mean(result))

... Processing toxic
... Processing severe_toxic
... Processing obscene
... Processing threat
... Processing insult
... Processing identity_hate
Average :  0.9840221676440876


### Multinomial NB

In [0]:
from sklearn.naive_bayes import MultinomialNB

In [0]:
# One-vs-rest multinomial naive Bayes on the TF-IDF matrix alone,
# refitted and scored once per label.
clf_nb = OneVsRestClassifier(MultinomialNB())
result = list()

for category in categories:
    print('... Processing {}'.format(category))

    # fit on the training split for the current label
    y_train = train[category]
    clf_nb.fit(xtrain_count, y_train)

    # hard labels and class probabilities for the validation split
    nb_pred = clf_nb.predict(xval_count)
    prediction = clf_nb.predict_proba(xval_count)

    # validation ROC AUC from column 1 of predict_proba
    auc = roc_auc_score(val[category], prediction[:,1])
    result.append(auc)


# mean ROC AUC over all labels
print('Average : ', np.mean(result))

... Processing toxic
... Processing severe_toxic
... Processing obscene
... Processing threat
... Processing insult
... Processing identity_hate
Average :  0.8446367983190326


### LightGBM

In [0]:
import lightgbm as lgb

In [0]:
# One-vs-rest LightGBM classifier (seeded with random_state=7) on the TF-IDF
# matrix alone, refitted and scored once per label.
clf_lgbm = OneVsRestClassifier(lgb.LGBMClassifier(random_state=7))
result = list()

for category in categories:
    print('... Processing {}'.format(category))

    # fit on the training split for the current label
    y_train = train[category]
    clf_lgbm.fit(xtrain_count, y_train)

    # hard labels and class probabilities for the validation split
    lgbm_pred = clf_lgbm.predict(xval_count)
    prediction = clf_lgbm.predict_proba(xval_count)

    # validation ROC AUC from column 1 of predict_proba
    auc = roc_auc_score(val[category], prediction[:,1])
    result.append(auc)

# mean ROC AUC over all labels
print('Average : ', np.mean(result))

... Processing toxic
... Processing severe_toxic
... Processing obscene
... Processing threat
... Processing insult
... Processing identity_hate
Average :  0.9469535691289264


### Decision Tree

In [0]:
from sklearn.tree import DecisionTreeClassifier

In [0]:
# One-vs-rest decision tree (seeded with random_state=7) on the TF-IDF matrix
# alone, refitted and scored once per label.
clf_dt = OneVsRestClassifier(DecisionTreeClassifier(random_state=7))
result = list()

for category in categories:
    print('... Processing {}'.format(category))

    # fit on the training split for the current label
    y_train = train[category]
    clf_dt.fit(xtrain_count, y_train)

    # hard labels and class probabilities for the validation split
    dt_pred = clf_dt.predict(xval_count)
    prediction = clf_dt.predict_proba(xval_count)

    # validation ROC AUC from column 1 of predict_proba
    auc = roc_auc_score(val[category], prediction[:,1])
    result.append(auc)

# mean ROC AUC over all labels
print('Average : ', np.mean(result))

... Processing toxic
... Processing severe_toxic
... Processing obscene
... Processing threat
... Processing insult
... Processing identity_hate
Average :  0.7385250126814928
