In [1]:
# package import
import copy
import random
import operator
import os, math
import re, string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from os import path
from PIL import Image
from stop_words import get_stop_words
from collections import Counter, defaultdict
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import nltk
from nltk.corpus import stopwords 

import torch
import torch.utils.data as tud
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# read in the data 
def load_data(data_file):
    """Parse the CSV given by ``data_file`` and return it as a DataFrame.

    ``data_file`` is handed straight to ``pd.read_csv``.
    """
    frame = pd.read_csv(data_file)
    return frame

In [3]:
# load the training set, the test set, and the sample submission, in that order
train, test, result = (
    load_data(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [4]:
# the six target columns, one binary classifier is fitted for each of them
labels = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [5]:
class baseline_model:
    """Baseline classifier: TF-IDF features scaled by naive Bayes log-count
    ratios, with one logistic regression fitted per label.

    The pipeline is chained: ``output()`` calls ``build_model()``, which calls
    ``build_feature()``, which calls ``pre_processing()``.

    Parameters
    ----------
    train : pandas.DataFrame
        Training data with a ``comment_text`` column and one column per label.
    test : pandas.DataFrame
        Test data with a ``comment_text`` column.
    result : pandas.DataFrame
        Frame providing the ``id`` column of the submission file.
    label_cols : sequence of str, optional
        Names of the label columns. When omitted, the notebook-level ``labels``
        list is used.

    Notes
    -----
    ``train`` and ``test`` are stored by reference and modified in place by
    ``pre_processing`` (a ``none`` column is added to ``train`` and missing
    comments are filled in both frames).
    """

    # Punctuation, ASCII digits and a few typographic symbols. The digits must
    # be inside the character class: placed after it they form a literal
    # suffix and the pattern practically never matches. ``re.escape`` keeps
    # characters such as ``]``, ``\`` and ``-`` from being read as syntax.
    _RE_TOK = re.compile(
        f'([{re.escape(string.punctuation)}{string.digits}“”¨«»®´·º½¾¿¡§£₤‘’])'
    )

    def __init__(self, train, test, result, label_cols=None):
        self.train = train
        self.test = test
        self.result = result
        # Fall back to the module-level `labels` list when none is supplied.
        self.labels = list(labels if label_cols is None else label_cols)

    @staticmethod
    def _tokenize(s):
        """Split ``s`` on whitespace after isolating every punctuation mark,
        digit and special symbol as a token of its own."""
        return baseline_model._RE_TOK.sub(r' \1 ', s).split()

    def pre_processing(self):
        """Add the ``none`` indicator and fill missing comments.

        Returns
        -------
        tuple
            ``(self.train, self.test)`` after the in-place modifications.
        """
        # 'none' is 1 when a comment carries none of the labels, 0 otherwise.
        self.train['none'] = 1 - self.train[self.labels].max(axis=1)

        # Fill missing comments with "unknown". The result is assigned back
        # rather than using fillna(inplace=True) on a selected column, which
        # is chained assignment and does not update the frame under pandas
        # copy-on-write.
        self.train['comment_text'] = self.train['comment_text'].fillna("unknown")
        self.test['comment_text'] = self.test['comment_text'].fillna("unknown")

        return self.train, self.test

    def build_feature(self):
        """Fit a TF-IDF vectorizer on the training comments and transform both sets.

        Returns
        -------
        tuple
            ``(self.train_doc, self.test_doc)``: the sparse document-term
            matrices of the training and test comments.
        """
        self.train, self.test = self.pre_processing()

        # Word uni- and bi-grams weighted by TF-IDF. Flags are real booleans
        # because recent scikit-learn validates parameter types; token_pattern
        # is unused once a tokenizer is given, so it is set to None.
        vec = TfidfVectorizer(ngram_range=(1, 2),
                              tokenizer=self._tokenize,
                              token_pattern=None,
                              min_df=3,
                              max_df=0.9,
                              strip_accents='unicode',
                              use_idf=True,
                              smooth_idf=True,
                              sublinear_tf=True)

        # The vocabulary is learned from the training comments only.
        self.train_doc = vec.fit_transform(self.train['comment_text'])
        self.test_doc = vec.transform(self.test['comment_text'])

        return self.train_doc, self.test_doc

    def build_model(self):
        """Fit one logistic regression per label and predict the test set.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(len(self.test), len(self.labels))`` holding the
            predicted probability of the positive class for every label.
        """
        self.train_doc, self.test_doc = self.build_feature()

        def ratio(y_i, y):
            # Smoothed (add-one) per-feature sum over the rows of class y_i,
            # divided by the smoothed number of such rows.
            p = self.train_doc[y == y_i].sum(0)
            return (p + 1) / ((y == y_i).sum() + 1)

        def get_model(y):
            y = y.values
            # Naive Bayes log-count ratio, used to rescale every feature.
            r = np.asarray(np.log(ratio(1, y) / ratio(0, y)))
            # dual=True is only implemented by the liblinear solver, so it is
            # requested explicitly instead of relying on the default solver.
            model = LogisticRegression(C=4,
                                       dual=True,
                                       solver='liblinear')
            x_nb = self.train_doc.multiply(r)
            return model.fit(x_nb, y), r

        self.preds = np.zeros((len(self.test), len(self.labels)))

        # fit one label at a time
        for i, label in enumerate(self.labels):
            print('fit', label)

            model, r = get_model(self.train[label])
            self.preds[:, i] = model.predict_proba(self.test_doc.multiply(r))[:, 1]

        return self.preds

    def output(self, filename='NBSVM_submission.csv'):
        """Run the whole pipeline and write the submission CSV.

        Parameters
        ----------
        filename : str, optional
            Path of the CSV file to write; defaults to ``'NBSVM_submission.csv'``.

        Raises
        ------
        ValueError
            If ``self.result['id']`` and the predictions differ in length.
        """
        self.preds = self.build_model()

        # Build the frame positionally so that a non-default index on
        # `result` cannot misalign ids and predictions.
        result_submission = pd.DataFrame(self.preds, columns=self.labels)
        result_submission.insert(0, 'id', self.result['id'].values)
        result_submission.to_csv(filename, index=False)

In [6]:
# instantiate the baseline with the loaded training, test and sample-submission frames
baseline = baseline_model(train, test, result)

In [7]:
# run the full pipeline through output(), which writes the predictions to 'NBSVM_submission.csv'
baseline.output()

fit toxic




fit severe_toxic




fit obscene
fit threat
fit insult
fit identity_hate
