## Imports

In [93]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix, hstack
import random
from nltk.corpus import words
import nltk
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

## Read data

In [2]:
# Load the labelled training split and the unlabelled test split from the
# current working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [7]:
# Flag comments that carry none of the six toxicity labels. The labels sit at
# positional columns 2..7 (toxic .. identity_hate); 'neutral' is appended after
# them, so re-running this cell does not shift the slice.
# A vectorised row sum replaces the per-row Python lambda.
label_totals = train_data.iloc[:, 2:8].sum(axis=1)
train_data['neutral'] = (label_totals < 1).astype('int64')
train_data.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,neutral
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,1
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,1
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,1
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,1
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,1
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0,1
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0,1
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0,1
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0,1


## Preprocess

In [57]:
def preprocess(data):
    """Add a normalised ``processed`` text column to ``data`` in place.

    The ``comment_text`` column is lower-cased, then every character that is
    neither an ASCII letter nor whitespace, and every newline, is replaced by
    the placeholder token ``' NONCHAR '``.

    Missing comments are treated as empty strings and non-string values are
    converted with ``str`` so that one bad row does not abort the whole pass.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``comment_text`` column. It is modified in place.

    Returns
    -------
    None
        Nothing is returned, so calling this as the last line of a cell does
        not display the frame.
    """
    lowered = data['comment_text'].fillna('').astype(str).str.lower()
    # The text is already lower-case here; the pattern is kept verbatim so the
    # placeholder positions stay exactly as before.
    data['processed'] = lowered.map(
        lambda text: re.sub(r'[^a-zA-Z\s]|\n', r' NONCHAR ', text)
    )

# Add the 'processed' column to both splits (each frame is modified in place).
for frame in (train_data, test_data):
    preprocess(frame)

## Feature Extraction

In [118]:
def _count_features(frame):
    """Build the hand-crafted count columns for one data frame.

    Returns a sparse matrix with one row per comment and three columns, in
    this order:

    1. character length of ``processed``
    2. number of ``' NONCHAR '`` placeholders in ``processed``
       (probably already captured by tf-idf as its own token)
    3. number of uppercase ASCII letters in the raw ``comment_text``
    """
    counts = np.column_stack([
        frame['processed'].str.len(),
        frame['processed'].str.count(r' NONCHAR '),
        frame['comment_text'].apply(lambda x: len(re.sub(r'[^A-Z]', '', x))),
    ])
    return csr_matrix(counts)


# Fit the tf-idf vocabulary on the training comments only, then reuse the same
# vocabulary for the test comments so both matrices share their columns.
vect = TfidfVectorizer(min_df=3, stop_words='english').fit(train_data.processed.values)

# tf-idf columns first, followed by the three count features.
x_train_vectorized = hstack(
    [vect.transform(train_data.processed.values), _count_features(train_data)], 'csr'
)
x_test_vectorized = hstack(
    [vect.transform(test_data.processed.values), _count_features(test_data)], 'csr'
)

# TODO: proportion of real words per comment (to catch misspellings and
# neologisms). An earlier draft checked every nltk.word_tokenize token except
# 'NONCHAR' against words.words() and stacked the mean as an extra column. It
# was left disabled (and used mismatched variable names for the test split),
# so it is not part of the matrices above.

In [119]:
# Sanity check: both matrices should report the same number of feature columns.
train_shape = x_train_vectorized.shape
test_shape = x_test_vectorized.shape
print(train_shape, test_shape)

(159571, 50452) (153164, 50452)


## Define and Run Model

In [120]:
def _balanced_indices(labels, rng):
    """Return shuffled row indices holding equally many 0- and 1-labelled rows."""
    negative_ind = np.where(labels == 0)[0]
    positive_ind = np.where(labels == 1)[0]
    # Undersample whichever class is larger. Sampling a fixed number of
    # negatives would fail if positives ever outnumbered them.
    n_per_class = min(len(negative_ind), len(positive_ind))
    if n_per_class == 0:
        raise ValueError('labels must contain at least one 0 and one 1')
    sampled = [
        rng.choice(class_ind, size=n_per_class, replace=False)
        for class_ind in (positive_ind, negative_ind)
    ]
    # Shuffle so the cross-validation folds do not see the classes as two
    # contiguous runs.
    return rng.permutation(np.concatenate(sampled))


def class_model(data, labels, random_state=777, tol=10):
    """Fit a cross-validated logistic regression on a class-balanced subsample.

    Parameters
    ----------
    data : matrix supporting ``data[row_indices, :]``
        Feature matrix with one row per sample.
    labels : array-like of 0/1
        Binary target aligned with the rows of ``data``.
    random_state : int, optional
        Seed for both the undersampling/shuffling and the estimator, so that
        repeated calls produce the same training subset.
    tol : float, optional
        Stopping tolerance passed to the solver.
        NOTE(review): the default of 10 is kept from the original code but is
        far looser than scikit-learn's own default (1e-4); confirm this is
        intentional and not a stand-in for a small value.

    Returns
    -------
    LogisticRegressionCV
        Estimator fitted on the balanced subsample, with the regularisation
        strength chosen by 5-fold ROC-AUC.

    Raises
    ------
    ValueError
        If ``labels`` lacks either class.
    """
    labels = np.asarray(labels)
    rng = np.random.RandomState(random_state)
    train_ind = _balanced_indices(labels, rng)
    data = data[train_ind, :]
    labels = labels[train_ind]

    model = LogisticRegressionCV(
        # Candidate inverse regularisation strengths: 1e-10 .. 1e9.
        Cs=list(np.power(10.0, np.arange(-10, 10))),
        penalty='l2',
        scoring='roc_auc',
        cv=5,
        random_state=random_state,
        max_iter=10000,
        fit_intercept=True,
        solver='newton-cg',
        tol=tol,
        refit=True,
    )
    model.fit(data, labels)
    return model

In [121]:
# Submission frame: one row per test comment, keyed by the comment id.
res = pd.DataFrame(columns=['id'], index=test_data.index)
res['id'] = test_data['id']

# Train one balanced binary classifier per label column (positional columns
# 2..7 of the training frame) and store the probability of the positive class.
for label_name in train_data.columns[2:8]:
    target = np.array(train_data[label_name])
    fitted = class_model(x_train_vectorized, target)
    class_probs = fitted.predict_proba(x_test_vectorized)
    # predict_proba columns follow fitted.classes_; pick the one for class 1.
    positive_col = np.squeeze(np.where(fitted.classes_ == 1))
    res[label_name] = class_probs[:, positive_col]

In [122]:
# Display the submission frame: test ids plus one predicted-probability column
# per label.
res

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999976,0.792764,0.999946,0.913802,0.980236,0.863706
1,0000247867823ef7,0.012908,0.184698,0.016563,0.037860,0.148614,0.214841
2,00013b17ad220c46,0.149292,0.166081,0.132662,0.043222,0.246954,0.226726
3,00017563c3f7919a,0.003006,0.104855,0.004856,0.071792,0.069203,0.124901
4,00017695ad8997eb,0.151507,0.171135,0.063297,0.111141,0.168149,0.264246
5,0001ea8717f6de06,0.029679,0.080482,0.005375,0.046922,0.114053,0.197948
6,00024115d4cbde0f,0.011794,0.094483,0.021293,0.021126,0.093405,0.198531
7,000247e83dcc1211,0.990763,0.271749,0.305970,0.321511,0.510670,0.487504
8,00025358d4737918,0.358209,0.184456,0.104500,0.132567,0.262115,0.274055
9,00026d1092fe71cc,0.019644,0.118415,0.017289,0.035386,0.116398,0.218762


In [123]:
# Write the predictions to disk; the positional index is left out of the file.
submission_path = 'tfidplus_reg.csv'
res.to_csv(submission_path, index=False)