In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords # Import the stop word list
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
import random
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score, confusion_matrix,accuracy_score

In [2]:
# Read the labelled training comments and the unlabelled test comments
# from the CSV files that sit next to this notebook.
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Preview the first five training rows (id, raw comment, six label columns).
train_data.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
# Preview the first five test rows (id and raw comment only, no labels).
test_data.head(n=5)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [5]:
# A comment is "neutral" (1) when none of the six label columns is set,
# otherwise 0. The labels sit at column positions 2..7 (toxic through
# identity_hate), so the positional slice stays valid when this cell is re-run
# after 'neutral' has been appended at position 8.
# Summing the columns in one vectorized call replaces a Python-level lambda
# invoked once per row.
train_data['neutral'] = (train_data.iloc[:, 2:8].sum(axis=1) < 1).astype(int)
print(train_data.head())

                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  neutral  
0             0        0       0       0              0        1  
1             0        0       0       0              0        1  
2             0        0       0       0              0        1  
3             0        0       0       0              0        1  
4             0        0       0       0              0        1  


In [6]:
def process_comment_text(txt):
    """Normalise a raw comment to lowercase ASCII letters and spaces.

    Every character outside ``a-z``/``A-Z`` (digits, punctuation, newlines,
    non-ASCII letters, ...) is replaced by a single space, one-for-one, and the
    result is lower-cased. Runs of spaces are NOT collapsed, so the output has
    the same length as the input.

    Parameters
    ----------
    txt : str
        Raw comment text. Non-string values (e.g. the float NaN pandas uses
        for a missing cell, or ``None``) are treated as an empty comment.

    Returns
    -------
    str
        The cleaned text, or ``""`` when ``txt`` is not a string.
    """
    # re.sub raises TypeError on non-strings; map missing values to "" so a
    # single empty cell cannot abort the whole column-wise apply.
    if not isinstance(txt, str):
        return ""
    # Substitute first, then lower-case: lower-casing first could turn some
    # non-ASCII characters into ASCII letters that would then survive the filter.
    return re.sub(r"[^a-zA-Z]", " ", txt).lower()

# Store the cleaned text next to the raw comment in both frames, then show
# the first rows of each frame to sanity-check the result.
train_data['processed'] = train_data['comment_text'].map(process_comment_text)
test_data['processed'] = test_data['comment_text'].map(process_comment_text)
print(train_data.head(), test_data.head(), sep='\n')

                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  neutral  \
0             0        0       0       0              0        1   
1             0        0       0       0              0        1   
2             0        0       0       0              0        1   
3             0        0       0       0              0        1   
4             0        0       0       0              0        1   

                                           processed  
0  explanation why the edits made under my userna

# TF-IDF Vectorizer and Logistic Regression (One-vs-All)

In [7]:
def class_model(data, labels, C=100, random_state=None):
    """Fit a logistic-regression classifier on a class-balanced subsample.

    The larger of the two classes is randomly down-sampled (without
    replacement) to the size of the smaller one, the selected rows are
    shuffled, and a ``LogisticRegression`` model is fitted on them.

    Parameters
    ----------
    data : array-like or sparse matrix, one row per sample
        Feature matrix; must support ``data[row_indices, :]`` indexing.
    labels : array-like of 0/1
        Binary target, one entry per row of ``data``.
    C : float, default 100
        Inverse regularisation strength passed to ``LogisticRegression``.
    random_state : int or None, default None
        Seed for the subsampling and shuffling. ``None`` gives a different
        subsample on every call; pass an int for a reproducible one.

    Returns
    -------
    LogisticRegression
        The fitted model.

    Raises
    ------
    ValueError
        If ``labels`` contains no 0 or no 1, so no balanced set can be built.
    """
    # Coerce to ndarray so the positional indexing below also works when a
    # pandas Series (label-indexed) is passed in.
    labels = np.asarray(labels)
    negative_ind = np.flatnonzero(labels == 0)
    positive_ind = np.flatnonzero(labels == 1)

    n_per_class = min(len(negative_ind), len(positive_ind))
    if n_per_class == 0:
        raise ValueError(
            "class_model needs at least one positive (1) and one negative (0) label"
        )

    # One generator drives both the subsampling and the shuffle, so a single
    # seed makes the whole selection reproducible.
    rng = np.random.RandomState(random_state)

    # Down-sample whichever class is larger; sampling without replacement
    # cannot draw more items than exist, so the minority is kept whole.
    if len(negative_ind) > n_per_class:
        negative_ind = rng.choice(negative_ind, size=n_per_class, replace=False)
    if len(positive_ind) > n_per_class:
        positive_ind = rng.choice(positive_ind, size=n_per_class, replace=False)

    train_ind = np.concatenate((positive_ind, negative_ind))
    rng.shuffle(train_ind)

    model = LogisticRegression(C=C, random_state=random_state)
    model.fit(data[train_ind, :], labels[train_ind])
    return model


#vect = TfidfVectorizer(min_df=3,stop_words='english').fit(train_data.processed.values)
#x_train_vectorized = vect.transform(train_data.processed.values)
#x_test_vectorized = vect.transform(test_data.processed.values)

In [None]:
# Character-level TF-IDF over 1- to 6-character n-grams of the cleaned text.
# `stop_words` is intentionally not passed: scikit-learn only applies it when
# analyzer == 'word', so with analyzer='char' it had no effect.
vect = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode', analyzer='char',
                       ngram_range=(1, 6))
# fit_transform learns the vocabulary/IDF and vectorizes the training text in
# one pass instead of analysing the training corpus twice (fit, then transform).
x_train_vectorized = vect.fit_transform(train_data.processed.values)
# The test text is only transformed, using the vocabulary learned from train.
x_test_vectorized = vect.transform(test_data.processed.values)

In [None]:
# Submission frame: starts with just the test ids (same index as test_data);
# one probability column per label is appended below.
res = test_data[['id']].copy()

# One-vs-all: for each of the six label columns (positions 2..7 of train_data)
# fit an independent binary model and keep its probability for class 1.
for label_name in train_data.columns[2:8]:
    target = np.array(train_data[label_name])
    fitted = class_model(x_train_vectorized, target)
    class_probs = fitted.predict_proba(x_test_vectorized)
    # predict_proba columns follow fitted.classes_; locate the column for label 1.
    positive_col = np.squeeze(np.where(fitted.classes_ == 1))
    res[label_name] = class_probs[:, positive_col]

In [None]:
# Show only the first rows of the submission frame; printing the full frame
# floods the notebook output without adding information.
res.head()

In [None]:
# Write the predictions (id + one probability column per label) to disk,
# leaving out the DataFrame index.
res.to_csv(path_or_buf='tfid_reg.csv', index=False)