In [164]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
import random
from sklearn.metrics import roc_auc_score, confusion_matrix,accuracy_score
import matplotlib
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix, vstack, lil_matrix
from os.path import isfile
import h5py
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.models import load_model
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from scipy import sparse, io

%matplotlib inline

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [33]:
def process_comment_text(txt):
    ntxt = re.sub(r"[^a-zA-Z]", " ", txt)
    ntxt = ntxt.lower()
    lemmatizer = WordNetLemmatizer()
    text = nltk.word_tokenize(ntxt)
    return ' '.join([lemmatizer.lemmatize(w,'v') for w in text])

train_data['processed'] = train_data.comment_text.apply(process_comment_text)
test_data['processed'] = test_data.comment_text.apply(process_comment_text)
train_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,processed
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation why the edit make under my usernam...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,d aww he match this background colour i m seem...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man i m really not try to edit war it s ju...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,more i can t make any real suggestions on impr...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,you sir be my hero any chance you remember wha...


# Make features for all data

In [37]:
# see how frequent frequent words are
stop = set(stopwords.words('english'))
text = nltk.word_tokenize(' '.join(train_data.processed.values))
dist = nltk.FreqDist(text)
freq=[(w, dist[w]) for w in sorted(dist, key=dist.get, reverse=True) if w not in stop]

[('article', 74553),
 ('page', 57296),
 ('wikipedia', 48625),
 ('edit', 40862),
 ('talk', 40500),
 ('use', 33257),
 ('make', 30575),
 ('please', 29976),
 ('would', 29323),
 ('one', 29199),
 ('like', 28739),
 ('think', 25743),
 ('see', 25574),
 ('say', 25520),
 ('know', 24313),
 ('source', 23941),
 ('thank', 23896),
 ('get', 22857),
 ('go', 21878),
 ('also', 20643)]

In [None]:
print(freq[:20])
print(freq[-20:])

In [38]:
def getwordlist(data,minfreq,maxfreq):
    stop = set(stopwords.words('english'))
    text = nltk.word_tokenize(' '.join(data))
    dist = nltk.FreqDist(text)
    wordlist = [i for i in dist.keys() if dist[i]>minfreq and dist[i]<maxfreq and i not in stop]
    return wordlist

words = getwordlist(data=train_data.processed.values,minfreq=5,maxfreq=50000)

In [168]:
def make_features(data,name):
    X = csr_matrix((0, len(words)))

    # for every comment check if the words corresponding to our input vector exist
    count=0
    for i in data.processed.loc[:1000]:
        if count%1000==0:
            print(round(((count+1)/len(data.processed))*100,3),'%')
        cw = set(nltk.word_tokenize(i))
        add = [int(w in cw) for w in words]
        X = vstack([X, csr_matrix(add)], 'csr')
        count+=1
        
    y = data.loc[:1000].apply(lambda x: x[2:8],axis=1) # csr_matrix()?
    y = csr_matrix(y.values)
    
    io.mmwrite(name+'.mtx', X)
    io.mmwrite(name+'.mtx', y)
    
    # X is shape (number of examples in the data) x (numer of feature words) 

In [167]:
make_features(train_data,'train_data')
make_features(test_data,'test_data')

0.001 %
0.627 %
0.001 %
0.654 %


TypeError: no supported conversion for types: (dtype('O'),)

## Load data

In [None]:
X_train = io.mmread('train_data_X.mtx').tocsr()
y_train = io.mmread('train_data_y.mtx').tocsr()
X_test = io.mmread('test_data_X.mtx').tocsr()
y_test = io.mmread('test_data_y.mtx').tocsr()

## Specify model

In [None]:
nout = y_train.shape[1]
nin = len(words)

model = Sequential()
model.add(Dense(100, input_dim=nin, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(nout, activation='softmax'))

In [144]:
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

## Fit model

In [145]:
# to deal with sparse matrix: https://stackoverflow.com/questions/41538692/using-sparse-matrices-with-keras-and-tensorflow
def batch_generator(x_source, y_source, size):
    while True:
        for i in range(0, x_source.shape[0], size):
            j = i + size
            
            if j > x_source.shape[0]:
                j = x_source.shape[0]
                
            yield x_source[i:j].toarray(), y_source[i:j].toarray()

nb_batch=64
nb_epoch=10
            
model.fit_generator(batch_generator(X_train, y_train, nb_batch),X_train.shape[0], nb_epoch, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff310c73518>

## Check Predictions

In [156]:
score = model.evaluate_generator(batch_generator(X_test, y_test, nb_batch),len(range(0, X_test.shape[0], nb_batch)))

In [157]:
score

[0.7027327078747084, 0.553784866969424]

In [159]:
pred = model.predict_generator(batch_generator(X_test, y_test, nb_batch),len(range(0, X_test.shape[0], nb_batch)))

In [161]:
pred.shape

(251, 6)

## Save Results

In [17]:
res.to_csv('tfid_reg.csv',index=False)