In [1]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas as pd
import xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

Using TensorFlow backend.


- loading the text only dataset

In [3]:
df = pd.read_csv("../dataset/train_text_only")
df.head()

Unnamed: 0,comment_text,target
0,"This is so cool. It's like, 'would you want yo...",0.0
1,Thank you!! This would make my life a lot less...,0.0
2,This is such an urgent design problem; kudos t...,0.0
3,Is this something I'll be able to install on m...,0.0
4,haha you guys are a bunch of losers.,0.893617


- converting labels to 0 and 1

In [4]:
def encode_label(l):
    if l < 0.5:
        return 0
    return 1

In [5]:
df['target'] = df['target'].apply(lambda x: 0 if x < 0.5 else 1)

In [6]:
df.head()

Unnamed: 0,comment_text,target
0,"This is so cool. It's like, 'would you want yo...",0
1,Thank you!! This would make my life a lot less...,0
2,This is such an urgent design problem; kudos t...,0
3,Is this something I'll be able to install on m...,0
4,haha you guys are a bunch of losers.,1


- Spilitting dataset to training and validation sets 

In [7]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(df['comment_text'], df['target'])

- vectorizing the text using simple word count

In [8]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(df['comment_text'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='\\w{1,}', tokenizer=None,
        vocabulary=None)

In [9]:
x_train_count = count_vect.transform(train_x)

In [10]:
x_valid_count = count_vect.transform(valid_x)

- vectorizing the text using TF-IDF measure

In [11]:
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)

In [12]:
tfidf_vect.fit(df['comment_text'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=True,
        vocabulary=None)

In [13]:
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

- A function to train the model and calculate the accuracy on validation set

In [14]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    classifier.fit(feature_vector_train, label)
    predictions = classifier.predict(feature_vector_valid)
    return metrics.accuracy_score(predictions, valid_y)

In [15]:
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)

In [16]:
accuracy

0.9237022377160536

- Preparing an output to sublit to Kaggle

In [18]:
test_df = pd.read_csv("../dataset/test.csv")
test_df.head()

Unnamed: 0,id,comment_text
0,7000000,Jeff Sessions is another one of Trump's Orwell...
1,7000001,I actually inspected the infrastructure on Gra...
2,7000002,No it won't . That's just wishful thinking on ...
3,7000003,Instead of wringing our hands and nibbling the...
4,7000004,how many of you commenters have garbage piled ...


In [19]:
xtest_tfidf =  tfidf_vect.transform(test_df['comment_text'])

In [20]:
clf = naive_bayes.MultinomialNB()

In [21]:
clf.fit(xtrain_tfidf, train_y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [22]:
prediction = clf.predict(xtest_tfidf)

In [23]:
prediction

array([0, 0, 0, ..., 0, 0, 0])

In [24]:
test_df['prediction'] = prediction

In [25]:
test_df.head()

Unnamed: 0,id,comment_text,prediction
0,7000000,Jeff Sessions is another one of Trump's Orwell...,0
1,7000001,I actually inspected the infrastructure on Gra...,0
2,7000002,No it won't . That's just wishful thinking on ...,0
3,7000003,Instead of wringing our hands and nibbling the...,0
4,7000004,how many of you commenters have garbage piled ...,0


In [26]:
output_df = test_df[['id', 'prediction']]

In [28]:
output_df.to_csv("../dataset/submission.csv", index=False)