In [36]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

In [7]:
#load datasets
trainset = pd.read_csv("train.csv")
testset = pd.read_csv("test.csv",)

In [13]:
#create sets
text = trainset["comment_text"]
toxic_score = trainset["toxic"]
severe_toxic_score = trainset["severe_toxic"]
obscene_score = trainset["obscene"]
threat_score = trainset["threat"]
insult_score = trainset["insult"]
identity_hate_score = trainset["identity_hate"]

In [70]:
#preprocess
toxic_score.value_counts()

0    86614
1     9237
Name: toxic, dtype: int64

In [32]:
#pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    #'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    #'clf__n_iter': (10, 50, 80),
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
grid_search.fit(text, severe_toxic_score)


print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  9.8min finished


Best score: 0.990
Best parameters set:
	clf__alpha: 1e-05
	clf__penalty: 'elasticnet'
	vect__max_df: 0.75
	vect__ngram_range: (1, 2)


In [64]:
#train a SGD classifier  - Toxic
toxic_count_vect = TfidfVectorizer(max_df = 0.5,ngram_range=(1,1))
X = toxic_count_vect.fit_transform(text)
y = toxic_score
ToxicEstimator = SGDClassifier(penalty="elasticnet",alpha=0.000005,loss="log")

#completely fit
ToxicEstimator.fit(X,y)

SGDClassifier(alpha=5e-06, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
       penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [77]:
#train a SGD classifier  - severe_toxic
severe_count_vect = TfidfVectorizer(max_df = 0.75,ngram_range=(1,2))
X = severe_count_vect.fit_transform(text)
y = severe_toxic_score
SevToxicEstimator = SGDClassifier(penalty="elasticnet",alpha=0.000005,loss="log")

#completely fit
SevToxicEstimator.fit(X,y)

SGDClassifier(alpha=5e-06, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
       penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [86]:
#train a SGD classifier  - obscene
obscene_count_vect = TfidfVectorizer(max_df = 0.75,ngram_range=(1,2))
X = obscene_count_vect.fit_transform(text)
y = obscene_score
obsceneEstimator = SGDClassifier(penalty="elasticnet",alpha=0.000005,loss="log")

#completely fit
obsceneEstimator.fit(X,y)

SGDClassifier(alpha=5e-06, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
       penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [92]:
#train a SGD classifier  - threat
threat_count_vect = TfidfVectorizer(max_df = 0.75,ngram_range=(1,2))
X = threat_count_vect.fit_transform(text)
y = threat_score
threatEstimator = SGDClassifier(penalty="elasticnet",alpha=0.000005,loss="log")

#completely fit
threatEstimator.fit(X,y)

SGDClassifier(alpha=5e-06, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
       penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [95]:
#train a SGD classifier  - insult
insult_count_vect = TfidfVectorizer(max_df = 0.75,ngram_range=(1,2))
X = insult_count_vect.fit_transform(text)
y = insult_score
insultEstimator = SGDClassifier(penalty="elasticnet",alpha=0.000005,loss="log")

#completely fit
insultEstimator.fit(X,y)

SGDClassifier(alpha=5e-06, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
       penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [99]:
#train a SGD classifier  - id
id_count_vect = TfidfVectorizer(max_df = 0.75,ngram_range=(1,2))
X = id_count_vect.fit_transform(text)
y = identity_hate_score
idEstimator = SGDClassifier(penalty="elasticnet",alpha=0.000005,loss="log")

#completely fit
idEstimator.fit(X,y)

SGDClassifier(alpha=5e-06, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
       penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [106]:
def get_output(id_num,text):
    #vectorize
    text = [text]
    toxic_vec = toxic_count_vect.transform(text)
    sev_toxic_vec = severe_count_vect.transform(text)
    obscene_vec = obscene_count_vect.transform(text)
    threat_vec = threat_count_vect.transform(text)
    insult_vec = insult_count_vect.transform(text)
    id_vec = id_count_vect.transform(text)
    #predict
    toxic_value = "{0:.1f}".format(ToxicEstimator.predict_proba(toxic_vec)[0][1])
    sev_toxic_value = "{0:.1f}".format(SevToxicEstimator.predict_proba(sev_toxic_vec)[0][1])   
    obs_value = "{0:.1f}".format(obsceneEstimator.predict_proba(obscene_vec)[0][1])
    threat_value = "{0:.1f}".format(threatEstimator.predict_proba(threat_vec)[0][1])
    insult_value = "{0:.1f}".format(threatEstimator.predict_proba(insult_vec)[0][1])
    id_value = "{0:.1f}".format(idEstimator.predict_proba(id_vec)[0][1])
    return(str(id_num)+","+toxic_value +","+sev_toxic_value+","+obs_value+","+threat_value+","+insult_value+","+id_value)

get_output(1,"Is non other than an ungraceful dick!")

'1,0.9,0.0,0.6,0.0,0.0,0.0'

In [None]:
from_zero = False
start_from = 52300
if from_zero:
    f = open('present.csv','w')
    f.write('id,toxic,severe_toxic,obscene,threat,insult,identity_hate')
    for item in testset.iterrows():
        id_num =item[1][0]
        text = item[1][1]
        result = get_output(id_num,str(text))
        f.write('\n')
        f.write(result)
    f.close()
else:
    f = open('present.csv','a')
    for item in testset.iloc[start_from:].iterrows():
        id_num =item[1][0]
        text = item[1][1]
        result = get_output(id_num,str(text))
        f.write('\n')
        f.write(result)
    f.close()


In [144]:
testset.iloc[52296:].head()
get_output(1,'N/A')
#testset.iloc[52300][1]

'1,0.1,0.0,0.0,0.0,0.0,0.0'