In [1]:
import pandas as pd
import numpy as np
import dill

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score, confusion_matrix

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

  from pandas import MultiIndex, Int64Index


In [2]:
# Load train.csv (path relative to the working directory) into a DataFrame.
# Later cells read its 'id' and 'comment_text' columns and its numeric columns.
df = pd.read_csv('train.csv')

In [3]:
# Names of every numeric-dtype column in df, as a plain Python list.
num_cols = df.select_dtypes(include='number').columns.tolist()

In [4]:
# Row-wise maximum over the numeric columns, stored positionally as 'target'.
# NOTE(review): presumably the numeric columns are 0/1 label flags, which would
# make 'target' an "any label set" indicator - confirm against train.csv.
df['target'] = df.loc[:, num_cols].max(axis='columns').values

In [5]:
# Split features and target into train/test parts. No split size is passed,
# so the library default applies; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    df[['id', 'comment_text']],  # features: identifier plus raw comment text
    df['target'],                # target column derived from the numeric columns
    random_state=42,
)

In [6]:
# Display the training-feature frame ('id', 'comment_text') as this cell's output.
X_train

Unnamed: 0,id,comment_text
75144,c912439967ba8a35,", I have added the newline sign back in, you ..."
154769,b312f612d3394d5b,"Don't worry, I think I fixed it. But still loo..."
48346,813c210bf7f27377,"""\nWell it seems to me that you have a problem..."
72576,c231999bc75dcd9e,Contents\nThe contents box just appears when t...
79974,d60a1de68cf593c1,""" \n\n(Btw., did you know the article links to..."
...,...,...
119879,811ed72c51830f42,REDIRECT Talk:John Loveday (experimental physi...
103694,2acc7c7d0386401f,Back it up. Post the line here with the refere...
131932,c1f95b89050a9ee4,I won't stop that. Sometimes Germanic equals G...
146867,32e8bdecfe1d66f0,"""\n\n British Bands? \n\nI think you've mista..."


In [7]:
# Write the held-out features to X_test.csv without the index column.
# Only the features are written here; y_test is not saved by this cell.
X_test.to_csv('X_test.csv', index=False)

In [8]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts a single entry from its input.

    Parameters
    ----------
    column : label
        Key used to index the input in ``transform`` (``X[column]``).
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.column]``; ``y`` is accepted but not used."""
        selected = X[self.column]
        return selected

In [9]:
# Three-step text pipeline: select the comment column, convert it to TF-IDF
# features, then classify.
pipeline = Pipeline(steps=[
    # 1. Pull the 'comment_text' entry out of the incoming frame.
    ('comment_text_selector', FeatureSelector(column='comment_text')),
    # 2. Word-level TF-IDF: unicode accent stripping, tokens of one or more
    #    word characters, English stop words, at most 10000 features.
    (
        'comment_text_tfidf',
        TfidfVectorizer(
            strip_accents='unicode',
            analyzer='word',
            token_pattern=r'\w{1,}',
            stop_words='english',
            max_features=10000,
        ),
    ),
    # 3. Final estimator. Disabled alternative for this step:
    #    ('clf', XGBClassifier(verbose=False))
    ('clf', LogisticRegression()),
])

In [10]:
# Fit the pipeline on the training split, then score the test split.
# fit() returns the fitted pipeline itself, so the calls can be chained.
# Column 1 of predict_proba is kept as the score - presumably the positive
# (target == 1) class; confirm against pipeline.classes_.
y_score = (
    pipeline
    .fit(X_train, y_train)
    .predict_proba(X_test)[:, 1]
)

In [11]:
# Precision/recall pairs and their decision thresholds for the test scores.
precision, recall, thresholds = precision_recall_curve(y_test.values, y_score)
# Beta of the F-beta score computed from this curve; 1 weights precision and
# recall equally.
b = 1

In [12]:
# F-beta score at every point of the precision-recall curve. Where precision
# and recall are both 0 the formula is 0/0 = NaN, and np.argmax would return
# the position of that NaN, so such points are scored 0 instead.
fbeta_denominator = b**2 * precision + recall
f_score = np.divide(
    (1 + b**2) * (precision * recall),
    fbeta_denominator,
    out=np.zeros_like(fbeta_denominator, dtype=float),
    where=fbeta_denominator != 0,
)
# precision_recall_curve returns one more precision/recall value than
# thresholds (the final point has no threshold). That trailing point is left
# out of the search so `ix` is always a valid index into `thresholds`;
# f_score itself keeps its full length.
ix = np.argmax(f_score[:-1])
print(f'Best Threshold={thresholds[ix]}, F-Score={f_score[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.2929884260176707, F-Score=0.786, Precision=0.840, Recall=0.737


In [None]:
# Serialize the pipeline object with dill so it can be loaded elsewhere.
# NOTE(review): the file name says "xgboost", but the pipeline built in this
# notebook ends in LogisticRegression (the XGBClassifier step is commented
# out) - confirm the name is what downstream loaders expect.
with open('xgboost_pipeline.dill', 'wb') as model_file:
    dill.dump(pipeline, model_file)