In [85]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

In [86]:
train = pd.read_csv("train_data.csv")
test = pd.read_csv("test_data.csv")


In [87]:
train

Unnamed: 0,Sentence,Bad Sentence
0,HOSPITAL BEDS Some aspects of the city's prepa...,0
1,END APMGPURUWUPUGPB,1
2,END NRAFLFVEFEIRFII,1
3,"In a separate transaction, VW said it will pay...",0
4,B 1 B 2: Financial Instruments with similar ec...,1
...,...,...
3995,By Chris Wack Aurora Cannabis Inc. shares were...,0
3996,Details of any open stock settled derivative p...,1
3997,"People want to know what ‚Äôs happening, and w...",0
3998,Forward looking statements include any stateme...,1


In [88]:
test

Unnamed: 0,Sentence,Bad Sentence
0,The performance of a benchmark shall not be in...,0
1,No media Q&A.,0
2,"PARIS, March 25( Reuters) - Essilor Luxottica ...",0
3,""" For over two decades Exela has been providin...",0
4,.................................................,1
...,...,...
995,AMEX RESUMED INVSCO ADV MPLII < VKI.A >,1
996,Short Link: https://tmsnrt.rs/37e DHOk Video T...,1
997,Following is a list of Massachusetts drivers i...,0
998,Changes in the rates of exchange between curre...,1


In [89]:
from gensim import utils
import gensim.parsing.preprocessing as gsp

filters = [
           gsp.strip_tags, 
           gsp.strip_punctuation,
           gsp.strip_multiple_whitespaces,
           gsp.strip_numeric,
           gsp.remove_stopwords, 
           gsp.strip_short, 
           gsp.stem_text
          ]

def clean_text(s):
    s = s.lower()
    s = utils.to_unicode(s)
    for f in filters:
        s = f(s)
    return s

In [90]:
clean_text(train.iloc[0,0])

'hospit bed aspect citi prepared remain unknown'

In [91]:
df_x = train['Sentence']
df_y = train['Bad Sentence']

In [92]:
df_x

0       HOSPITAL BEDS Some aspects of the city's prepa...
1                                     END APMGPURUWUPUGPB
2                                     END NRAFLFVEFEIRFII
3       In a separate transaction, VW said it will pay...
4       B 1 B 2: Financial Instruments with similar ec...
                              ...                        
3995    By Chris Wack Aurora Cannabis Inc. shares were...
3996    Details of any open stock settled derivative p...
3997    People want to know what ‚Äôs happening, and w...
3998    Forward looking statements include any stateme...
3999    Industrial & commercial accounted for 52% of B...
Name: Sentence, Length: 4000, dtype: object

In [93]:
df_y

0       0
1       1
2       1
3       0
4       1
       ..
3995    0
3996    1
3997    0
3998    1
3999    0
Name: Bad Sentence, Length: 4000, dtype: int64

In [94]:
from sklearn.base import BaseEstimator

In [95]:
class Text2TfIdfTransformer(BaseEstimator):
    def __init__(self):
        self._model = TfidfVectorizer()
        pass

    def fit(self, df_x, df_y=None):
        df_x = df_x.apply(lambda x : clean_text(x))
        self._model.fit(df_x)
        return self

    def transform(self, df_x):
        return self._model.transform(df_x)


In [96]:
tfidf_transformer = Text2TfIdfTransformer()
tfidf_vectors = tfidf_transformer.fit(df_x).transform(df_x)

In [97]:
tfidf_vectors.shape

(4000, 8164)

In [98]:
print(tfidf_vectors)

  (0, 7617)	0.7924099539714787
  (0, 5953)	0.6099889055113372
  (1, 2294)	0.39479173280371993
  (1, 334)	0.9187706393381517
  (2, 4963)	0.9187706393381517
  (2, 2294)	0.39479173280371993
  (3, 7960)	0.35176664119633005
  (3, 7192)	0.2625150460459242
  (3, 6310)	0.16800984223155385
  (3, 4318)	0.26824633954733523
  (3, 3722)	0.35176664119633005
  (3, 3515)	0.3691695866271746
  (3, 3245)	0.2293290057097024
  (3, 3094)	0.3691695866271746
  (3, 1356)	0.35176664119633005
  (3, 738)	0.2165706895470998
  (3, 730)	0.2960001882516001
  (4, 8053)	0.379335416025381
  (4, 7531)	0.2859929595125821
  (4, 6623)	0.32279706414257475
  (4, 6505)	0.31924221338475905
  (4, 5335)	0.25448820820448337
  (4, 5097)	0.34608616215320404
  (4, 4973)	0.19192185303476203
  (4, 3611)	0.28773746207666034
  :	:
  (3996, 6454)	0.2176101993868852
  (3996, 5097)	0.6534068761890128
  (3996, 5079)	0.4755743969808578
  (3996, 5067)	0.37111738028976304
  (3996, 2931)	0.2852119313163929
  (3996, 2716)	0.24023547216577315
  (3

In [99]:
pl_xgb_tf_idf = Pipeline(steps=[('tfidf',Text2TfIdfTransformer()),
                         ('xgboost', xgb.XGBClassifier(objective='binary:hinge'))])

In [102]:
param = {
    'n_estimators': [100, 150],
    'max_depth': [7, 11, 15],
    'learning_rate': [0.1]
}

pl_xgb_tf_idf = Pipeline(steps=[('tfidf',Text2TfIdfTransformer()),
                         ('xgboost', GridSearchCV(xgb.XGBClassifier(objective='binary:hinge'),
                                                  param_grid=param,
                                                   cv=5,
                                                   n_jobs=-1,
                                                   scoring=f1_score
                                                 ))])

In [103]:

# clf = GridSearchCV(estimator=pl_xgb_tf_idf,
#                    param_grid={},
#                    cv=10,
#                    n_jobs=-1,
#                    scoring=f1_score)
cv_fit = pl_xgb_tf_idf.fit(df_x, df_y)



0       0
1       1
2       1
3       0
4       1
       ..
3995    0
3996    1
3997    0
3998    1
3999    0
Name: Bad Sentence, Length: 4000, dtype: int64

In [105]:
cv_fit

Pipeline(steps=[('tfidf', Text2TfIdfTransformer()),
                ('xgboost',
                 GridSearchCV(cv=5,
                              estimator=XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None,
                                                      max_depth=None,
                                                     

In [110]:
cv_fit.get_params()

{'memory': None,
 'steps': [('tfidf', Text2TfIdfTransformer()),
  ('xgboost', GridSearchCV(cv=5,
                estimator=XGBClassifier(base_score=None, booster=None,
                                        colsample_bylevel=None,
                                        colsample_bynode=None,
                                        colsample_bytree=None, gamma=None,
                                        gpu_id=None, importance_type='gain',
                                        interaction_constraints=None,
                                        learning_rate=None, max_delta_step=None,
                                        max_depth=None, min_child_weight=None,
                                        missing=nan, monotone_constraints=None,
                                        n_estimators=100, n_jobs=None,
                                        num_parallel_tree=None,
                                        objective='binary:hinge',
                                        ra