# Lesson 9. Интеграция. Итоговый проект

Определение вероятности того, что комментарий содержит оскорбление (метка `insult`), по тексту комментария и остальным меткам токсичности

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('train.csv')

In [3]:
df.head(3)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0


In [4]:
df.identity_hate.value_counts()

0    158166
1      1405
Name: identity_hate, dtype: int64

In [5]:
df.isna().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [7]:
# Target: the 0/1 `insult` label.
y = df['insult']
# Features: everything except the target itself and the row identifier.
# NOTE(review): the remaining label columns (toxic, obscene, ...) stay in X as
# features, so they must also be available at prediction time — confirm intended.
X = df.drop(columns=['id', 'insult'])

In [8]:
# Labels of all int64-typed columns of X; these are fed to the model unchanged
# via NumberSelector further down.
# NOTE(review): the name is a misspelling of "continuous", and the columns look
# like 0/1 label flags rather than continuous values — name kept as is because
# later cells reference it.
continious_cols = X.select_dtypes(include='int64').columns
continious_cols

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'identity_hate'], dtype='object')

In [9]:
cat_cols = ['comment_text']

In [10]:
df['insult'].value_counts()

0    151694
1      7877
Name: insult, dtype: int64

In [11]:
from sklearn.metrics import classification_report, precision_recall_curve, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import StandardScaler

Создаём классы-трансформеры, которые выбирают нужный столбец из DataFrame

In [12]:
class NumberSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts one column as a single-column frame.

    Parameters
    ----------
    key : column label to select from the input passed to ``transform``.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by a one-element list holding ``self.key``.

        List-style indexing keeps the result two-dimensional (one column).
        """
        selected_columns = [self.key]
        return X[selected_columns]
    
    
class CatSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts one column by its label.

    Parameters
    ----------
    key : column label to select from the input passed to ``transform``.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[self.key]`` — the column itself, not a one-column frame."""
        column = X[self.key]
        return column

In [13]:
# Build the (name, pipeline) pairs that are later combined by FeatureUnion.

# Text columns: select the column, then vectorize it with TF-IDF.
final_transformers = [
    (
        cat_col,
        Pipeline([
            ('selector', CatSelector(key=cat_col)),
            ('tfdf', TfidfVectorizer()),
        ]),
    )
    for cat_col in cat_cols
]

# Integer columns: a selector-only pipeline, i.e. values are passed through as is.
final_transformers.extend(
    (num_col, Pipeline([('selector', NumberSelector(key=num_col))]))
    for num_col in continious_cols
)

In [14]:
feats = FeatureUnion(final_transformers)

In [15]:
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [16]:
# Hold out 25% of the rows for testing; stratify on the target so both parts
# keep the same class ratio, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=50,
)

In [17]:
# pipeline = Pipeline([
#     ('features', feats),
#     ('model', LGBMClassifier(random_state=1))
# ])

In [18]:
# pipeline.get_params()

{'memory': None,
 'steps': [('features',
   FeatureUnion(transformer_list=[('comment_text',
                                   Pipeline(steps=[('selector',
                                                    CatSelector(key='comment_text')),
                                                   ('tfdf', TfidfVectorizer())])),
                                  ('toxic',
                                   Pipeline(steps=[('selector',
                                                    NumberSelector(key='toxic'))])),
                                  ('severe_toxic',
                                   Pipeline(steps=[('selector',
                                                    NumberSelector(key='severe_toxic'))])),
                                  ('obscene',
                                   Pipeline(steps=[('selector',
                                                    NumberSelector(key='obscene'))])),
                                  ('threat',
                                 

In [19]:
# Search grid for GridSearchCV: 6 x 5 = 30 candidates. The `model__` prefix
# addresses the pipeline step named 'model'.
parameters = dict(
    model__reg_lambda=[0.001, 0.01, 100, 10, 1, 0.1],
    model__learning_rate=[0.001, 0.01, 0.1, 0.5, 1],
    # Optional TF-IDF settings, currently excluded from the search:
    # features__comment_text__tfdf__stop_words=('english', None),
    # features__comment_text__tfdf__max_features=[10, 50, 100, 300, 500, 1000, None],
)


In [20]:
# grid = GridSearchCV(pipeline, parameters, cv=3, verbose=1)
# grid.fit(X_train, y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


In [21]:
# grid.best_params_

{'model__learning_rate': 0.1, 'model__reg_lambda': 0.001}

In [22]:
# grid.best_score_

0.9783335269015311

In [34]:
# Final model: feature union followed by LightGBM, using the learning_rate and
# reg_lambda values reported as best by the grid search above.
pipeline = Pipeline(
    steps=[
        ('features', feats),
        (
            'model',
            LGBMClassifier(
                learning_rate=0.1,
                reg_lambda=0.001,
                random_state=1,
            ),
        ),
    ]
)

In [35]:
pipeline.fit(X_train, y_train)

In [36]:
train_pred = pipeline.predict(X_train)

In [37]:
print(classification_report(y_train, train_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99    113770
           1       0.87      0.83      0.85      5908

    accuracy                           0.99    119678
   macro avg       0.93      0.91      0.92    119678
weighted avg       0.99      0.99      0.99    119678



In [38]:
# Saving the model: the fitted pipeline is serialized with dill in the last cell

In [39]:
# Second column of predict_proba: the score used for ROC AUC and for the
# threshold search below (presumably the probability of class 1 — labels are 0/1).
preds = pipeline.predict_proba(X_test)[:, 1]
# Hard class labels from the pipeline's own predict(), used in the classification report.
y_pred = pipeline.predict(X_test)

In [40]:
print(roc_auc_score(y_test, preds))

0.9894302585015531


In [41]:
# Pick the probability cut-off that maximizes the F-beta score on the test set.
precision, recall, threshold = precision_recall_curve(y_test, preds)
b = 1  # beta of the F-beta score; 1 weighs precision and recall equally

# precision/recall hold one more element than threshold: the final point
# (precision=1, recall=0) has no threshold. Dropping it keeps idx valid for
# all three arrays.
numerator = (1 + b ** 2) * precision[:-1] * recall[:-1]
denominator = b ** 2 * precision[:-1] + recall[:-1]
# Where precision and recall are both 0 the score is defined as 0 instead of
# computing 0/0, which would emit a RuntimeWarning and produce NaN.
f_score = np.divide(
    numerator,
    denominator,
    out=np.zeros_like(numerator),
    where=denominator != 0,
)
idx = np.argmax(f_score)

# Adjacent f-string literals instead of backslash continuations, so the source
# indentation no longer leaks into the printed text as trailing spaces.
print(
    f'b={b}, threshold = {threshold[idx]},\n'
    f'precision = {precision[idx]},\n'
    f'recall = {recall[idx]},\n'
    f'f_score = {f_score[idx]}'
)

b=1, threshold = 0.41278958033146573,       
precision = 0.7640287769784173,       
recall = 0.8090401218892839,       
f_score = 0.7858904785397137


In [42]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     37924
           1       0.80      0.77      0.78      1969

    accuracy                           0.98     39893
   macro avg       0.89      0.88      0.89     39893
weighted avg       0.98      0.98      0.98     39893



In [43]:
import dill

In [44]:
# Serialize the whole pipeline (feature transformers + model) with dill.
# NOTE(review): the file name says "logreg", but the pipeline built above wraps
# an LGBMClassifier — name kept because consumers may load this exact path.
with open('logreg_pipeline.dill', 'wb') as model_file:
    dill.dump(pipeline, model_file)