# Lesson 9. Интеграция. Итоговый проект

Определение вероятности того, что комментарий содержит оскорбление (метка `insult`), по тексту комментария и остальным меткам токсичности

In [530]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [531]:
# Load the training table; 'train.csv' is resolved relative to the current
# working directory.
df = pd.read_csv('train.csv')

In [532]:
# Peek at the first three rows to see the column layout.
df.head(3)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0


In [533]:
# Class balance of the identity_hate label.
df.identity_hate.value_counts()

0    158166
1      1405
Name: identity_hate, dtype: int64

In [534]:
# Number of missing values in each column.
df.isna().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [535]:
# Print column dtypes, non-null counts and the frame's memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [536]:
# Target: the binary `insult` label.
y = df['insult']
# Features: every other column of the frame.
# NOTE(review): this leaves `id` and the remaining label columns in X --
# confirm those labels are really meant to be model inputs.
X = df.drop(columns=['insult'])

In [537]:
# Treat every int64 column of X as a numeric feature. With X built from df
# minus `insult`, these are the remaining label columns.
# The (misspelled) name `continious_cols` is kept because later cells use it.
continious_cols = X.select_dtypes(include='int64').columns
continious_cols

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'identity_hate'], dtype='object')

In [538]:
# Text column(s) that get their own TF-IDF sub-pipeline further down.
cat_cols = ['comment_text']

In [539]:
# Class balance of the target label.
df['insult'].value_counts()

0    151694
1      7877
Name: insult, dtype: int64

In [540]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [541]:
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, precision_recall_curve, roc_auc_score

In [542]:
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator

In [543]:
class NumberSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that selects one column and keeps the result tabular.

    ``key`` is the column label. ``transform`` indexes its input with the
    one-element list ``[key]``, so a pandas DataFrame input comes back as a
    one-column DataFrame rather than a Series.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Learn nothing; return this transformer unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the single-element list ``[self.key]``."""
        columns = [self.key]
        return X[columns]
    
    
class CatSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that pulls a single column out of its input by label.

    ``key`` is the column label. ``transform`` indexes with the bare key (not
    a list), so a pandas DataFrame input comes back as a one-dimensional
    Series -- the form a text vectorizer can iterate over.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Learn nothing; return this transformer unchanged."""
        return self

    def transform(self, X):
        """Return ``X[self.key]``."""
        column = self.key
        return X[column]

In [588]:
# One (name, sub-pipeline) pair per input column, in FeatureUnion's format.
# Text columns: select the column, then turn it into TF-IDF features
# (English stop words removed, vocabulary capped by max_features=20).
final_transformers = [
    (
        text_col,
        Pipeline([
            ('selector', CatSelector(key=text_col)),
            ('tfdf', TfidfVectorizer(stop_words='english', max_features=20)),
        ]),
    )
    for text_col in cat_cols
]

# Numeric columns are passed through as-is: the sub-pipeline only selects them.
final_transformers.extend(
    (numeric_col, Pipeline([('selector', NumberSelector(key=numeric_col))]))
    for numeric_col in continious_cols
)

In [589]:
# Combine all per-column sub-pipelines; FeatureUnion concatenates their
# outputs side by side into a single feature matrix.
feats = FeatureUnion(final_transformers)

In [590]:
from catboost import CatBoostClassifier

In [591]:
# End-to-end model: feature extraction followed by a CatBoost classifier
# (100 trees, learning rate 0.01, fixed seed, training log silenced).
pipeline = Pipeline([
    ('features', feats),
    ('model', CatBoostClassifier(random_state=1,
                                 learning_rate=0.01,
                                 n_estimators=100,
                                 silent=True))
])

In [592]:
# Hold out 25% of the rows for testing. Stratifying on y keeps the share of
# insult rows the same in both parts; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=5, stratify=y)

In [593]:
# Fit the feature transformers and the classifier on the training split only.
pipeline.fit(X_train, y_train)

In [594]:
# Hard class labels for both splits (consumed by the classification reports).
y_pred = pipeline.predict(X_test)
train_pred = pipeline.predict(X_train)
# Probability of the positive class (column 1) for the held-out rows.
preds = pipeline.predict_proba(X_test)[:, 1]

In [595]:
# ROC AUC of the predicted positive-class probabilities on the test split.
print(roc_auc_score(y_test, preds))

0.9761967333667629


In [596]:
# Find the probability cut-off that maximises the F-beta score on the test split.
precision, recall, threshold = precision_recall_curve(y_test, preds)
b = 1  # beta; 1 weighs precision and recall equally (F1)

# precision/recall end with an extra point that has no matching threshold,
# so drop it: every F-score index is then a valid index into `threshold`.
numerator = (1 + b ** 2) * precision[:-1] * recall[:-1]
denominator = b ** 2 * precision[:-1] + recall[:-1]
# Where precision and recall are both 0 the ratio is 0/0; define F as 0 there
# instead of producing NaN and a RuntimeWarning.
f_score = np.divide(numerator, denominator,
                    out=np.zeros_like(denominator), where=denominator > 0)
idx = np.argmax(f_score)
print(f'b={b}, threshold = {threshold[idx]}, \
      \nprecision = {precision[idx]}, \
      \nrecall = {recall[idx]}, \
      \nf_score = {f_score[idx]}')

b=1, threshold = 0.3286307080955003,       
precision = 0.7394957983193278,       
recall = 0.8044692737430168,       
f_score = 0.7706154220384335


In [597]:
# Per-class precision / recall / F1 on the training split, computed from the
# hard labels in train_pred.
print(classification_report(y_train, train_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99    113770
           1       0.75      0.76      0.75      5908

    accuracy                           0.98    119678
   macro avg       0.87      0.87      0.87    119678
weighted avg       0.98      0.98      0.98    119678



In [598]:
# Per-class precision / recall / F1 on the held-out split, computed from the
# hard labels in y_pred.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     37924
           1       0.77      0.76      0.77      1969

    accuracy                           0.98     39893
   macro avg       0.88      0.88      0.88     39893
weighted avg       0.98      0.98      0.98     39893

