# Lesson 9. Интеграция. Итоговый проект

Определение вероятности того, что комментарий является оскорблением (`insult`), по тексту комментария и остальным меткам токсичности

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
df = pd.read_csv('train.csv')

In [None]:
df.head(3)

In [None]:
df.identity_hate.value_counts()

In [None]:
df.isna().sum()

In [None]:
df.info()

In [None]:
X = df.drop(columns=['insult', 'id'])
y = df.insult

In [None]:
continious_cols = X.select_dtypes(include='int64').columns
continious_cols

In [None]:
cat_cols = ['comment_text']

In [None]:
df['insult'].value_counts()

In [None]:
from sklearn.metrics import classification_report, precision_recall_curve, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import StandardScaler

Создаем классы обработки признаков

In [None]:
class NumberSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that selects one column by name using a list index.

    ``transform`` evaluates ``X[[key]]``; when ``X`` is a pandas DataFrame
    (as in this notebook) that yields a one-column DataFrame.
    """

    def __init__(self, key):
        # Kept under the same name as the constructor argument.
        self.key = key

    def fit(self, X, y=None):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed with the single-element list ``[self.key]``."""
        selected_columns = [self.key]
        return X[selected_columns]
    
    
class CatSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that selects one column by name using a scalar index.

    ``transform`` evaluates ``X[key]``; when ``X`` is a pandas DataFrame
    (as in this notebook) that yields the column as a Series.
    """

    def __init__(self, key):
        # Kept under the same name as the constructor argument.
        self.key = key

    def fit(self, X, y=None):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed with ``self.key``."""
        column_name = self.key
        return X[column_name]

In [None]:
# Text columns: select the column, then vectorise it with TF-IDF.
text_steps = [
    (
        name,
        Pipeline([
            ('selector', CatSelector(key=name)),
            ('tfdf', TfidfVectorizer()),
        ]),
    )
    for name in cat_cols
]

# Numeric columns: select the column and pass it through unchanged.
numeric_steps = [
    (name, Pipeline([('selector', NumberSelector(key=name))]))
    for name in continious_cols
]

# Same ordering as before: all text transformers first, then the numeric ones.
final_transformers = text_steps + numeric_steps

In [None]:
feats = FeatureUnion(final_transformers)

In [None]:
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

Разбиение данных на обучение и тест

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=50, stratify=y)

Подбор гиперпараметров, кросс-валидация

In [None]:
# pipeline = Pipeline([
#     ('features', feats),
#     ('model', LGBMClassifier(random_state=1))
# ])

In [None]:
# pipeline.get_params()

In [None]:
# parameters = {
#     'model__reg_lambda': [0.001, 0.01, 100, 10, 1, 0.1],
#     'model__learning_rate': [0.001, 0.01, 0.1, 0.5, 1]
# #     'features__comment_text__tfdf__stop_words': ('english', None),
# #     'features__comment_text__tfdf__max_features': [10, 50, 100, 300, 500, 1000, None]
# }


In [None]:
# grid = GridSearchCV(pipeline, parameters, cv=3, verbose=1)
# grid.fit(X_train, y_train)

In [None]:
# grid.best_params_

In [None]:
# grid.best_score_

In [None]:
# Final estimator: the `feats` feature union produces the feature matrix and
# an LGBMClassifier is fitted on top of it.
# NOTE(review): learning_rate=0.1 and reg_lambda=0.001 both appear among the
# candidates of the commented-out grid search; presumably they are the values
# that search selected - confirm before relying on them.
pipeline = Pipeline([
    ('features', feats),
    ('model', LGBMClassifier(random_state=1,
                             learning_rate=0.1,
                             reg_lambda=0.001))
])

In [None]:
pipeline.fit(X_train, y_train)

In [None]:
train_pred = pipeline.predict(X_train)

In [None]:
print(classification_report(y_train, train_pred))

In [None]:
# сохранение модели

In [None]:
X_test.to_csv('X_test.csv', index=False)
y_test.to_csv('y_test.csv', index=False)

In [None]:
import dill

In [None]:
with open('logreg_pipeline.dill', 'wb') as f:
    dill.dump(pipeline, f)

Проверка модели

In [None]:
import pandas as pd
import numpy as np
import dill
from sklearn.metrics import roc_auc_score, precision_recall_curve

In [None]:
X_test = pd.read_csv('X_test.csv')
y_test = pd.read_csv('y_test.csv')

In [None]:
X_test.head(2)

In [None]:
with open('logreg_pipeline.dill', 'rb') as f:
    pipeline = dill.load(f)

In [None]:
pipeline

In [None]:
preds = pipeline.predict_proba(X_test)[:, 1]

In [None]:
print(roc_auc_score(y_test, preds))

In [None]:
# Pick the decision threshold that maximises the F-beta score on the test set.
precision, recall, threshold = precision_recall_curve(y_test, preds)
# beta > 1 makes the F-beta score favour recall over precision.
# NOTE(review): the original rationale referred to disease detection, which
# does not match this insult-classification task - confirm that recall is
# really the priority here.
b = 1.5
# Where precision and recall are both zero the ratio is 0/0; the resulting
# NaN is skipped by nanargmax below, so the warning is only noise.
with np.errstate(divide='ignore', invalid='ignore'):
    f_score = (1 + b ** 2) * (precision * recall) / (b ** 2 * precision + recall)
# precision/recall have one more element than threshold (the final point has
# no associated threshold), so exclude it to keep threshold[idx] in bounds.
idx = np.nanargmax(f_score[:-1])
print(f'b={b}, \
      \nthreshold = {threshold[idx]}, \
      \nprecision = {precision[idx]}, \
      \nrecall = {recall[idx]}, \
      \nf_score = {f_score[idx]}')

In [None]:
threshold[idx]

## Flask

In [8]:
from flask import Flask, request, jsonify

In [4]:
import pandas as pd
import dill

In [5]:
with open('logreg_pipeline.dill', 'rb') as f:
    model = dill.load(f)

In [None]:
X_test = pd.read_csv('X_test.csv')
y_test = pd.read_csv('y_test.csv')

In [6]:
X_test.sample(1)

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,identity_hate
31825,There is a big difference between promotion an...,0,0,0,0,0


In [10]:
app = Flask(__name__)


@app.route('/', methods=['GET'])
def general():
    """Answer GET requests on the root URL with a fixed greeting string."""
    greeting = 'Welcome to prediction process'
    return greeting

@app.route('/predict', methods=['POST'])
def predict():
    """Score one comment with the loaded pipeline and return the result as JSON.

    The POST body must be a JSON object. Recognised keys are
    ``comment_text``, ``toxic``, ``severe_toxic``, ``obscene``, ``threat``
    and ``identity_hate``. A key that is absent or holds a falsy value falls
    back to its default (``''`` for the text, ``0`` for the flags).

    Returns:
        A JSON response containing ``succes`` (spelling kept so existing
        clients keep working) and, on success, ``predictions`` (a list built
        from the second column of ``predict_proba``) and ``comment_text``.
        A body that is valid JSON but not an object yields HTTP 400 with
        ``succes`` left as False and an ``error`` message.
    """
    data = {'succes': False}

    # Fallback value for every model input, in the column order the
    # single-row frame is built with.
    defaults = {
        'comment_text': '',
        'toxic': 0,
        'severe_toxic': 0,
        'obscene': 0,
        'threat': 0,
        'identity_hate': 0,
    }

    request_json = request.get_json()
    if not isinstance(request_json, dict):
        # e.g. a JSON list or null: there are no keys to read features from.
        data['error'] = 'request body must be a JSON object'
        return jsonify(data), 400

    # Store each value under its own name (previously every flag was written
    # into `toxic`), and use .get so a missing key means "use the default"
    # instead of raising KeyError.
    features = {
        name: request_json.get(name) or default
        for name, default in defaults.items()
    }

    frame = pd.DataFrame({name: [value] for name, value in features.items()})
    preds = model.predict_proba(frame)

    # jsonify cannot serialise a NumPy array; convert to a plain list first.
    data['predictions'] = preds[:, 1].tolist()
    data['comment_text'] = features['comment_text']

    data['succes'] = True
    print('OK')

    return jsonify(data)

if __name__ == '__main__':
    app.run()

 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000 (Press CTRL+C to quit)
127.0.0.1 - - [19/Jul/2022 12:33:03] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [19/Jul/2022 12:33:10] "GET /prediction HTTP/1.1" 404 -
127.0.0.1 - - [19/Jul/2022 12:33:17] "GET /predict HTTP/1.1" 405 -
127.0.0.1 - - [19/Jul/2022 12:33:32] "GET /predict HTTP/1.1" 405 -
