## Подключение библиотек<a class="anchor" id="load_libs"></a>

**Используемые библиотеки**

In [1]:
# !pip install xgboost
# !pip install lightgbm
# !pip install catboost
# !pip install imbalanced-learn
# !pip install wordcloud
# !pip install pymorphy2
# !pip install pyaspeller

In [2]:
# Libraries for working with paths and saving
from pathlib import Path
# import os
import dill


# Scientific libraries
import numpy as np
import pandas as pd
import re

# Visual libraries
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
# Global default font size for all figures.
matplotlib.rcParams.update({'font.size': 14})
# The bundled seaborn style sheets were renamed to 'seaborn-v0_8*' in
# matplotlib 3.6 and the old 'seaborn' name was removed in 3.8, so use
# whichever name this installation actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Scikit-learn libraries
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import classification_report, roc_auc_score,roc_curve
from sklearn.model_selection import cross_val_score, GridSearchCV, learning_curve
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn.base import BaseEstimator, TransformerMixin

import pandarallel



# Language libraries
from wordcloud import WordCloud, STOPWORDS
import pymorphy2
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
# nltk.download('stopwords')
from nltk.corpus import stopwords
import string
from pyaspeller import YandexSpeller

import xgboost as xgb
import catboost as catb
import inspect

In [3]:
# print(f'Версия xgboost: {xgb.__version__}')
# !python --version

In [4]:
import warnings
warnings.filterwarnings('ignore')

## Используемые константы<a class="anchor" id="used_constants"></a>

In [5]:
# Constants shared across the notebook

# Seed passed to the train/test split and to the classifier.
RANDOM_STATE: int = 42

## Используемые функции<a class="anchor" id="used_functions"></a>

## Загрузка исходных данных<a class="anchor" id="load_data"></a>

### Пути к директориям и файлам<a class="anchor" id="path_data"></a>

In [6]:
# All locations are relative paths, so they resolve against the working
# directory the notebook is run from.
DATA_ROOT = Path('../')

# Input: tab-separated train / test datasets.
TRAIN_DATASET_PATH = DATA_ROOT / '1_input_data/train.tsv'
TEST_DATASET_PATH = DATA_ROOT / '1_input_data/test.tsv'

# Output: directory for serialized models and the default model file in it.
MODELS_PATH = Path('../2_models/')
MODEL_FILE_PATH = MODELS_PATH / 'model.pkl'

### Загрузка данных<a class="anchor" id="load_dataset"></a>

**Описание базового датасета**

* **title** - заголовок новости
* **is_fake** - метка: 0 – новость реальная; 1 – новость выдуманная

In [7]:
# Read the tab-separated training file and preview its first rows.
df_train = pd.read_csv(TRAIN_DATASET_PATH, sep='\t')
df_train.head()

Unnamed: 0,title,is_fake
0,Москвичу Владимиру Клутину пришёл счёт за вмеш...,1
1,Агент Кокорина назвал езду по встречке житейск...,0
2,Госдума рассмотрит возможность введения секрет...,1
3,ФАС заблокировала поставку скоростных трамваев...,0
4,Против Навального завели дело о недоносительст...,1


In [8]:
# Sanity check: report the (rows, columns) shape of the loaded training frame.
print('Тренировочный датасет df_train:', df_train.shape)

Тренировочный датасет df_train: (5758, 2)


In [9]:
# Hold out 10% of the rows for evaluation; the seed makes the split repeatable.
# NOTE(review): the whole frame is passed as X, so X_train / X_test (and the
# CSV files written below) still contain the 'is_fake' label column.
X_train, X_test, y_train, y_test = train_test_split(
    df_train,
    df_train['is_fake'],
    test_size=0.1,
    random_state=RANDOM_STATE,
)

# Persist every part of the split (test first, then train) without the index.
for csv_name, split_part in (
    ("X_test.csv", X_test),
    ("y_test.csv", y_test),
    ("X_train.csv", X_train),
    ("y_train.csv", y_train),
):
    split_part.to_csv(csv_name, index=None)

In [10]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that narrows the input down to one column.

    The column label is given as ``key``; ``transform`` returns ``X[key]`` so
    that later steps operate on that column only.
    """

    def __init__(self, key):
        # Label of the column to extract.
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return the entry of ``X`` stored under ``self.key``."""
        return X[self.key]
    
class TextImputer(BaseEstimator, TransformerMixin):
    """
    Transformer that replaces missing values in one column with a constant.

    Parameters
    ----------
    key :
        Label of the column to impute.
    value :
        Replacement used for every missing entry of that column.
    """
    def __init__(self, key, value):
        self.key = key
        self.value = value

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """
        Return a copy of ``X`` whose ``key`` column has missing values filled.

        The input frame is left untouched: assigning into the caller's object
        would silently change data outside the pipeline (and, when ``X`` is a
        slice of another frame, trigger pandas' chained-assignment warning).
        """
        X_filled = X.copy()
        X_filled[self.key] = X_filled[self.key].fillna(self.value)
        return X_filled

In [11]:
# Column holding the input text and column holding the 0/1 label.
features, target = 'title', 'is_fake'

### Feature engineering

In [12]:
# Text branch: fill missing titles -> pick the 'title' column -> TF-IDF.
title = Pipeline(steps=[
    ('imputer', TextImputer(key='title', value='')),
    ('selector', ColumnSelector(key='title')),
    # max_df is a fraction (skip terms in more than 90% of titles),
    # min_df is a count (skip terms in fewer than 10 titles).
    ('tfidf', TfidfVectorizer(min_df=10, max_df=0.9)),
])

# Union with a single branch.
feats = FeatureUnion(transformer_list=[('title', title)])

In [13]:
%%time

# Full model: feature extraction followed by a seeded logistic regression.
classifier = LogisticRegression(random_state=RANDOM_STATE)
pipeline = Pipeline(steps=[
    ('features', feats),
    ('classifier', classifier),
])

# Fit on the training part of the split; the fitted pipeline is the cell output.
pipeline.fit(X_train, y_train)

CPU times: user 125 ms, sys: 3.62 ms, total: 128 ms
Wall time: 139 ms


Сохраним модель (пайплайн)

In [14]:
# Serialize the fitted pipeline with dill.
# NOTE(review): the relative file name puts the file in the working directory,
# not under MODELS_PATH — confirm this is intended.
with open("logreg_pipeline.dill", mode="wb") as model_file:
    dill.dump(pipeline, model_file)