In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from collections import Counter

from sklearn.model_selection import KFold
from sklearn.metrics import (
    confusion_matrix, mean_squared_error, roc_auc_score
)
from sklearn.model_selection import GridSearchCV, cross_val_score

import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier, DMatrix

from catboost import CatBoostRegressor, Pool, CatBoostClassifier
from nltk.corpus import stopwords 


In [2]:
# Load the training and test tables from CSV files in the working directory.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [3]:
# Peek at the first five rows to see the text fields and the 0/1 label columns.
train_df.head(n=5)

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [4]:
# Label columns = every column except the identifier/text columns and the
# word-count helper columns that a later cell adds to train_df.
# Dropping by name (instead of slicing by position) keeps `labels` correct
# even when this cell is re-run after those helper columns already exist;
# errors='ignore' lets the same line work before they have been created.
non_label_columns = ['ID', 'TITLE', 'ABSTRACT', 'abstract_words', 'title_words']
labels = train_df.columns.drop(non_label_columns, errors='ignore')
labels

Index(['Computer Science', 'Physics', 'Mathematics', 'Statistics',
       'Quantitative Biology', 'Quantitative Finance'],
      dtype='object')

In [5]:
# Show the dtype of every column (text columns appear as object, label columns as integers).
train_df.dtypes

ID                       int64
TITLE                   object
ABSTRACT                object
Computer Science         int64
Physics                  int64
Mathematics              int64
Statistics               int64
Quantitative Biology     int64
Quantitative Finance     int64
dtype: object

In [6]:
# True if at least one cell anywhere in the training table is missing.
train_df.isna().to_numpy().any()

False

**Вывод.** В данных нет пропусков.

Посмотрим на доли классов и кол-во данных.  

In [7]:
# Summary statistics of the numeric columns; for a 0/1 label column the mean
# is the share of rows that carry that label.
train_df.describe()

Unnamed: 0,ID,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
count,20972.0,20972.0,20972.0,20972.0,20972.0,20972.0,20972.0
mean,10486.5,0.409784,0.286716,0.267881,0.248236,0.02799,0.011873
std,6054.239259,0.491806,0.452238,0.442866,0.432,0.164947,0.108317
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5243.75,0.0,0.0,0.0,0.0,0.0,0.0
50%,10486.5,0.0,0.0,0.0,0.0,0.0,0.0
75%,15729.25,1.0,1.0,1.0,0.0,0.0,0.0
max,20972.0,1.0,1.0,1.0,1.0,1.0,1.0


**Вывод.** Видим, что объектов классов `Quantitative Biology` и `Quantitative Finance` на порядок меньше, чем объектов остальных классов.

Посмотрим на кол-во слов в `abstract, title`.

In [8]:
def _n_words(text):
    """Return the number of whitespace-separated tokens in ``str(text)``."""
    return len(str(text).split())


# Attach a word-count column to train_df for each of the two text fields.
for _text_col, _count_col in (('ABSTRACT', 'abstract_words'),
                              ('TITLE', 'title_words')):
    train_df[_count_col] = train_df[_text_col].apply(_n_words)

In [26]:
# Summary statistics again, now including the abstract_words / title_words
# columns added to train_df in the previous cell.
train_df.describe()

Unnamed: 0,ID,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance,abstract_words,title_words
count,20972.0,20972.0,20972.0,20972.0,20972.0,20972.0,20972.0,20972.0,20972.0
mean,10486.5,0.409784,0.286716,0.267881,0.248236,0.02799,0.011873,148.404873,9.514972
std,6054.239259,0.491806,0.452238,0.442866,0.432,0.164947,0.108317,60.781306,3.611493
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
25%,5243.75,0.0,0.0,0.0,0.0,0.0,0.0,104.0,7.0
50%,10486.5,0.0,0.0,0.0,0.0,0.0,0.0,145.0,9.0
75%,15729.25,1.0,1.0,1.0,0.0,0.0,0.0,190.0,12.0
max,20972.0,1.0,1.0,1.0,1.0,1.0,1.0,449.0,40.0


In [10]:
# One-time setup, intentionally left commented out: uncomment and run these
# two lines if the NLTK stop-word corpus is not available locally.
# import nltk
# nltk.download('stopwords')

Хотим сопоставить заголовку и абстракту частотную характеристику входящих в них слов. Для этого воспользуемся `CountVectorizer`. Причём будем выбрасывать стоп-слова.

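После этого воспользуемся CatBoost, предварительно представив таргет как одномерный вектор номеров классов, а не как набор бинарных колонок. Замечание: у некоторых статей отмечено сразу несколько классов, а `argmax` оставляет только первый из них.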

In [11]:
# English stop words from NLTK, copied into a fresh list that is later
# handed to CountVectorizer; the cell displays how many there are.
english_stopwords = stopwords.words('english')
stop_words = [word for word in english_stopwords]
len(stop_words)

179

In [12]:
# Hold out a validation subset of the abstracts together with the matching
# label rows. The split proportions are the library defaults; the seed is
# fixed so the split is reproducible.
abstract_texts = train_df['ABSTRACT']
label_matrix = train_df[labels]
X_train, X_val, y_train, y_val = train_test_split(
    abstract_texts,
    label_matrix,
    random_state=42,
)

In [31]:
# Bag-of-words counts for the abstracts. The vocabulary is learned from the
# training split only and then re-used for the validation split; stop words
# are removed and min_df=0.001 filters out very rare terms.
vectorizer = CountVectorizer(stop_words=stop_words, min_df=0.001)

train_counts = vectorizer.fit_transform(X_train)
val_counts = vectorizer.transform(X_val)

# Densify the sparse count matrices for the model below.
X_train_transform = train_counts.toarray()
X_val_transform = val_counts.toarray()

In [32]:
# Size of the fitted vocabulary (= number of feature columns).
# `vocabulary_` is used instead of `get_feature_names()`, which was deprecated
# in scikit-learn 1.0 and removed in 1.2; the count is identical.
len(vectorizer.vocabulary_)

6986

In [33]:
# Convert the 0/1 label matrix into a single class id per row.
# NOTE(review): some rows carry more than one label (e.g. both
# 'Computer Science' and 'Statistics'); argmax keeps only the first column
# that is set, so the remaining labels of such rows are discarded here.
train_targets = np.argmax(y_train.to_numpy(), axis=1)
train_dataset = Pool(data=X_train_transform, label=train_targets)

# Multi-class CatBoost model with a fixed seed and training logs silenced.
model = CatBoostClassifier(
    eval_metric='MultiClass',
    random_state=42,
    silent=True,
)

model.fit(train_dataset)

<catboost.core.CatBoostClassifier at 0x7f79bf053f40>

In [34]:
# Score the abstract-based model on the validation split.
# The reference classes are derived with the same argmax rule that was used
# for the training targets. For single-label predictions, micro-averaged F1
# coincides with plain accuracy.
val_pred = model.predict(X_val_transform)
cb_predictions = np.squeeze(val_pred)
actuals = y_val.to_numpy().argmax(axis=1)
f1_score(y_true=actuals, y_pred=cb_predictions, average='micro')

0.7932481403776463

**Вывод.** Кажется, получился достаточно неплохой скор. 

Посмотрим, как отработает модель на `TITLE`.

In [17]:
# Repeat the experiment using the TITLE text instead of the abstract.
# NOTE(review): despite the `_abs` suffix, every `*_abs` variable in this cell
# holds TITLE data; the names are kept because later cells refer to them.
# `y_train` / `y_val` are rebound here, using the same seed as the abstract split.
title_texts = train_df['TITLE']
X_train_abs, X_val_abs, y_train, y_val = train_test_split(
    title_texts,
    train_df[labels],
    random_state=42,
)

# Vocabulary is learned from the training titles only.
vectorizer_abs = CountVectorizer(stop_words=stop_words, min_df=0.001)
title_train_counts = vectorizer_abs.fit_transform(X_train_abs)
title_val_counts = vectorizer_abs.transform(X_val_abs)
X_train_abs_transform = title_train_counts.toarray()
X_val_abs_transform = title_val_counts.toarray()

In [18]:
# Size of the vocabulary fitted on the titles (= number of feature columns).
# `vocabulary_` is used instead of `get_feature_names()`, which was deprecated
# in scikit-learn 1.0 and removed in 1.2; the count is identical.
len(vectorizer_abs.vocabulary_)

1473

In [19]:
# Title-based model: same target encoding (argmax over the label columns)
# and the same CatBoost settings as the abstract-based model.
title_train_targets = np.argmax(y_train.to_numpy(), axis=1)
train_dataset_abs = Pool(data=X_train_abs_transform, label=title_train_targets)

model_abs = CatBoostClassifier(
    eval_metric='MultiClass',
    random_state=42,
    silent=True,
)

model_abs.fit(train_dataset_abs)

<catboost.core.CatBoostClassifier at 0x7f79bec3bd90>

In [20]:
# Score the title-based model on the validation split with the same metric
# as the abstract-based model, so the two numbers are directly comparable.
title_val_pred = model_abs.predict(X_val_abs_transform)
cb_predictions_abs = np.squeeze(title_val_pred)
actuals = y_val.to_numpy().argmax(axis=1)
f1_score(y_true=actuals, y_pred=cb_predictions_abs, average='micro')

0.7030326149151249

**Вывод.** Модель дала меньший скор.

# Что надо бы сделать, но я не сделал. 
* Стоит объединить данные `ABSTRACT, TITLE` и воспользоваться двумя колонками сразу. 
* Стоит перебрать гиперпараметры CatBoost-а и `CountVectorizer`. 
* Можно было бы сделать две логистических регрессии на `ABSTRACT` и `TITLE`. А потом по сетке подобрать коэффициент для бэггинга их вероятностей. 
* Можно просто взять случайный лес, XGB, CatBoost, нейронную сеть и сделать бэггинг для них. 
* Взять NLP алгоритм для решения этой задачи. 

In [21]:
# Encode the test abstracts with the vocabulary already fitted on the
# training abstracts (transform only, no refit), then densify.
test_counts = vectorizer.transform(test_df['ABSTRACT'])
X_test = test_counts.toarray()

In [22]:
# Predict one class id per test row with the abstract-based model.
# This rebinds `cb_predictions`, which earlier held the validation predictions.
cb_predictions = np.squeeze(model.predict(X_test))

In [23]:
# Wrap the predicted class ids in a one-column frame named 'label'.
answers = pd.Series(cb_predictions, name='label').to_frame()

In [24]:
# One-hot encode the predicted class ids into one column per label.
# get_dummies only emits columns for class ids that actually occur, so a
# rare class that is never predicted would leave fewer than len(labels)
# columns and break (or mislabel) the column assignment below. Reindexing
# over every class id guarantees one column per label, filled with 0 where
# the class was never predicted. The int cast keeps 0/1 values on pandas
# versions whose get_dummies returns booleans.
answers = (
    pd.get_dummies(answers['label'])
    .reindex(columns=range(len(labels)), fill_value=0)
    .astype(int)
)
answers.columns = labels
answers['ID'] = test_df['ID']
answers

Unnamed: 0,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance,ID
0,1,0,0,0,0,0,20973
1,0,1,0,0,0,0,20974
2,1,0,0,0,0,0,20975
3,0,1,0,0,0,0,20976
4,1,0,0,0,0,0,20977
...,...,...,...,...,...,...,...
8984,1,0,0,0,0,0,29957
8985,0,0,1,0,0,0,29958
8986,1,0,0,0,0,0,29959
8987,0,0,0,1,0,0,29960


In [25]:
# Write the submission file. index=False keeps the positional row index out
# of the CSV (it would otherwise appear as an extra unnamed first column);
# the 'ID' column already identifies each row.
answers.to_csv('answers.csv', index=False)