In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import RepeatedKFold, ParameterGrid
from tqdm.notebook import tqdm, trange

In [2]:
# Load the labelled training set and preview its first rows.
train_df = pd.read_csv('train.csv')
train_df.head()

Unnamed: 0,id,url,title,target
0,0,m.kp.md,"Экс-министр экономики Молдовы - главе МИДЭИ, ц...",False
1,1,www.kp.by,Эта песня стала известна многим телезрителям б...,False
2,2,fanserials.tv,Банши 4 сезон 2 серия Бремя красоты смотреть о...,False
3,3,colorbox.spb.ru,Не Беси Меня Картинки,False
4,4,tula-sport.ru,В Новомосковске сыграют следж-хоккеисты алекси...,False


In [3]:
# Summarise the training frame: column dtypes, non-null counts, memory usage.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135309 entries, 0 to 135308
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   id      135309 non-null  int64 
 1   url     135309 non-null  object
 2   title   135309 non-null  object
 3   target  135309 non-null  bool  
dtypes: bool(1), int64(1), object(2)
memory usage: 3.2+ MB


In [4]:
# Descriptive statistics for the numeric columns (in this frame only `id`).
train_df.describe()

Unnamed: 0,id
count,135309.0
mean,67654.0
std,39060.488124
min,0.0
25%,33827.0
50%,67654.0
75%,101481.0
max,135308.0


In [5]:
# Load the test set (same columns as train, minus `target`) and preview it.
test_df = pd.read_csv('test.csv')
test_df.head()

Unnamed: 0,id,url,title
0,135309,www.kommersant.ru,Шестой кассационный суд в Самаре начнет работу...
1,135310,urexpert.online,"Что такое индексация алиментов, кем и в каких ..."
2,135311,imperimeha.ru,Женщинам | Империя Меха - Part 12
3,135312,national-porn.com,"Небритые, волосатые киски: Порно всех стран и ..."
4,135313,2gis.ru,67


In [6]:
# The page title is the only feature fed to the model; the boolean label is
# cast to 0/1 integers before being handed to the classifier and metric.
data = train_df['title'].values
target = train_df['target'].astype(int).values

In [7]:
# Raw test titles; they are vectorized later with the vectorizer fitted on
# the full training set.
X_test = test_df['title'].values

In [8]:
from sklearn.linear_model import LogisticRegression

In [9]:
def cross_validation(clf, data, target, n_splits=5, n_repeats=10, random_state=None):
    """Return the mean F1 score of ``clf`` under repeated K-fold cross-validation.

    For every split, a TF-IDF representation is fitted on the training fold
    only and then applied to the held-out fold, so no vocabulary or IDF
    statistics leak from validation documents into training.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place on every fold and is left fitted on the last training fold.
    data : array-like of str
        Raw documents. Must support indexing with an integer index array
        (e.g. a NumPy array), because folds are selected as ``data[idx]``.
    target : array-like
        Labels aligned with ``data``; indexed the same way. Scored with
        ``f1_score`` using its default settings.
    n_splits : int, default=5
        Number of folds per repetition (forwarded to ``RepeatedKFold``).
    n_repeats : int, default=10
        Number of repetitions (forwarded to ``RepeatedKFold``).
    random_state : int, RandomState instance or None, default=None
        Seed forwarded to ``RepeatedKFold``. Pass an integer to make the
        splits, and therefore the returned score, reproducible.

    Returns
    -------
    float
        Mean of the per-fold F1 scores.
    """
    kf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)
    # fit_transform() relearns vocabulary and IDF weights on every call, so a
    # single vectorizer instance can be reused across folds.
    vectorizer = TfidfVectorizer()
    fold_scores = []
    # Ask the splitter how many folds it will yield instead of hard-coding 50,
    # so the progress bar stays correct when n_splits / n_repeats change.
    for train_index, test_index in tqdm(kf.split(data), total=kf.get_n_splits(), leave=False):
        X_train = vectorizer.fit_transform(data[train_index])
        # Named X_valid so it is not confused with the notebook-level X_test.
        X_valid = vectorizer.transform(data[test_index])
        y_train, y_valid = target[train_index], target[test_index]
        clf.fit(X_train, y_train)
        fold_scores.append(f1_score(y_valid, clf.predict(X_valid)))
    return np.mean(np.asarray(fold_scores))

In [10]:
# Logistic regression on the TF-IDF features, solved with liblinear in its
# dual formulation; C and max_iter are the values used for the reported score.
clf = LogisticRegression(
    C=100,
    solver='liblinear',
    dual=True,
    max_iter=1000,
)
# Last expression of the cell: the mean cross-validated F1 score is displayed.
cross_validation(clf=clf, data=data, target=target)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

0.951321098460759

In [11]:
# Refit the TF-IDF vocabulary and the classifier on the full training set.
vectorizer = TfidfVectorizer()
X_train, y_train = vectorizer.fit_transform(data), target
clf.fit(X_train, y_train)

# Vectorize the test titles with the training vocabulary, predict, and store
# the labels as booleans to match the dtype of the training `target` column.
test_predictions = clf.predict(vectorizer.transform(X_test))
test_df['target'] = test_predictions.astype(bool)

# Write the submission file with only the id and predicted label.
test_df[['id', 'target']].to_csv("predictions.csv", index=False)