In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import f1_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [3]:
data = pd.read_csv("train.csv")
X_test_competition = pd.read_csv("test.csv")

In [4]:
data

Unnamed: 0,ID,url,title,label
0,0,m.kp.md,"Экс-министр экономики Молдовы - главе МИДЭИ, ц...",0
1,1,www.kp.by,Эта песня стала известна многим телезрителям б...,0
2,2,fanserials.tv,Банши 4 сезон 2 серия Бремя красоты смотреть о...,0
3,3,colorbox.spb.ru,Не Беси Меня Картинки,0
4,4,tula-sport.ru,В Новомосковске сыграют следж-хоккеисты алекси...,0
...,...,...,...,...
135304,135304,mail.ru,пора тюльпанов турецкий сериал на русском язык...,0
135305,135305,www.ntv.ru,Остросюжетный сериал «Шеф. Игра на повышение»....,0
135306,135306,topclassiccarsforsale.com,"1941 Plymouth Special Deluxe Hot Rod, Automati...",0
135307,135307,wowcream.ru,Купить It's Skin Сыворотка питательная Power 1...,0


In [5]:
X = data[['url', 'title']]
y = data['label']

In [6]:
imputer = SimpleImputer(strategy='constant', fill_value='')
X_imputed = imputer.fit_transform(X)

In [7]:
vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=5, max_df=0.8, sublinear_tf=True)
X_vectorized = vectorizer.fit_transform(X_imputed[:, 0] + ' ' + X_imputed[:, 1])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42, stratify=y)

In [9]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC())
])

In [10]:
param_grid = {
    'svm__C': [0.1, 1, 10],
    'svm__kernel': ['linear', 'rbf'],
    'svm__gamma': ['scale', 'auto']
}


In [16]:
X_train

<108247x73767 sparse matrix of type '<class 'numpy.float64'>'
	with 1638456 stored elements in Compressed Sparse Row format>

In [17]:
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train, y_train, with_mean=False)

60 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "/raid/alebedev/myenvtr/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/raid/alebedev/myenvtr/lib/python3.7/site-packages/sklearn/pipeline.py", line 389, in fit
    fit_params_steps = self._check_fit_params(**fit_params)
  File "/raid/alebedev/myenvtr/lib/python3.7/site-packages/sklearn/pipeline.py", line 305, in _check_fit_params
    "=sample_weight)`.".format(pname)
ValueError: Pipeline.fit does not accept the with_mean parameter. You can pass parameters to specifi

ValueError: Pipeline.fit does not accept the with_mean parameter. You can pass parameters to specific steps of your pipeline using the stepname__parameter format, e.g. `Pipeline.fit(X, y, logisticregression__sample_weight=sample_weight)`.

In [None]:


# Get the best classifier from the grid search
best_classifier = grid_search.best_estimator_

# Make predictions on the test set using the best classifier
y_pred = best_classifier.predict(X_test)

# Evaluate the classifier's performance using F1-score and classification report
f1 = f1_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Best Hyperparameters: {grid_search.best_params_}")
print(f"F1-score: {f1:.2f}")
print("Classification Report:")
print(report)