# Импорты

In [55]:
import warnings

import pandas as pd

# NOTE(review): the wildcard import stays ahead of the explicit imports below so
# that none of the explicitly imported names can be shadowed by it.
from feature_extraction import *

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
# f1_score is called by test_train_estimator but was never imported explicitly.
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings raised by the libraries used
# below are hidden as well - confirm that is intended.
warnings.simplefilter("ignore")

# Преобразовываем датасет

In [2]:
def vectorize_text(df: pd.DataFrame, min_df: float = 0.005) -> tuple:
    """Turn the message table into a dense TF-IDF matrix plus numeric labels.

    Parameters
    ----------
    df : pd.DataFrame
        Table with a ``class`` column holding the label and the message text
        in its second column (position 1).
    min_df : float or int, default 0.005
        Forwarded unchanged to ``TfidfVectorizer(min_df=...)``. The default
        keeps the cut-off this function always used.

    Returns
    -------
    tuple
        ``(X, y, columns)``: the dense TF-IDF array, the label vector
        (``1`` for ``'ham'``, ``-1`` for every other value) and the feature
        names reported by the vectorizer.
    """
    # The lambda argument is named `label` so it does not shadow the result vector.
    labels = df['class'].apply(lambda label: 1 if label == 'ham' else -1).to_numpy()
    vectorizer = TfidfVectorizer(min_df=min_df)
    # The text is taken by position (second column), not by name.
    features = vectorizer.fit_transform(df.iloc[:, 1]).toarray()
    feature_names = vectorizer.get_feature_names_out()
    return features, labels, feature_names


# Read the tab-separated SMS file, then derive the TF-IDF matrix, the labels
# and the feature names from it.
df = pd.read_csv(filepath_or_buffer='data/SMS.tsv', sep='\t')
X, y, columns = vectorize_text(df)

# Фильтрующий метод отбора признаков

## Моя реализация

In [3]:
# Run the project's filter-style selector (constructed with the argument 30).
filter_method = FilterFeatureSelector(30)
# NOTE(review): the returned scores are not read again in the visible cells; the
# call presumably also stores state that get_n_features() relies on - confirm
# before removing or reordering it.
mutual_infos = filter_method.mutual_information(X, y)
# Ask the selector for the chosen features, passing the feature names.
my_filter_features = filter_method.get_n_features(columns)
print("Выбранные признаки:\n", " ".join(my_filter_features))

## Sklearn реализация

In [4]:
# Baseline: scikit-learn's univariate filter keeping the 30 best-scoring features.
# NOTE(review): SelectKBest uses its default scoring function here, while the
# custom selector is driven by mutual_information() - confirm the two are meant
# to be compared as-is.
selector = SelectKBest(k=30)
# The targets are flattened to a 1-D vector: wrapping them in a DataFrame yields
# an (n, 1) column that scikit-learn has to convert back, which is only quiet
# because warnings are globally ignored. Going through DataFrame first keeps the
# cell working whether `y` is currently an array or already a one-column frame.
selector.fit(pd.DataFrame(X, columns=columns), pd.DataFrame(y).to_numpy().ravel())
sklearn_filter_features = selector.get_feature_names_out()
print("Выбранные признаки:\n", " ".join(sklearn_filter_features))

## Сравнение

In [5]:
# Number of features picked by both the custom and the scikit-learn filter selector.
print("Количество совпадений = ", len(set(my_filter_features) & set(sklearn_filter_features)))

# Встроенный метод

## Моя реализация

In [35]:
# Wrap the feature matrix (columns labelled with the feature names) and the
# targets in DataFrames; later cells select features by column name.
# From here on `y` is a one-column frame rather than a flat vector.
X = pd.DataFrame(X, columns=columns)
y = pd.DataFrame(y)

In [36]:
# Run the project's embedded selector, configured with n_features=30.
my_embedded_selector = EmbeddedFeatureSelector(n_features=30)
my_embedded_selector.fit(X, y)
# The chosen feature names are read from the fitted selector's
# `selected_features_` attribute.
my_embedded_features = my_embedded_selector.selected_features_
print("Выбранные признаки:\n", " ".join(my_embedded_features))

## Sklearn реализация

In [37]:
# Baseline: rank features by the importances of a decision tree and keep the top 30.
# - threshold=-inf switches off the default importance cut-off, so the selection
#   is driven by max_features alone; otherwise 30 is only an upper bound and
#   fewer features may come back than the custom selector returns.
# - random_state pins the tree so the selected set is identical on every run.
embedded_selector = SelectFromModel(
    estimator=DecisionTreeClassifier(random_state=42),
    threshold=-float("inf"),
    max_features=30,
)
embedded_selector.fit(X, y)
sklearn_embedded_features = embedded_selector.get_feature_names_out()
print("Выбранные признаки:\n", " ".join(sklearn_embedded_features))

## Сравнение

In [38]:
# Number of features picked by both the custom and the scikit-learn embedded selector.
print("Количество совпадений = ", len(set(my_embedded_features) & set(sklearn_embedded_features)))

# Метод-обёртка

## Моя реализация

In [39]:
# Run the project's wrapper-style selector; 30 is passed as the third argument
# of select_features (presumably the number of features to keep - confirm).
wrapper_selector = WrapperFeatureSelector()
my_wrapper_features = wrapper_selector.select_features(X, y, 30)
print("Выбранные признаки:\n", " ".join(my_wrapper_features))

## Sklearn реализация

In [40]:
# Baseline: recursive feature elimination around a linear SVM, pruned down to
# 30 features.
svc_estimator = LinearSVC(dual='auto')
selector = RFE(estimator=svc_estimator, n_features_to_select=30)
# The classifier expects a flat target vector. The one-column frame is flattened
# explicitly instead of relying on scikit-learn's implicit conversion, whose
# warning is only hidden by the global warning filter.
selector.fit(X, pd.DataFrame(y).to_numpy().ravel())
sklearn_wrapper_features = selector.get_feature_names_out()
print("Выбранные признаки:\n", " ".join(sklearn_wrapper_features))

## Сравнение

In [41]:
# Number of features picked by both the custom and the scikit-learn wrapper selector.
print("Количество совпадений = ", len(set(my_wrapper_features) & set(sklearn_wrapper_features)))

# Обучение 3-х классификаторов на новом признаковом пространстве

In [44]:
# Feature subsets produced by the three custom selectors, keyed by method name.
selected_features = {'embedded': my_embedded_features, 'filter': my_filter_features,
                     'wrapper': my_wrapper_features}
# 70/30 hold-out split. The fixed seed makes the reported scores reproducible
# across runs, and stratifying on the labels keeps the class ratio the same in
# both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42, stratify=y
)

In [57]:
def test_train_estimator(cls, model_name, **estimator_params):
    """Print test-split F1 for one classifier on all features and on each selected subset.

    Reads the module-level ``X_train``, ``X_test``, ``y_train``, ``y_test`` and
    ``selected_features``; nothing is returned.

    Parameters
    ----------
    cls : callable
        Estimator class (or factory) that yields an object with ``fit`` and
        ``predict``.
    model_name : str
        Heading printed before the scores.
    **estimator_params
        Optional keyword arguments forwarded to ``cls`` (for example
        ``random_state``). With none given the estimator is built exactly as
        before, via ``cls()``.
    """
    # Estimators expect flat target vectors; the split targets are one-column
    # frames, so flatten them once up front.
    train_target = pd.DataFrame(y_train).to_numpy().ravel()
    test_target = pd.DataFrame(y_test).to_numpy().ravel()

    def fit_and_score(train_part, test_part):
        # A fresh estimator per run, so no fitted state can carry over between
        # the feature subsets.
        model = cls(**estimator_params)
        model.fit(train_part, train_target)
        return f1_score(test_target, model.predict(test_part))

    print(model_name)
    print('Before selection: ', fit_and_score(X_train, X_test))
    for method, features in selected_features.items():
        subset_score = fit_and_score(X_train[features], X_test[features])
        print(f'After {method} selection: {subset_score}')

In [58]:
# Score a decision tree on the full feature set and on every selected subset.
test_train_estimator(cls=DecisionTreeClassifier, model_name="DECISION TREE")

In [59]:
# Score gradient boosting on the full feature set and on every selected subset.
test_train_estimator(cls=GradientBoostingClassifier, model_name="GRADIENT BOOSTING")

In [60]:
# Score k-nearest neighbours on the full feature set and on every selected subset.
test_train_estimator(cls=KNeighborsClassifier, model_name="KNN")