# Импорты

In [1]:
import warnings

import pandas as pd

# The wildcard import is kept ahead of the explicit scikit-learn imports so that,
# exactly as before, explicitly imported names take precedence over any
# same-named export of feature_extraction.
from feature_extraction import *
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings("ignore")

# Преобразуем датасет

In [3]:
def vectorize_text(df: pd.DataFrame, min_df=0.005) -> tuple:
    """Convert the message table into a TF-IDF matrix with numeric labels.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a 'class' column; the text to vectorize is read from
        the column at position 1.
    min_df : float or int, default 0.005
        Forwarded to ``TfidfVectorizer(min_df=...)``.

    Returns
    -------
    tuple
        ``(X, y, columns)`` where ``X`` is the dense TF-IDF array, ``y`` is a
        NumPy array holding 1 for 'ham' rows and -1 for every other label, and
        ``columns`` are the vocabulary terms returned by the vectorizer.
    """
    # The lambda argument is named `label` so it no longer shadows the result `y`.
    y = df['class'].apply(lambda label: 1 if label == 'ham' else -1).to_numpy()
    vectorizer = TfidfVectorizer(min_df=min_df)
    # .toarray() densifies the sparse TF-IDF output.
    X = vectorizer.fit_transform(df.iloc[:, 1]).toarray()
    columns = vectorizer.get_feature_names_out()
    return X, y, columns


# Read the tab-separated SMS corpus, then build the TF-IDF matrix, the label
# vector and the vocabulary from it.
df = pd.read_csv('data/SMS.tsv', delimiter='\t')
X, y, columns = vectorize_text(df)

# Фильтрующий метод отбора признаков

## Моя реализация

In [4]:
# Custom filter-style selection. FilterFeatureSelector is presumably provided by
# the wildcard import of feature_extraction; the argument 30 is presumably the
# number of features to keep — TODO confirm against that module.
filter_method = FilterFeatureSelector(30)
# The returned scores are not read again in the visible cells. The call is kept
# because it presumably prepares the state that get_n_features relies on — confirm
# before removing it or reordering these statements.
mutual_infos = filter_method.mutual_information(X, y)
my_filter_features = filter_method.get_n_features(columns)
# The selected feature names are joined with spaces, so they must be strings.
print("Выбранные признаки:\n", " ".join(my_filter_features))

Выбранные признаки:
 my me it in but that you so not can how ok at gt lt do ll and when come got what if up then its good he like was


## Sklearn реализация

In [5]:
# Filter selection with scikit-learn, keeping the 30 best-scoring features.
# NOTE(review): SelectKBest scores features with f_classif by default, whereas the
# custom selector is driven by a method named mutual_information; pass
# score_func=mutual_info_classif if a like-for-like comparison is intended.
selector = SelectKBest(k=30)
# X is wrapped in a DataFrame so get_feature_names_out() yields vocabulary terms.
# The labels are passed as they are instead of being wrapped into a one-column
# frame, which scikit-learn would have to flatten again (with a warning).
selector.fit(pd.DataFrame(X, columns=columns), y)
sklearn_filter_features = selector.get_feature_names_out()
print("Выбранные признаки:\n", " ".join(sklearn_filter_features))

Выбранные признаки:
 1000 150p 16 18 50 500 awarded call cash claim co contact cs free guaranteed mobile nokia prize reply service stop text to tone txt uk urgent win won www


## Сравнение

In [6]:
# Number of features chosen by both the custom and the scikit-learn filter method.
print("Количество совпадений = ", len(set(my_filter_features) & set(sklearn_filter_features)))

Количество совпадений =  0


# Встроенный метод

## Моя реализация

In [7]:
# Re-bind the feature matrix and the labels as DataFrames; X gets the vocabulary
# terms as column names so later cells can select columns by name.
X = pd.DataFrame(X, columns=columns)
y = pd.DataFrame(y)

In [8]:
# Custom embedded selection. EmbeddedFeatureSelector is presumably provided by the
# wildcard import of feature_extraction; n_features=30 presumably caps the number
# of selected features — TODO confirm against that module.
my_embedded_selector = EmbeddedFeatureSelector(n_features=30)
my_embedded_selector.fit(X, y)
# selected_features_ is read only after fit(); its items are joined as strings below.
my_embedded_features = my_embedded_selector.selected_features_
print("Выбранные признаки:\n", " ".join(my_embedded_features))

Выбранные признаки:
 from won code cash contact urgent co 500 18 16 com now 50 or win text your service reply to stop uk 150p claim www prize mobile free txt call


## Sklearn реализация

In [9]:
# Embedded selection with scikit-learn, ranking features by decision-tree importances.
# random_state pins the tree so the selected set is the same on every re-run.
# NOTE(review): with the default threshold SelectFromModel still drops features whose
# importance is below the mean, so max_features=30 is an upper bound; pass
# threshold=-np.inf to select purely by rank.
embedded_selector = SelectFromModel(estimator=DecisionTreeClassifier(random_state=42), max_features=30)
embedded_selector.fit(X, y)
sklearn_embedded_features = embedded_selector.get_feature_names_out()
print("Выбранные признаки:\n", " ".join(sklearn_embedded_features))

Выбранные признаки:
 150p 18 call chat claim free from get gt have in me min mobile my nokia now real ringtone service shows sms stop to txt uk win www you your


## Сравнение

In [10]:
# Number of features chosen by both the custom and the scikit-learn embedded method.
print("Количество совпадений = ", len(set(my_embedded_features) & set(sklearn_embedded_features)))

Количество совпадений =  16


# Метод-обёртка

## Моя реализация

In [11]:
# Custom wrapper-style selection. WrapperFeatureSelector is presumably provided by
# the wildcard import of feature_extraction; the third argument (30) is presumably
# the number of features to return — TODO confirm against that module.
wrapper_selector = WrapperFeatureSelector()
my_wrapper_features = wrapper_selector.select_features(X, y, 30)
# The selected feature names are joined with spaces, so they must be strings.
print("Выбранные признаки:\n", " ".join(my_wrapper_features))

Выбранные признаки:
 txt claim mobile www service 150p 16 50 video landline uk rate code orange ringtone my prize free 500 at line 18 text apply something cool network pa pay every


## Sklearn реализация

In [12]:
# Wrapper selection with scikit-learn: recursive feature elimination around a linear SVM.
svc_estimator = LinearSVC(dual='auto')
selector = RFE(estimator=svc_estimator, n_features_to_select=30)
# Flatten the labels to a 1-D vector before fitting, rather than handing the
# estimator a single-column frame.
selector.fit(X, pd.DataFrame(y).to_numpy().ravel())
sklearn_wrapper_features = selector.get_feature_names_out()
print("Выбранные признаки:\n", " ".join(sklearn_wrapper_features))

Выбранные признаки:
 100 1000 150p 18 50 500 apply awarded chat claim code landline latest lt mobile nokia orange per prize rate reply ringtone service text tone txt uk urgent video www


## Сравнение

In [13]:
# Number of features chosen by both the custom and the scikit-learn wrapper method.
print("Количество совпадений = ", len(set(my_wrapper_features) & set(sklearn_wrapper_features)))

Количество совпадений =  19


# Обучение 3-х классификаторов на новом признаковом пространстве

In [15]:
# Feature subsets produced by the three custom selectors, keyed by method name.
selected_features = {'embedded': my_embedded_features, 'filter': my_filter_features,
                     'wrapper': my_wrapper_features}
# A fixed seed makes the 70/30 split, and every score computed from it, reproducible.
# NOTE(review): the selectors were fitted on the full X, y before this split, so the
# test rows influenced which features were chosen; for an unbiased estimate split
# first and run the selection on the training part only.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [16]:
def test_train_estimator(cls, model_name, **estimator_params):
    """Print hold-out F1 for a classifier before and after each feature selection.

    Parameters
    ----------
    cls : callable
        Estimator class; instantiated as ``cls(**estimator_params)`` and expected
        to expose ``fit`` and ``predict``.
    model_name : str
        Heading printed before the scores.
    **estimator_params
        Optional constructor arguments (for example ``random_state``). None are
        passed by default, which matches the previous behaviour.

    Notes
    -----
    Reads the notebook-level ``X_train``, ``X_test``, ``y_train``, ``y_test`` and
    ``selected_features``. Prints the scores and returns None.
    f1_score uses ``pos_label=1`` by default, which is the 'ham' label under the
    encoding applied in vectorize_text; pass ``pos_label=-1`` to f1_score if the
    spam class is the one of interest.
    """
    # Flatten the labels once: the split yields them in column form, while
    # estimators and metrics expect a 1-D vector.
    train_labels = pd.DataFrame(y_train).to_numpy().ravel()
    test_labels = pd.DataFrame(y_test).to_numpy().ravel()

    def _fit_and_score(train_part, test_part):
        # A fresh estimator per fit keeps each measurement independent of the others.
        classifier = cls(**estimator_params)
        classifier.fit(train_part, train_labels)
        return f1_score(test_labels, classifier.predict(test_part))

    print(model_name)
    print('Before selection: ', _fit_and_score(X_train, X_test))
    for method, features in selected_features.items():
        score = _fit_and_score(X_train[features], X_test[features])
        print(f'After {method} selection: {score}')

In [17]:
# Score a decision tree on the full and on each reduced feature space.
test_train_estimator(cls=DecisionTreeClassifier, model_name="DECISION TREE")

DECISION TREE
Before selection:  0.9818430969510106
After embedded selection: 0.9777929620772121
After filter selection: 0.934453781512605
After wrapper selection: 0.9787234042553191


In [18]:
# Score gradient boosting on the full and on each reduced feature space.
test_train_estimator(cls=GradientBoostingClassifier, model_name="GRADIENT BOOSTING")

GRADIENT BOOSTING
Before selection:  0.9894378194207837
After embedded selection: 0.9874106839060905
After filter selection: 0.9382875605815832
After wrapper selection: 0.9803523035230353


In [19]:
# Score k-nearest neighbours on the full and on each reduced feature space.
test_train_estimator(cls=KNeighborsClassifier, model_name="KNN")

KNN
Before selection:  0.9674634794156707
After embedded selection: 0.9803921568627451
After filter selection: 0.938255033557047
After wrapper selection: 0.9826471588975842
