# Лабораторная работа №2

**Требования:**
* Python >= 3.X

Лабораторную работу необходимо выполнять в данном шаблоне. Результатом работы будет являться файл (с измененным именем), который необходимо выложить в Moodle.

**Важно!!!** Имя файлу задавайте по следующему шаблону **lab_2_Группа_ФамилияИО.ipynb**. Например: если Вас зовут Иванов Иван Иванович, и Вы обучаетесь в группе 6207_010302D, то имя файла будет выглядеть так **lab_2_6207_010302D_ИвановИИ.ipynb**.

Необходимо провести исследование различных способов представления документов и их влияние на качество определения тональности.

В качестве входных данных к лабораторной работе взят широко известный набор данных IMDB, содержащий 50K обзоров фильмов ([imdb-dataset-of-50k-movie-reviews](https://disk.yandex.ru/i/DDb0zuyUmts5QA)). Откликами являются значения двух классов positive и negative.

In [None]:
# Код загрузки данных
# Если хотите добавить какие-либо библиотеки
# добавляйте их ИМЕННО ЗДЕСЬ
import pandas as pd
import inflect
import re
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV


# Read the IMDB reviews CSV into a DataFrame and preview its first five rows.
imdb_data = pd.read_csv(filepath_or_buffer=r'/IMDB Dataset.csv')
imdb_data.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


#### Шаг №1 Подготовка данных

Обязательно предобработайте данные!



In [None]:
# Shared helpers used by preprocess_text below:
#   infl - inflect engine, used to spell numbers out as words;
#   nlp  - spaCy English pipeline, loaded with the parser and NER disabled.
infl = inflect.engine()
nlp = spacy.load(
    'en_core_web_sm',
    disable=['parser', 'ner'],
)

def preprocess_text(text):
    """Normalize a raw review into a space-separated string of lemmas.

    Steps: lowercase the text, strip HTML tags, drop a fixed set of special
    characters, spell out every run of digits as words, then lemmatize with
    spaCy while skipping stop words, punctuation and whitespace tokens.

    Relies on the module-level ``infl`` (inflect engine) and ``nlp``
    (spaCy pipeline) objects.

    Args:
        text: Raw review text.

    Returns:
        The cleaned review as one string of lemmas joined by single spaces.
    """
    text = text.lower()
    # Replace HTML tags with a space (not an empty string) so that the words
    # on both sides of e.g. "<br />" are not glued into one token.
    text = re.sub(r'<[^>]+>', ' ', text)
    # Remove special characters. A raw string is used so the pattern contains
    # no invalid escape sequences; the character set itself is unchanged.
    text = re.sub(r"[!@#$%^&*(){}£/']", '', text)
    # Spell out each digit run exactly where it occurs. A chain of
    # str.replace calls would corrupt longer numbers that contain a shorter
    # one ("1 and 10" used to become "one and one0").
    text = re.sub(
        r'\d+',
        lambda match: infl.number_to_words(int(match.group())),
        text,
    )
    lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return ' '.join(lemmas)

# Run the preprocessing over every review and keep the result in a new column.
imdb_data['cleaned_review'] = imdb_data['review'].map(preprocess_text)
# Optional: cache the preprocessed frame to disk to avoid recomputing it, e.g.
#   imdb_data.to_csv('preprocessed_reviews.csv', index=False)
imdb_data.head(n=5)

  text = re.sub("[!@#$%^&*(){}£\/'']", '',text)
  numbers = re.findall('(\d+)', text)


Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,reviewer mention watch oz episode ll hook righ...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production film technique una...
2,I thought this was a wonderful way to spend ti...,positive,think wonderful way spend time hot summer week...
3,Basically there's a family where a little boy ...,negative,basically s family little boy jake think s zom...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...


In [None]:
# Encode the textual sentiment labels as integers: positive -> 1, negative -> 0.
sentiment_mapping = dict(positive=1, negative=0)
imdb_data['sentiment_encoded'] = imdb_data['sentiment'].map(sentiment_mapping)
imdb_data.head(n=5)

Unnamed: 0,review,sentiment,cleaned_review,sentiment_encoded
0,One of the other reviewers has mentioned that ...,positive,reviewer mention watch oz episode ll hook righ...,1
1,A wonderful little production. <br /><br />The...,positive,wonderful little production film technique una...,1
2,I thought this was a wonderful way to spend ti...,positive,think wonderful way spend time hot summer week...,1
3,Basically there's a family where a little boy ...,negative,basically s family little boy jake think s zom...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...,1


In [None]:
# Take an independent copy of the first half of the rows; the experiments
# below are run on this copy.
copy_size = imdb_data.shape[0] // 2
imdb_data_cp = imdb_data.iloc[:copy_size].copy()

print("Размер копии:", imdb_data_cp.shape)

Размер копии: (25000, 4)


In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV

# Stratified 80/20 train/test split of the half-size copy (fixed seed).
train_x, test_x, train_y, test_y = train_test_split(
    imdb_data_cp['cleaned_review'],
    imdb_data_cp['sentiment_encoded'],
    test_size=0.20,
    random_state=42,
    stratify=imdb_data_cp['sentiment_encoded'],
)

В качестве исследуемых способов представления текстов необходимо рассмотреть:

#### 1. Компоненты вектора: частоты ([CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)).

In [None]:
def count_vectorize_text_data(train_x, test_x, strip_accents='unicode'):
    """Vectorize train/test texts as term counts.

    A ``CountVectorizer`` is fitted on ``train_x`` only and then applied to
    ``test_x``, so both outputs share the vocabulary learned from the
    training texts.

    Args:
        train_x: Training documents.
        test_x: Test documents.
        strip_accents: Forwarded to ``CountVectorizer``.

    Returns:
        A ``(train_matrix, test_matrix)`` tuple of vectorized documents.
    """
    vectorizer = CountVectorizer(strip_accents=strip_accents)
    train_matrix = vectorizer.fit_transform(train_x)
    test_matrix = vectorizer.transform(test_x)
    return train_matrix, test_matrix

# Term-count representation of the half-size train/test split.
train_x_count_vect, test_x_count_vect = count_vectorize_text_data(
    train_x=train_x,
    test_x=test_x,
)

#### 2. Компоненты вектора: оценки tf-idf для слова ([TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)).

In [None]:
def tfidf_vectorize_text_data(train_x, test_x, strip_accents=None):
    """Vectorize train/test texts as tf-idf weights.

    A ``TfidfVectorizer`` is fitted on ``train_x`` only and then applied to
    ``test_x``, so both outputs share the vocabulary and idf statistics
    learned from the training texts.

    Args:
        train_x: Training documents.
        test_x: Test documents.
        strip_accents: Forwarded to ``TfidfVectorizer``. The default ``None``
            keeps the previous behaviour (no accent stripping); pass
            ``'unicode'`` to match the default of the count-based helpers
            in this notebook.

    Returns:
        A ``(train_matrix, test_matrix)`` tuple of vectorized documents.
    """
    tfidf_vectorizer = TfidfVectorizer(strip_accents=strip_accents)
    train_x_tfidf_vect = tfidf_vectorizer.fit_transform(train_x)
    test_x_tfidf_vect = tfidf_vectorizer.transform(test_x)
    return train_x_tfidf_vect, test_x_tfidf_vect

# Tf-idf representation of the half-size train/test split.
train_x_tfidf_vect, test_x_tfidf_vect = tfidf_vectorize_text_data(
    train_x=train_x,
    test_x=test_x,
)

#### 3. Компоненты вектора: частоты N-грамм.

In [None]:
def ngrams_vectorize_text_data(train_x, test_x, ngram_range=(1, 1), strip_accents='unicode'):
    """Vectorize train/test texts as n-gram counts.

    A ``CountVectorizer`` configured with ``ngram_range`` is fitted on
    ``train_x`` only and then applied to ``test_x``.

    Args:
        train_x: Training documents.
        test_x: Test documents.
        ngram_range: ``(min_n, max_n)`` pair forwarded to ``CountVectorizer``.
        strip_accents: Forwarded to ``CountVectorizer``.

    Returns:
        A ``(train_matrix, test_matrix)`` tuple of vectorized documents.
    """
    vectorizer_options = {
        'ngram_range': ngram_range,
        'strip_accents': strip_accents,
    }
    ngram_vectorizer = CountVectorizer(**vectorizer_options)
    train_matrix = ngram_vectorizer.fit_transform(train_x)
    return train_matrix, ngram_vectorizer.transform(test_x)

# Trigram-only count representation of the half-size train/test split.
train_x_ngrams_vect, test_x_ngrams_vect = ngrams_vectorize_text_data(
    train_x=train_x,
    test_x=test_x,
    ngram_range=(3, 3),
)

### Шаг 2. Исследование моделей

<table>
		<tr>
			<td></td>
			<td>$y = 1$</td>
			<td>$y = 0$</td>
		</tr>
		<tr>
			<td>$a(x) = 1$</td>
			<td>True Positive (TP)</td>
			<td>False Positive (FP)</td>
		</tr>
    	<tr>
			<td>$a(x) = 0$</td>
			<td>False Negative (FN)</td>
			<td>True Negative (TN)</td>
		</tr>
</table>

В зависимости от способа представления оценить качество классификации как долю правильных ответов на выборке ($\operatorname{accuracy} = \frac{\operatorname{TP} + \operatorname{TN}}{\operatorname{TP} + \operatorname{TN} + \operatorname{FP} + \operatorname{FN}}$). Используйте перекрестную проверку ([cross_val_score](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html), [KFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html)).

Для каждой из нижеперечисленных моделей необходимо определить оптимальные гиперпараметры ([GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)).

Качество классификации оцениваем для следующих моделей:

#### 1. Машина опорных векторов ([SVC](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)).

In [None]:
# Classification quality evaluation.
def quality_evaluation(X, y, model, params, *, n_splits=5, n_jobs=12, random_state=42):
    """Grid-search ``model`` over ``params`` with shuffled K-fold CV on accuracy.

    Prints the best hyperparameters and the best mean cross-validation
    accuracy, then returns the fitted search object.

    Args:
        X: Feature matrix passed to ``GridSearchCV.fit``.
        y: Target labels passed to ``GridSearchCV.fit``.
        model: Estimator to tune.
        params: Parameter grid accepted by ``GridSearchCV``.
        n_splits: Number of folds of the shuffled ``KFold`` splitter.
        n_jobs: Number of parallel jobs used by the grid search; the default
            of 12 keeps the previous behaviour, pass ``-1`` to use all cores.
        random_state: Seed of the ``KFold`` shuffling.

    Returns:
        The fitted ``GridSearchCV`` instance.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Search for the best hyperparameters.
    grid_search = GridSearchCV(
        model,
        params,
        cv=kf,
        scoring='accuracy',
        verbose=3,
        n_jobs=n_jobs,
    )
    grid_search.fit(X, y)

    print("Лучшие гиперпараметры:", grid_search.best_params_)
    print("Лучший показатель перекрестной проверки (accuracy):", grid_search.best_score_)
    print()
    return grid_search

In [None]:
# Support-vector classifier and the hyperparameter grid explored for it.
model_svc = SVC()

parameters_svc = dict(
    C=[10, 100],
    gamma=[0.1, 0.001],
    kernel=['rbf', 'linear'],
)

# Tune the SVC on the term-count representation; the tf-idf and n-gram
# searches are run in their own cells below.
grid_search_count_vect_svc = quality_evaluation(
    X=train_x_count_vect,
    y=train_y,
    model=model_svc,
    params=parameters_svc,
)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Лучшие гиперпараметры: {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
Лучший показатель перекрестной проверки (accuracy): 0.8754500000000001



In [None]:
# Tune the SVC on the tf-idf representation.
grid_search_tfidf_vect_svc = quality_evaluation(
    X=train_x_tfidf_vect,
    y=train_y,
    model=model_svc,
    params=parameters_svc,
)


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Лучшие гиперпараметры: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Лучший показатель перекрестной проверки (accuracy): 0.8782



In [None]:
# Tune the SVC on the n-gram count representation.
grid_search_ngrams_vect_svc = quality_evaluation(
    X=train_x_ngrams_vect,
    y=train_y,
    model=model_svc,
    params=parameters_svc,
)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Лучшие гиперпараметры: {'C': 10, 'gamma': 0.1, 'kernel': 'linear'}
Лучший показатель перекрестной проверки (accuracy): 0.6883000000000001



#### 2. Случайный лес ([RandomForestClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)).

In [None]:
# Random forest (default constructor arguments) and its hyperparameter grid.
# NOTE(review): no random_state is set on the forest, so scores may vary
# between runs - confirm whether that is intended.
model_rf = RandomForestClassifier()

parameters_rf = dict(
    n_estimators=[50, 150],
    max_depth=[None, 20],
    min_samples_leaf=[2, 4],
)

# Tune the forest on each of the three text representations in turn.
grid_search_count_vect_rf = quality_evaluation(
    X=train_x_count_vect, y=train_y, model=model_rf, params=parameters_rf,
)
grid_search_tfidf_vect_rf = quality_evaluation(
    X=train_x_tfidf_vect, y=train_y, model=model_rf, params=parameters_rf,
)
grid_search_ngrams_vect_rf = quality_evaluation(
    X=train_x_ngrams_vect, y=train_y, model=model_rf, params=parameters_rf,
)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Лучшие гиперпараметры: {'max_depth': None, 'min_samples_leaf': 2, 'n_estimators': 150}
Лучший показатель перекрестной проверки (accuracy): 0.8567

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Лучшие гиперпараметры: {'max_depth': None, 'min_samples_leaf': 4, 'n_estimators': 150}
Лучший показатель перекрестной проверки (accuracy): 0.8508500000000001

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Лучшие гиперпараметры: {'max_depth': None, 'min_samples_leaf': 2, 'n_estimators': 150}
Лучший показатель перекрестной проверки (accuracy): 0.6657500000000001



### Шаг 3. Сравнение результатов

Сравнить точность обученных моделей. Найти наиболее точную.

In [None]:
# Collect the best cross-validation accuracy and hyperparameters of every
# fitted search, keyed as '<Model>_<representation>_vect'.
model_results = {
    label: {'accuracy': search.best_score_, 'best_params': search.best_params_}
    for label, search in [
        ('SVC_count_vect', grid_search_count_vect_svc),
        ('SVC_tfidf_vect', grid_search_tfidf_vect_svc),
        ('SVC_ngrams_vect', grid_search_ngrams_vect_svc),
        ('RandomForest_count_vect', grid_search_count_vect_rf),
        ('RandomForest_tfidf_vect', grid_search_tfidf_vect_rf),
        ('RandomForest_ngrams_vect', grid_search_ngrams_vect_rf),
    ]
}

# The winner is the entry with the highest cross-validation accuracy.
best_model = max(model_results, key=lambda label: model_results[label]['accuracy'])

print("Наиболее точная модель:", best_model)
print("Лучший показатель точности:", model_results[best_model]['accuracy'])
print("Лучшие параметры:", model_results[best_model]['best_params'])

Наиболее точная модель: SVC_tfidf_vect
Лучший показатель точности: 0.8782
Лучшие параметры: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}


А теперь на всех данных...

In [None]:
# Stratified 80/20 train/test split of the complete dataset (fixed seed).
train_x_all, test_x_all, train_y_all, test_y_all = train_test_split(
    imdb_data['cleaned_review'],
    imdb_data['sentiment_encoded'],
    test_size=0.20,
    random_state=42,
    stratify=imdb_data['sentiment_encoded'],
)

In [None]:
# Re-evaluate the winning configuration with cross-validation on the full
# dataset. best_model has the form '<Model>_<representation>_vect',
# e.g. 'SVC_tfidf_vect'.
best_model_name = best_model.split('_')[0]
best_model_type = best_model.split('_')[1]

# GridSearchCV expects a list of candidates per parameter. Build a NEW dict
# instead of wrapping the values in place: the stored dict is shared with
# model_results (and with the fitted search's best_params_), so mutating it
# would corrupt the recorded results and make a second run of this cell wrap
# the values twice.
best_params = {
    param: [value]
    for param, value in model_results[best_model]['best_params'].items()
}

if best_model_name == 'SVC':
    if best_model_type == 'count':
        train_x_count_vect, test_x_count_vect = count_vectorize_text_data(train_x_all, test_x_all)
        grid_search_count_vect_svc_all = quality_evaluation(train_x_count_vect, train_y_all, model_svc, best_params)
    elif best_model_type == 'tfidf':
        train_x_tfidf_vect, test_x_tfidf_vect = tfidf_vectorize_text_data(train_x_all, test_x_all)
        grid_search_tfidf_vect_svc_all = quality_evaluation(train_x_tfidf_vect, train_y_all, model_svc, best_params)
    elif best_model_type == 'ngrams':
        train_x_ngrams_vect, test_x_ngrams_vect = ngrams_vectorize_text_data(train_x_all, test_x_all, ngram_range=(3, 3))
        grid_search_ngrams_vect_svc_all = quality_evaluation(train_x_ngrams_vect, train_y_all, model_svc, best_params)
elif best_model_name == 'RandomForest':
    if best_model_type == 'count':
        train_x_count_vect, test_x_count_vect = count_vectorize_text_data(train_x_all, test_x_all)
        grid_search_count_vect_rf_all = quality_evaluation(train_x_count_vect, train_y_all, model_rf, best_params)
    elif best_model_type == 'tfidf':
        train_x_tfidf_vect, test_x_tfidf_vect = tfidf_vectorize_text_data(train_x_all, test_x_all)
        grid_search_tfidf_vect_rf_all = quality_evaluation(train_x_tfidf_vect, train_y_all, model_rf, best_params)
    elif best_model_type == 'ngrams':
        train_x_ngrams_vect, test_x_ngrams_vect = ngrams_vectorize_text_data(train_x_all, test_x_all, ngram_range=(3, 3))
        # Stored under the "_all" name like the other branches, so the
        # half-data search result is no longer overwritten.
        grid_search_ngrams_vect_rf_all = quality_evaluation(train_x_ngrams_vect, train_y_all, model_rf, best_params)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Лучшие гиперпараметры: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Лучший показатель перекрестной проверки (accuracy): 0.8875

