# Описание проекта
**Цель проекта:** разработать ML решения для автоматического определения уровня сложности англоязычных фильмов.

**Исходные данные:** размеченный датасет с названиями фильмов, субтитрами и меткой уровня сложности языка (A1/A2/B1/B2/C1/C2).

# Импортируем библиотеки

In [1]:
!pip install pysrt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pysrt
  Downloading pysrt-1.1.2.tar.gz (104 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.4/104.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pysrt
  Building wheel for pysrt (setup.py) ... [?25l[?25hdone
  Created wheel for pysrt: filename=pysrt-1.1.2-py3-none-any.whl size=13443 sha256=b417954ba61b372d0ef5f4c2e2a34530c7903b83aa038fc44778dc78ff45796a
  Stored in directory: /root/.cache/pip/wheels/30/7f/e8/55de9a9b07302d9e7fe47c27910e3bea0c48536153e74bd7e6
Successfully built pysrt
Installing collected packages: pysrt
Successfully installed pysrt-1.1.2


In [2]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


In [3]:
import pandas as pd
import pysrt
import nltk
import os
import glob
import numpy as np
import re
import string
import matplotlib.pyplot as plt
import spacy

from catboost import CatBoostClassifier
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from xgboost import XGBClassifier

In [4]:
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Загрузим данные

In [5]:
## Подключим библиотеки для загрузки файла с GoogleDisk
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
%cd /content/drive/MyDrive/Colab Notebooks/data

/content/drive/MyDrive/Colab Notebooks/data


In [8]:
labels = pd.read_csv('labels.csv')

# Проведем EDA

In [9]:
# Посмотрим на первые строки датасета
labels.head()

Unnamed: 0,Movie,Kinopoisk,Level,Subtitles
0,Forrest Gump,Rus sub,"A2/A2+, B1",Yes
1,Finding Nemo\n,Everything,A2/A2+,Yes
2,Cast away\n,"Paid, Rus sub",A2/A2+,Yes
3,The invisible man (2020)\n,"Paid, Rus lan",A2/A2+,Yes
4,Back to the future\n,Rus sub,A2/A2+,Yes


In [10]:
# Посмотрим на общую информацию о данных
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88 entries, 0 to 87
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Movie       88 non-null     object
 1   Kinopoisk   69 non-null     object
 2   Level       88 non-null     object
 3   Subtitles   88 non-null     object
dtypes: object(4)
memory usage: 2.9+ KB


In [11]:
# ПОсмотрим количество пропусков
labels.isna().sum()

Movie          0
Kinopoisk     19
Level          0
Subtitles      0
dtype: int64

## Создадим столбец с субтитрами к каждому фильму
Совпадение будем искать по первым двум словам в названии фильма, если слов два и более

### Создадим датафрейм из папки с субтитрами, чтобы потом объединить с датафреймом списка фильмов

Зададим путь до папки с субтитрами и создадим список файлов в этой папке

In [12]:
%cd /content/drive/MyDrive/Colab Notebooks/data/Subtitles

/content/drive/MyDrive/Colab Notebooks/data/Subtitles


In [13]:
srt_files = glob.glob(os.path.join("*.srt"))

In [14]:
srt_files

['Die_hard(1988).srt',
 'Liar_liar(1997).srt',
 'The_jungle_book(2016).srt',
 'Back_to_the_future(1985).srt',
 'Harry_Potter_and_the_philosophers_stone(2001).srt',
 'Before_sunrise(1995).srt',
 '10_things_I_hate_about_you(1999).srt',
 'Before_sunset(2004).srt',
 'The_break-up(2006).srt',
 'The_cabin_in_the_woods(2012).srt',
 'Inside_out(2015).srt',
 'Her(2013).srt',
 'The_invisible_man(2020).srt',
 'Good_Will_Hunting(1997).srt',
 'Pulp_fiction(1994).srt',
 'All_dogs_go_to_heaven(1989).srt',
 'House_of_Gucci(2021).srt',
 'Fight_club(1999).srt',
 'Cast_away(2000).srt',
 'Mamma_Mia(2008).srt',
 'Clueless(1995).srt',
 'Bridget_Jones_diary(2001).srt',
 'The_greatest_showman(2017).srt',
 'My_big_fat_Greek_wedding(2002).srt',
 'Forrest_Gump(1994).srt',
 'Shrek(2001).srt',
 'Mrs_Doubtfire(1993).srt',
 'Logan(2017).srt',
 'Love_actually(2003).srt',
 'Soul(2020).srt',
 'Pleasantville(1998).srt',
 'Powder(1995).srt',
 'Knives_out(2019).srt',
 'Home_alone(1990).srt',
 'Batman_begins(2005).srt',
 '

Создадим датафрейм, где первый столбец будет именем файла, а второй содержать субтитры

In [15]:
srt_df = pd.DataFrame(srt_files)

In [16]:
%%time
srt_list = []
for file in srt_files:
    srt_list.append(pysrt.open(file))

CPU times: user 6.73 s, sys: 159 ms, total: 6.89 s
Wall time: 31.7 s


In [17]:
srt_df['srt'] = srt_list

Проверим, что получилось

In [18]:
srt_df.head()

Unnamed: 0,0,srt
0,Die_hard(1988).srt,"[1\n00:00:56,200 --> 00:00:58,300\nYou don't l..."
1,Liar_liar(1997).srt,"[1\n00:00:53,290 --> 00:00:56,920\nW- O\n, 2\n..."
2,The_jungle_book(2016).srt,"[1\n00:00:40,041 --> 00:00:42,085\n(BIRDS AND ..."
3,Back_to_the_future(1985).srt,"[0\n00:01:28,550 --> 00:01:30,259\nOctober is ..."
4,Harry_Potter_and_the_philosophers_stone(2001).srt,"[1\n00:01:22,065 --> 00:01:27,070\nI should've..."


In [19]:
srt_df = srt_df.rename(columns={0 : "file_name", "srt": "srt_text"})

Создадим столбец с коротким названием фильма (из первых двух слов) без знаков препинания и разделителей (чтобы в случае использования разных символов названия фильмов совпали)

In [20]:
srt_df['short_name'] = srt_df['file_name'].str.split("/").str[-1:]

In [21]:
srt_df.head()

Unnamed: 0,file_name,srt_text,short_name
0,Die_hard(1988).srt,"[1\n00:00:56,200 --> 00:00:58,300\nYou don't l...",[Die_hard(1988).srt]
1,Liar_liar(1997).srt,"[1\n00:00:53,290 --> 00:00:56,920\nW- O\n, 2\n...",[Liar_liar(1997).srt]
2,The_jungle_book(2016).srt,"[1\n00:00:40,041 --> 00:00:42,085\n(BIRDS AND ...",[The_jungle_book(2016).srt]
3,Back_to_the_future(1985).srt,"[0\n00:01:28,550 --> 00:01:30,259\nOctober is ...",[Back_to_the_future(1985).srt]
4,Harry_Potter_and_the_philosophers_stone(2001).srt,"[1\n00:01:22,065 --> 00:01:27,070\nI should've...",[Harry_Potter_and_the_philosophers_stone(2001)...


In [22]:
srt_df['short_name'] = [''.join(map(str, l)) for l in srt_df['short_name']]

In [23]:
srt_df.head()

Unnamed: 0,file_name,srt_text,short_name
0,Die_hard(1988).srt,"[1\n00:00:56,200 --> 00:00:58,300\nYou don't l...",Die_hard(1988).srt
1,Liar_liar(1997).srt,"[1\n00:00:53,290 --> 00:00:56,920\nW- O\n, 2\n...",Liar_liar(1997).srt
2,The_jungle_book(2016).srt,"[1\n00:00:40,041 --> 00:00:42,085\n(BIRDS AND ...",The_jungle_book(2016).srt
3,Back_to_the_future(1985).srt,"[0\n00:01:28,550 --> 00:01:30,259\nOctober is ...",Back_to_the_future(1985).srt
4,Harry_Potter_and_the_philosophers_stone(2001).srt,"[1\n00:01:22,065 --> 00:01:27,070\nI should've...",Harry_Potter_and_the_philosophers_stone(2001).srt


In [24]:
srt_df['short_name'] = srt_df['short_name'].str.split("(").str[:1]

In [25]:
srt_df.head()

Unnamed: 0,file_name,srt_text,short_name
0,Die_hard(1988).srt,"[1\n00:00:56,200 --> 00:00:58,300\nYou don't l...",[Die_hard]
1,Liar_liar(1997).srt,"[1\n00:00:53,290 --> 00:00:56,920\nW- O\n, 2\n...",[Liar_liar]
2,The_jungle_book(2016).srt,"[1\n00:00:40,041 --> 00:00:42,085\n(BIRDS AND ...",[The_jungle_book]
3,Back_to_the_future(1985).srt,"[0\n00:01:28,550 --> 00:01:30,259\nOctober is ...",[Back_to_the_future]
4,Harry_Potter_and_the_philosophers_stone(2001).srt,"[1\n00:01:22,065 --> 00:01:27,070\nI should've...",[Harry_Potter_and_the_philosophers_stone]


In [26]:
srt_df['short_name'] = [''.join(map(str, l)) for l in srt_df['short_name']]
srt_df.head()

Unnamed: 0,file_name,srt_text,short_name
0,Die_hard(1988).srt,"[1\n00:00:56,200 --> 00:00:58,300\nYou don't l...",Die_hard
1,Liar_liar(1997).srt,"[1\n00:00:53,290 --> 00:00:56,920\nW- O\n, 2\n...",Liar_liar
2,The_jungle_book(2016).srt,"[1\n00:00:40,041 --> 00:00:42,085\n(BIRDS AND ...",The_jungle_book
3,Back_to_the_future(1985).srt,"[0\n00:01:28,550 --> 00:01:30,259\nOctober is ...",Back_to_the_future
4,Harry_Potter_and_the_philosophers_stone(2001).srt,"[1\n00:01:22,065 --> 00:01:27,070\nI should've...",Harry_Potter_and_the_philosophers_stone


In [27]:
srt_df['short_name'] = srt_df['short_name'].str.split("_").str[:2]
srt_df.head()

Unnamed: 0,file_name,srt_text,short_name
0,Die_hard(1988).srt,"[1\n00:00:56,200 --> 00:00:58,300\nYou don't l...","[Die, hard]"
1,Liar_liar(1997).srt,"[1\n00:00:53,290 --> 00:00:56,920\nW- O\n, 2\n...","[Liar, liar]"
2,The_jungle_book(2016).srt,"[1\n00:00:40,041 --> 00:00:42,085\n(BIRDS AND ...","[The, jungle]"
3,Back_to_the_future(1985).srt,"[0\n00:01:28,550 --> 00:01:30,259\nOctober is ...","[Back, to]"
4,Harry_Potter_and_the_philosophers_stone(2001).srt,"[1\n00:01:22,065 --> 00:01:27,070\nI should've...","[Harry, Potter]"


In [28]:
srt_df['short_name'] = [''.join(map(str, l)) for l in srt_df['short_name']]
srt_df.head()

Unnamed: 0,file_name,srt_text,short_name
0,Die_hard(1988).srt,"[1\n00:00:56,200 --> 00:00:58,300\nYou don't l...",Diehard
1,Liar_liar(1997).srt,"[1\n00:00:53,290 --> 00:00:56,920\nW- O\n, 2\n...",Liarliar
2,The_jungle_book(2016).srt,"[1\n00:00:40,041 --> 00:00:42,085\n(BIRDS AND ...",Thejungle
3,Back_to_the_future(1985).srt,"[0\n00:01:28,550 --> 00:01:30,259\nOctober is ...",Backto
4,Harry_Potter_and_the_philosophers_stone(2001).srt,"[1\n00:01:22,065 --> 00:01:27,070\nI should've...",HarryPotter


In [29]:
srt_df['short_name'] = srt_df['short_name'].str.lower()
srt_df.head()

Unnamed: 0,file_name,srt_text,short_name
0,Die_hard(1988).srt,"[1\n00:00:56,200 --> 00:00:58,300\nYou don't l...",diehard
1,Liar_liar(1997).srt,"[1\n00:00:53,290 --> 00:00:56,920\nW- O\n, 2\n...",liarliar
2,The_jungle_book(2016).srt,"[1\n00:00:40,041 --> 00:00:42,085\n(BIRDS AND ...",thejungle
3,Back_to_the_future(1985).srt,"[0\n00:01:28,550 --> 00:01:30,259\nOctober is ...",backto
4,Harry_Potter_and_the_philosophers_stone(2001).srt,"[1\n00:01:22,065 --> 00:01:27,070\nI should've...",harrypotter


In [30]:
srt_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86 entries, 0 to 85
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   file_name   86 non-null     object
 1   srt_text    86 non-null     object
 2   short_name  86 non-null     object
dtypes: object(3)
memory usage: 2.1+ KB


### Создадим столбец short_name в таблице с фильмами

In [31]:
labels.head()

Unnamed: 0,Movie,Kinopoisk,Level,Subtitles
0,Forrest Gump,Rus sub,"A2/A2+, B1",Yes
1,Finding Nemo\n,Everything,A2/A2+,Yes
2,Cast away\n,"Paid, Rus sub",A2/A2+,Yes
3,The invisible man (2020)\n,"Paid, Rus lan",A2/A2+,Yes
4,Back to the future\n,Rus sub,A2/A2+,Yes


In [32]:
labels['short_name'] = labels['Movie'].str.split("\n")
labels['short_name'] = labels['short_name'].str[:1]
labels['short_name'] = [''.join(map(str, l)) for l in labels['short_name']]

In [33]:
labels.head()

Unnamed: 0,Movie,Kinopoisk,Level,Subtitles,short_name
0,Forrest Gump,Rus sub,"A2/A2+, B1",Yes,Forrest Gump
1,Finding Nemo\n,Everything,A2/A2+,Yes,Finding Nemo
2,Cast away\n,"Paid, Rus sub",A2/A2+,Yes,Cast away
3,The invisible man (2020)\n,"Paid, Rus lan",A2/A2+,Yes,The invisible man (2020)
4,Back to the future\n,Rus sub,A2/A2+,Yes,Back to the future


In [34]:
labels['short_name'] = labels['short_name'].str.replace("  "," ")

In [35]:
labels['short_name'] = labels['short_name'].str.split(' ')
labels['short_name'] = labels['short_name'].str[:2]
labels['short_name'] = [''.join(map(str, l)) for l in labels['short_name']]

In [36]:
labels['short_name'] = labels['short_name'].str.replace("’","").str.replace(",","").str.replace(".","").str.lower()

  labels['short_name'] = labels['short_name'].str.replace("’","").str.replace(",","").str.replace(".","").str.lower()


Вручную переименуем We're the Millers и It's a wonderful ...

In [37]:
labels.loc[labels['short_name'] == 'werethe', 'short_name'] = 'weare'
labels.loc[labels['short_name'] == 'itsa', 'short_name'] = 'its'

In [38]:
labels.head()

Unnamed: 0,Movie,Kinopoisk,Level,Subtitles,short_name
0,Forrest Gump,Rus sub,"A2/A2+, B1",Yes,forrestgump
1,Finding Nemo\n,Everything,A2/A2+,Yes,findingnemo
2,Cast away\n,"Paid, Rus sub",A2/A2+,Yes,castaway
3,The invisible man (2020)\n,"Paid, Rus lan",A2/A2+,Yes,theinvisible
4,Back to the future\n,Rus sub,A2/A2+,Yes,backto


In [39]:
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88 entries, 0 to 87
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Movie       88 non-null     object
 1   Kinopoisk   69 non-null     object
 2   Level       88 non-null     object
 3   Subtitles   88 non-null     object
 4   short_name  88 non-null     object
dtypes: object(5)
memory usage: 3.6+ KB


In [40]:
srt_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86 entries, 0 to 85
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   file_name   86 non-null     object
 1   srt_text    86 non-null     object
 2   short_name  86 non-null     object
dtypes: object(3)
memory usage: 2.1+ KB


### Объединим таблицы

In [41]:
big_df = labels.merge(srt_df, on='short_name' , how='outer')
big_df.head()

Unnamed: 0,Movie,Kinopoisk,Level,Subtitles,short_name,file_name,srt_text
0,Forrest Gump,Rus sub,"A2/A2+, B1",Yes,forrestgump,Forrest_Gump(1994).srt,"[1\n00:03:19,349 --> 00:03:20,599\nHello.\n, 2..."
1,Finding Nemo\n,Everything,A2/A2+,Yes,findingnemo,Finding_Nemo(2003).srt,"[1\n00:00:03,203 --> 00:00:05,194\n[Music play..."
2,Cast away\n,"Paid, Rus sub",A2/A2+,Yes,castaway,Cast_away(2000).srt,"[1\n00:00:05,000 --> 00:00:15,000\nCreated and..."
3,The invisible man (2020)\n,"Paid, Rus lan",A2/A2+,Yes,theinvisible,The_invisible_man(2020).srt,"[1\n00:02:56,135 --> 00:02:57,218\nAdrian?\n, ..."
4,Back to the future\n,Rus sub,A2/A2+,Yes,backto,Back_to_the_future(1985).srt,"[0\n00:01:28,550 --> 00:01:30,259\nOctober is ..."


In [42]:
big_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 88 entries, 0 to 87
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Movie       88 non-null     object
 1   Kinopoisk   69 non-null     object
 2   Level       88 non-null     object
 3   Subtitles   88 non-null     object
 4   short_name  88 non-null     object
 5   file_name   86 non-null     object
 6   srt_text    86 non-null     object
dtypes: object(7)
memory usage: 5.5+ KB


In [43]:
big_df.isnull().sum()

Movie          0
Kinopoisk     19
Level          0
Subtitles      0
short_name     0
file_name      2
srt_text       2
dtype: int64

In [44]:
big_df[big_df.isnull().any(1)]

  big_df[big_df.isnull().any(1)]


Unnamed: 0,Movie,Kinopoisk,Level,Subtitles,short_name,file_name,srt_text
37,Pulp Fiction,,B2,Yes,pulpfiction,Pulp_fiction(1994).srt,"[1\n00:00:28,831 --> 00:00:32,927\nForget it. ..."
41,The hangover,,B2,Yes,thehangover,The_hangover(2009).srt,"[1\n00:00:38,580 --> 00:00:40,581\n[PHONE LINE..."
49,The king’s speech,,B2,Yes,thekings,The_kings_speech(2010).srt,"[1\n00:01:09,610 --> 00:01:13,860\n[Inaudible]..."
51,Beauty and the beast (film),,B2,Yes,beautyand,Beauty_and_the_beast(2017).srt,"[1\n00:00:43,600 --> 00:00:45,956\nOnce upon a..."
52,Before I go to sleep,,B2,Yes,beforei,Before_I_go_to_sleep(2014).srt,"[1\n00:02:50,375 --> 00:02:51,957\nWho are you..."
58,Lie to me (series),,"B1, B2",No,lieto,,
75,The Shawshank Redemption,,"B1, B2",Yes,theshawshank,The_Shawshank_redemption(1994).srt,"[1\n00:00:17,142 --> 00:00:19,226\n♪ THE INK S..."
76,Logan,,B1,Yes,logan,Logan(2017).srt,"[1\n00:00:58,157 --> 00:00:59,715\nMAN 1: We g..."
77,Braveheart,,B2,Yes,braveheart,Braveheart(1995).srt,"[1\n00:01:39,760 --> 00:01:43,190\n<i>I shall ..."
78,Moulin Rouge 🎙️,,"A2/A2+, B1",No,moulinrouge,Moulin_Rouge(2001).srt,"[1\n00:00:05,000 --> 00:00:12,000\nCreated and..."


Удалим из данных фильмы без субтитров

In [45]:
big_df = big_df.dropna(subset=['srt_text'])

In [46]:
big_df[big_df.isnull().any(1)]

  big_df[big_df.isnull().any(1)]


Unnamed: 0,Movie,Kinopoisk,Level,Subtitles,short_name,file_name,srt_text
37,Pulp Fiction,,B2,Yes,pulpfiction,Pulp_fiction(1994).srt,"[1\n00:00:28,831 --> 00:00:32,927\nForget it. ..."
41,The hangover,,B2,Yes,thehangover,The_hangover(2009).srt,"[1\n00:00:38,580 --> 00:00:40,581\n[PHONE LINE..."
49,The king’s speech,,B2,Yes,thekings,The_kings_speech(2010).srt,"[1\n00:01:09,610 --> 00:01:13,860\n[Inaudible]..."
51,Beauty and the beast (film),,B2,Yes,beautyand,Beauty_and_the_beast(2017).srt,"[1\n00:00:43,600 --> 00:00:45,956\nOnce upon a..."
52,Before I go to sleep,,B2,Yes,beforei,Before_I_go_to_sleep(2014).srt,"[1\n00:02:50,375 --> 00:02:51,957\nWho are you..."
75,The Shawshank Redemption,,"B1, B2",Yes,theshawshank,The_Shawshank_redemption(1994).srt,"[1\n00:00:17,142 --> 00:00:19,226\n♪ THE INK S..."
76,Logan,,B1,Yes,logan,Logan(2017).srt,"[1\n00:00:58,157 --> 00:00:59,715\nMAN 1: We g..."
77,Braveheart,,B2,Yes,braveheart,Braveheart(1995).srt,"[1\n00:01:39,760 --> 00:01:43,190\n<i>I shall ..."
78,Moulin Rouge 🎙️,,"A2/A2+, B1",No,moulinrouge,Moulin_Rouge(2001).srt,"[1\n00:00:05,000 --> 00:00:12,000\nCreated and..."
79,The Greatest Showman 🎙️,,A2/A2+,Yes,thegreatest,The_greatest_showman(2017).srt,"[1\n00:00:09,877 --> 00:00:13,081\n♪ Whoa ♪\n,..."


## Обработка субтитров

### WordNetLemmatizer

In [47]:
sw = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

Создадим функцию для очистки текстов

In [48]:
def clean_text(text):
    '''
    функция принимает на вход текст, возвращает очищенный от лишних знаков и слов текст
    '''
    text = text.text.lower()

    text = re.sub(r"[^a-zA-Z]+", " ", text) # заменим всё, кроме указанного на пробелы

    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    for p in punctuations:
        text = text.replace(p,'') # заменим пунктуацию на пробелы

    text = [word.lower() for word in text.split() if word.lower() not in sw] # удалим стоп слова

    text = [lemmatizer.lemmatize(word) for word in text]

    text = " ".join(text)

    return text

Применим функцию очистки к субтитрам

In [49]:
%time
big_df['srt_txt'] = big_df['srt_text'].apply(lambda x: clean_text(x))

CPU times: user 18 µs, sys: 1e+03 ns, total: 19 µs
Wall time: 23.8 µs


In [50]:
big_df.head()

Unnamed: 0,Movie,Kinopoisk,Level,Subtitles,short_name,file_name,srt_text,srt_txt
0,Forrest Gump,Rus sub,"A2/A2+, B1",Yes,forrestgump,Forrest_Gump(1994).srt,"[1\n00:03:19,349 --> 00:03:20,599\nHello.\n, 2...",hello name forrest forrest gump want chocolate...
1,Finding Nemo\n,Everything,A2/A2+,Yes,findingnemo,Finding_Nemo(2003).srt,"[1\n00:00:03,203 --> 00:00:05,194\n[Music play...",music playing advertise product brand br conta...
2,Cast away\n,"Paid, Rus sub",A2/A2+,Yes,castaway,Cast_away(2000).srt,"[1\n00:00:05,000 --> 00:00:15,000\nCreated and...",created encoded bokutox www yify torrent com b...
3,The invisible man (2020)\n,"Paid, Rus lan",A2/A2+,Yes,theinvisible,The_invisible_man(2020).srt,"[1\n00:02:56,135 --> 00:02:57,218\nAdrian?\n, ...",adrian come zeus sorry take sorry shit gonna l...
4,Back to the future\n,Rus sub,A2/A2+,Yes,backto,Back_to_the_future(1985).srt,"[0\n00:01:28,550 --> 00:01:30,259\nOctober is ...",october inventory time right statler toyota ma...


### Оставим для всех фильмов с несколькими уровнями сложности только один уровень сложности (высший)

In [51]:
big_df['Level'].isnull().sum()

0

In [52]:
big_df = big_df.dropna(subset=['Level'])

In [53]:
big_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86 entries, 0 to 87
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Movie       86 non-null     object
 1   Kinopoisk   69 non-null     object
 2   Level       86 non-null     object
 3   Subtitles   86 non-null     object
 4   short_name  86 non-null     object
 5   file_name   86 non-null     object
 6   srt_text    86 non-null     object
 7   srt_txt     86 non-null     object
dtypes: object(8)
memory usage: 6.0+ KB


In [54]:
big_df['Level'].unique()

array(['A2/A2+, B1', 'A2/A2+', 'B1', 'B1, B2', 'B2'], dtype=object)

In [55]:
big_df.loc[(big_df['Level'] == 'A2/A2+, B1'), 'Level'] = 0
big_df.loc[(big_df['Level'] == 'B1'), 'Level'] = 0
big_df.loc[(big_df['Level'] == 'A2/A2+'), 'Level'] = 2
big_df.loc[(big_df['Level'] == 'A2/A2+'), 'Level'] = 2
big_df.loc[(big_df['Level'] == 'B1, B2'), 'Level'] = 1
big_df.loc[(big_df['Level'] == 'B2'), 'Level'] = 1
big_df['Level'].unique()

array([0, 2, 1], dtype=object)

In [56]:
big_df['Level'].value_counts()

0    33
1    27
2    26
Name: Level, dtype: int64

In [57]:
big_df['Level'] = big_df['Level'].astype(int)

# Разобьем данные на тренировочные и тестовые

In [58]:
X_train, X_test , y_train, y_test = \
    train_test_split(big_df['srt_txt'].values,big_df['Level'].values,test_size=0.2,random_state=123,\
                     stratify=big_df['Level'].values)

In [59]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_train_vectors = tfidf_vectorizer.fit_transform(X_train)
tfidf_test_vectors = tfidf_vectorizer.transform(X_test)

In [60]:
tfidf_train_vectors.shape

(68, 18501)

In [61]:
tfidf_test_vectors.shape

(18, 18501)

#Используем модель XGBoostClassifier с GridSearchCV

In [62]:
%%time
xgb = XGBClassifier(random_state=1)
parametrs = {'eta' : (0.01, 0.1, 0.01),
             'max_depth' : (2, 10, 2),
             'n_estimators' : [32, 64, 128]}
grid_xgb = GridSearchCV(xgb, parametrs)
grid_xgb.fit(tfidf_train_vectors, y_train)
print('best_score =', grid_xgb.best_score_)
print('best params', grid_xgb.best_params_)

best_score = 0.5307692307692308
best params {'eta': 0.1, 'max_depth': 10, 'n_estimators': 32}
CPU times: user 7min 34s, sys: 1.11 s, total: 7min 35s
Wall time: 7min 39s


In [63]:
columns=['lemmatizer', 'model', 'parametrs', 'result']
XGBC_result = ['WordNet', 'XGBClassifier', grid_xgb.best_params_, grid_xgb.best_score_]
df_results = pd.DataFrame([XGBC_result], columns=columns)
df_results

Unnamed: 0,lemmatizer,model,parametrs,result
0,WordNet,XGBClassifier,"{'eta': 0.1, 'max_depth': 10, 'n_estimators': 32}",0.530769


#Используем модель CatBoostClassifier с GridSearchCV

In [64]:
%%time
clf = CatBoostClassifier()
parametrs = {'iterations' : [100],
             'learning_rate' : (0.01, 0.09, 0.02)
}
grid = GridSearchCV(clf, parametrs)
grid.fit(tfidf_train_vectors, y_train)
print('best_score =', grid.best_score_)
print('best params', grid.best_params_)

0:	learn: 1.0962034	total: 891ms	remaining: 1m 28s
1:	learn: 1.0944411	total: 1.65s	remaining: 1m 20s
2:	learn: 1.0931123	total: 2.37s	remaining: 1m 16s
3:	learn: 1.0910639	total: 3.07s	remaining: 1m 13s
4:	learn: 1.0888398	total: 3.98s	remaining: 1m 15s
5:	learn: 1.0866358	total: 5.34s	remaining: 1m 23s
6:	learn: 1.0844230	total: 6.5s	remaining: 1m 26s
7:	learn: 1.0817962	total: 7.7s	remaining: 1m 28s
8:	learn: 1.0799339	total: 8.88s	remaining: 1m 29s
9:	learn: 1.0779824	total: 9.96s	remaining: 1m 29s
10:	learn: 1.0754399	total: 11.1s	remaining: 1m 29s
11:	learn: 1.0738929	total: 12.1s	remaining: 1m 28s
12:	learn: 1.0721692	total: 13.1s	remaining: 1m 27s
13:	learn: 1.0702781	total: 15.2s	remaining: 1m 33s
14:	learn: 1.0689134	total: 17.3s	remaining: 1m 38s
15:	learn: 1.0672754	total: 19.2s	remaining: 1m 40s
16:	learn: 1.0662391	total: 20.3s	remaining: 1m 39s
17:	learn: 1.0639215	total: 21.4s	remaining: 1m 37s
18:	learn: 1.0623275	total: 22.5s	remaining: 1m 36s
19:	learn: 1.0601384	tot

In [65]:
cbc_result = ['WordNet', 'CatBoostClassifier', grid.best_params_, grid.best_score_]
df_results = pd.concat([df_results, pd.DataFrame([cbc_result], columns=columns)]).reset_index(drop=True)

In [66]:
df_results

Unnamed: 0,lemmatizer,model,parametrs,result
0,WordNet,XGBClassifier,"{'eta': 0.1, 'max_depth': 10, 'n_estimators': 32}",0.530769
1,WordNet,CatBoostClassifier,"{'iterations': 100, 'learning_rate': 0.09}",0.454945


#Используем модель RandomForestClassifier с GridSearchCV

In [67]:
%%time
rfc = RandomForestClassifier(random_state=123)
parametrs = {'n_estimators' : (30, 60, 3),
             'max_depth' : range(15, 25, 2),
            'min_samples_split' : range(2, 10, 2),
            'min_samples_leaf' : range(1, 10)}
grid_rfc = GridSearchCV(rfc, parametrs)
grid_rfc.fit(tfidf_train_vectors, y_train)
print('best_score =', grid_rfc.best_score_)
print('best params', grid_rfc.best_params_)

best_score = 0.5307692307692308
best params {'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 8, 'n_estimators': 60}
CPU times: user 3min 29s, sys: 1.05 s, total: 3min 30s
Wall time: 3min 31s


In [68]:
rfc_result = ['WordNet', 'RandomForestClassifier', grid_rfc.best_params_, grid_rfc.best_score_]
df_results = pd.concat([df_results, pd.DataFrame([rfc_result], columns=columns)]).reset_index(drop=True)

In [69]:
df_results

Unnamed: 0,lemmatizer,model,parametrs,result
0,WordNet,XGBClassifier,"{'eta': 0.1, 'max_depth': 10, 'n_estimators': 32}",0.530769
1,WordNet,CatBoostClassifier,"{'iterations': 100, 'learning_rate': 0.09}",0.454945
2,WordNet,RandomForestClassifier,"{'max_depth': 15, 'min_samples_leaf': 2, 'min_...",0.530769


# Попробуем SpaCy лемматизатор

In [70]:
load_model = spacy.load('en_core_web_sm', disable = ['parser','ner'])
en = spacy.load('en_core_web_sm')
sw_spacy = en.Defaults.stop_words

In [71]:
def clean_text_spacy(text):

    doc = load_model(text.text)

    empty_list = []
    for token in doc:
        empty_list.append(token.lemma_)

    final_string = ' '.join(map(str,empty_list))

    sub = final_string.lower()

    sub = re.sub(r"[^a-zA-Z]+", " ", sub)

    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    for p in punctuations:
        sub = sub.replace(p,'')

    sub = [word.lower() for word in sub.split() if word.lower() not in sw_spacy]

    return sub

In [72]:
%%time
big_df['srt_txt_spacy'] = big_df['srt_text'].apply(lambda x: clean_text_spacy(x))

CPU times: user 1min 52s, sys: 13.7 s, total: 2min 6s
Wall time: 2min 7s


In [73]:
big_df.head()

Unnamed: 0,Movie,Kinopoisk,Level,Subtitles,short_name,file_name,srt_text,srt_txt,srt_txt_spacy
0,Forrest Gump,Rus sub,0,Yes,forrestgump,Forrest_Gump(1994).srt,"[1\n00:03:19,349 --> 00:03:20,599\nHello.\n, 2...",hello name forrest forrest gump want chocolate...,"[hello, s, forrest, forrest, gump, want, choco..."
1,Finding Nemo\n,Everything,2,Yes,findingnemo,Finding_Nemo(2003).srt,"[1\n00:00:03,203 --> 00:00:05,194\n[Music play...",music playing advertise product brand br conta...,"[music, playing, advertise, product, brand, br..."
2,Cast away\n,"Paid, Rus sub",2,Yes,castaway,Cast_away(2000).srt,"[1\n00:00:05,000 --> 00:00:15,000\nCreated and...",created encoded bokutox www yify torrent com b...,"[create, encode, bokutox, www, yify, torrents,..."
3,The invisible man (2020)\n,"Paid, Rus lan",2,Yes,theinvisible,The_invisible_man(2020).srt,"[1\n00:02:56,135 --> 00:02:57,218\nAdrian?\n, ...",adrian come zeus sorry take sorry shit gonna l...,"[adrian, come, zeus, sorry, sorry, shit, leave..."
4,Back to the future\n,Rus sub,2,Yes,backto,Back_to_the_future(1985).srt,"[0\n00:01:28,550 --> 00:01:30,259\nOctober is ...",october inventory time right statler toyota ma...,"[october, inventory, time, right, statler, toy..."


In [74]:
big_df['srt_txt_spacy'] = [''.join(map(str, l)) for l in big_df['srt_txt_spacy']]
big_df.head()

Unnamed: 0,Movie,Kinopoisk,Level,Subtitles,short_name,file_name,srt_text,srt_txt,srt_txt_spacy
0,Forrest Gump,Rus sub,0,Yes,forrestgump,Forrest_Gump(1994).srt,"[1\n00:03:19,349 --> 00:03:20,599\nHello.\n, 2...",hello name forrest forrest gump want chocolate...,hellosforrestforrestgumpwantchocolateeatmillio...
1,Finding Nemo\n,Everything,2,Yes,findingnemo,Finding_Nemo(2003).srt,"[1\n00:00:03,203 --> 00:00:05,194\n[Music play...",music playing advertise product brand br conta...,musicplayingadvertiseproductbrandbrcontactwwwo...
2,Cast away\n,"Paid, Rus sub",2,Yes,castaway,Cast_away(2000).srt,"[1\n00:00:05,000 --> 00:00:15,000\nCreated and...",created encoded bokutox www yify torrent com b...,createencodebokutoxwwwyifytorrentscomgoodppdmo...
3,The invisible man (2020)\n,"Paid, Rus lan",2,Yes,theinvisible,The_invisible_man(2020).srt,"[1\n00:02:56,135 --> 00:02:57,218\nAdrian?\n, ...",adrian come zeus sorry take sorry shit gonna l...,adriancomezeussorrysorryshitleavethingokaycome...
4,Back to the future\n,Rus sub,2,Yes,backto,Back_to_the_future(1985).srt,"[0\n00:01:28,550 --> 00:01:30,259\nOctober is ...",october inventory time right statler toyota ma...,octoberinventorytimerightstatlertoyotagooddeal...


In [75]:
%%time
X_train_spacy, X_test_spacy, y_train, y_test = \
    train_test_split(big_df['srt_txt_spacy'],big_df['Level'], test_size=0.2,random_state=123,\
                     stratify=big_df['Level'])

CPU times: user 3.66 ms, sys: 0 ns, total: 3.66 ms
Wall time: 6.7 ms


In [76]:
X_train_spacy

38    ladygentlemanbegindescentlosangelessoundhearla...
23    peoplechatteringphoneringingmananswerphonewoma...
54    meterokaybowrailokaymirbowstayokayquietrollcom...
51    timehiddenheartfrancehandsomeyoungprincelivebe...
75    inkspotcarecarewordcarefeelwaylovethrillheadro...
                            ...                        
0     hellosforrestforrestgumpwantchocolateeatmillio...
4     octoberinventorytimerightstatlertoyotagooddeal...
3     adriancomezeussorrysorryshitleavethingokaycome...
14    rachelletfinderskeeperfindgardenfinderskeeperb...
33    watchmoviesserieslivetvbrwwwflixifyappfemalewh...
Name: srt_txt_spacy, Length: 68, dtype: object

In [77]:
%%time
def dummy(doc):
    return doc

tfidf_vectorizer_spacy = TfidfVectorizer(tokenizer=dummy, preprocessor=dummy)
tfidf_train_vectors_spacy = tfidf_vectorizer_spacy.fit_transform(X_train_spacy)
tfidf_test_vectors_spacy = tfidf_vectorizer_spacy.transform(X_test_spacy)



CPU times: user 506 ms, sys: 587 µs, total: 506 ms
Wall time: 539 ms


In [78]:
tfidf_train_vectors.shape

(68, 18501)

In [79]:
tfidf_train_vectors

<68x18501 sparse matrix of type '<class 'numpy.float64'>'
	with 92738 stored elements in Compressed Sparse Row format>

In [80]:
tfidf_test_vectors.shape

(18, 18501)

##Используем модель XGBoostClassifier с GridSearchCV

In [81]:
%%time
xgb_s = XGBClassifier(random_state=1)
parametrs = {'eta' : (0.01, 0.1, 0.01),
             'max_depth' : (2, 10, 2),
             'n_estimators' : [32, 64, 128]}
grid_xgb_s = GridSearchCV(xgb_s, parametrs)
grid_xgb_s.fit(tfidf_train_vectors_spacy, y_train)
print('best_score =', grid_xgb_s.best_score_)
print('best params', grid_xgb_s.best_params_)

best_score = 0.45494505494505494
best params {'eta': 0.1, 'max_depth': 2, 'n_estimators': 128}
CPU times: user 8.1 s, sys: 357 ms, total: 8.45 s
Wall time: 8.51 s


In [82]:
xgbc_s_result = ['Spacy', 'XGBClassifier', grid_xgb_s.best_params_, grid_xgb_s.best_score_]
df_results = pd.concat([df_results, pd.DataFrame([xgbc_s_result], columns=columns)]).reset_index(drop=True)

In [83]:
df_results

Unnamed: 0,lemmatizer,model,parametrs,result
0,WordNet,XGBClassifier,"{'eta': 0.1, 'max_depth': 10, 'n_estimators': 32}",0.530769
1,WordNet,CatBoostClassifier,"{'iterations': 100, 'learning_rate': 0.09}",0.454945
2,WordNet,RandomForestClassifier,"{'max_depth': 15, 'min_samples_leaf': 2, 'min_...",0.530769
3,Spacy,XGBClassifier,"{'eta': 0.1, 'max_depth': 2, 'n_estimators': 128}",0.454945


##Используем модель CatBoostClassifier с GridSearchCV

In [84]:
%%time
clf_s = CatBoostClassifier()
parametrs = {'iterations' : [100],
             'learning_rate' : (0.01, 0.09, 0.02)
}
grid_s = GridSearchCV(clf_s, parametrs)
grid_s.fit(tfidf_train_vectors, y_train)
print('best_score =', grid_s.best_score_)
print('best params', grid_s.best_params_)

0:	learn: 1.0962034	total: 471ms	remaining: 46.6s
1:	learn: 1.0944411	total: 963ms	remaining: 47.2s
2:	learn: 1.0931123	total: 1.41s	remaining: 45.7s
3:	learn: 1.0910639	total: 2.23s	remaining: 53.5s
4:	learn: 1.0888398	total: 3.04s	remaining: 57.7s
5:	learn: 1.0866358	total: 3.81s	remaining: 59.7s
6:	learn: 1.0844230	total: 4.61s	remaining: 1m 1s
7:	learn: 1.0817962	total: 5.53s	remaining: 1m 3s
8:	learn: 1.0799339	total: 7.09s	remaining: 1m 11s
9:	learn: 1.0779824	total: 8.41s	remaining: 1m 15s
10:	learn: 1.0754399	total: 9.68s	remaining: 1m 18s
11:	learn: 1.0738929	total: 10.8s	remaining: 1m 18s
12:	learn: 1.0721692	total: 11.9s	remaining: 1m 19s
13:	learn: 1.0702781	total: 13s	remaining: 1m 19s
14:	learn: 1.0689134	total: 14s	remaining: 1m 19s
15:	learn: 1.0672754	total: 15.1s	remaining: 1m 19s
16:	learn: 1.0662391	total: 16.1s	remaining: 1m 18s
17:	learn: 1.0639215	total: 17.5s	remaining: 1m 19s
18:	learn: 1.0623275	total: 19.3s	remaining: 1m 22s
19:	learn: 1.0601384	total: 21.5s	

In [85]:
cbc_s_result = ['Spacy', 'CatBoostClassifier', grid_s.best_params_, grid_s.best_score_]
df_results = pd.concat([df_results, pd.DataFrame([cbc_s_result], columns=columns)]).reset_index(drop=True)

In [86]:
df_results

Unnamed: 0,lemmatizer,model,parametrs,result
0,WordNet,XGBClassifier,"{'eta': 0.1, 'max_depth': 10, 'n_estimators': 32}",0.530769
1,WordNet,CatBoostClassifier,"{'iterations': 100, 'learning_rate': 0.09}",0.454945
2,WordNet,RandomForestClassifier,"{'max_depth': 15, 'min_samples_leaf': 2, 'min_...",0.530769
3,Spacy,XGBClassifier,"{'eta': 0.1, 'max_depth': 2, 'n_estimators': 128}",0.454945
4,Spacy,CatBoostClassifier,"{'iterations': 100, 'learning_rate': 0.09}",0.454945


##Используем модель RandomForestClassifier с GridSearchCV

In [87]:
%%time
rfc_s = RandomForestClassifier(random_state=123)
parametrs = {'n_estimators' : (30, 60, 3),
             'max_depth' : range(15, 25, 2),
            'min_samples_split' : range(2, 10, 2),
            'min_samples_leaf' : range(1, 10)}
grid_rfc_s = GridSearchCV(rfc_s, parametrs)
grid_rfc_s.fit(tfidf_train_vectors, y_train)
print('best_score =', grid_rfc_s.best_score_)
print('best params', grid_rfc_s.best_params_)

best_score = 0.5307692307692308
best params {'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 8, 'n_estimators': 60}
CPU times: user 3min 38s, sys: 1.03 s, total: 3min 39s
Wall time: 3min 41s


In [88]:
rfc_s_result = ['Spacy', 'RandomForestClassifier', grid_rfc_s.best_params_, grid_rfc_s.best_score_]
df_results = pd.concat([df_results, pd.DataFrame([rfc_s_result], columns=columns)]).reset_index(drop=True)

In [89]:
df_results

Unnamed: 0,lemmatizer,model,parametrs,result
0,WordNet,XGBClassifier,"{'eta': 0.1, 'max_depth': 10, 'n_estimators': 32}",0.530769
1,WordNet,CatBoostClassifier,"{'iterations': 100, 'learning_rate': 0.09}",0.454945
2,WordNet,RandomForestClassifier,"{'max_depth': 15, 'min_samples_leaf': 2, 'min_...",0.530769
3,Spacy,XGBClassifier,"{'eta': 0.1, 'max_depth': 2, 'n_estimators': 128}",0.454945
4,Spacy,CatBoostClassifier,"{'iterations': 100, 'learning_rate': 0.09}",0.454945
5,Spacy,RandomForestClassifier,"{'max_depth': 15, 'min_samples_leaf': 2, 'min_...",0.530769


# Тестирование модели
Протестируем модель RandomForestClassifier с использованием лемматизатора Spacy

In [90]:
best_model = RandomForestClassifier(random_state=123, max_depth=15,
                                    min_samples_leaf=2, min_samples_split=8, n_estimators=60)

In [91]:
best_model.fit(tfidf_train_vectors, y_train)

In [92]:
best_model.score(tfidf_test_vectors, y_test)

0.2777777777777778

# Вывод:
Результат лучшей модели на тестовой выборке получился 0.277, что является плохим результатом. Пользоваться таким классификатором не имеет смысла, так как его предсказания ниже случайного выбора (у нас 3 класса).
Для создания лучшего классификатора можно попробовать увеличить датасет, собрав из интернета фильмы с субтятрами с указанными уровнями сложности, и для векторизации субтитров можно использовать предобученную модель Bert.