Задача:
Разработать ML решение для автоматического определения уровня сложности англоязычных фильмов.

Цель:
Получить значение метрики качества (accuracy) не ниже 0.6.

In [1]:
import numpy as np
import gensim
from gensim.models import Word2Vec

from nltk.tokenize import RegexpTokenizer

import spacy
from string import punctuation

import os
import pandas as pd
import srt
from chardet import detect

import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn import metrics

# Seed passed to train_test_split so the train/test partition is reproducible.
RANDOM_STATE = 10
# Directory that is walked recursively for *.srt subtitle files.
dir_path = "./Subtitles_all"
# Excel workbook with the per-movie difficulty labels.
df_excel_path = "./movies_labels.xlsx"
# Binary word2vec-format file with the pre-trained GoogleNews vectors.
word2vec_path = "./GoogleNews-vectors-negative300.bin.gz"

Загрузим файлы и проведем их перекодировку

In [2]:
# Collect one (movie, subs_raw) record per subtitle file and build the frame
# once at the end; concatenating inside the loop copies the whole frame on
# every iteration.
subtitle_rows = []

for root, _dirs, files in os.walk(dir_path):
    for file_name in files:
        if not file_name.endswith(".srt"):
            continue
        file_path = os.path.join(root, file_name)

        # Sniff the encoding from the raw bytes; close the binary handle
        # before the file is reopened in text mode.
        with open(file_path, "rb") as raw_file:
            detected = detect(raw_file.read())["encoding"]
        # chardet reports None when it cannot guess (e.g. an empty file);
        # fall back to UTF-8 rather than passing the bogus codec name "None".
        coding = detected or "utf-8"

        with open(file_path, "r", encoding=coding) as source_file:
            contents = source_file.read()

        # Join the text of every subtitle cue into a single string per movie.
        subs = " ".join(cue.content for cue in srt.parse(contents))
        # The movie key is the file name without its ".srt" extension.
        movie = os.path.splitext(file_name)[0]
        subtitle_rows.append({"movie": movie, "subs_raw": subs})

df_subtitles = pd.DataFrame(subtitle_rows, columns=["movie", "subs_raw"])

In [3]:
df_subtitles

Unnamed: 0,movie,subs_raw
0,"Crown, The S01E01 - Wolferton Splash.en","In seeking his British\nnationalization, His R..."
1,Suits.Episode 1- Denial,You're the most amazing woman\nI have ever met...
2,Crazy4TV.com - Suits.S06E06.720p.BluRay.x265.H...,(HARVEY READING) I've been after Sutter\nfor t...
3,Suits.S02E08.HDTV.x264-EVOLVE,[Car horn blares] You're late. Nope. 30 second...
4,Virgin.River.S01E07.INTERNAL.720p.WEB.x264-STRiFE,Are you sure I can't convince you to stay? No....
...,...,...
273,Matilda(1996),"<i><font color=""#808080"">(MUSIC)</font></i> <i..."
274,Her(2013),Advertise your product or brand here\ncontact ...
275,The_Fundamentals_of_Caring(2016),Caregiving is not just\nabout feeding and clot...
276,The_Intern(2015),"Freud said, ""love and work.\nWork and love. Th..."


Прочитаем файл с данными о категориях фильмов

In [235]:
# Read the labels workbook, lower-case its column names and discard the
# "id" column.
labels_frame = pd.read_excel(df_excel_path)
df_excel = labels_frame.set_axis(labels_frame.columns.str.lower(), axis=1).drop(
    columns=["id"]
)

In [236]:
df_excel

Unnamed: 0,movie,level
0,10_Cloverfield_lane(2016),B1
1,10_things_I_hate_about_you(1999),B1
2,A_knights_tale(2001),B2
3,A_star_is_born(2018),B2
4,Aladdin(1992),A2/A2+
...,...,...
236,Matilda(2022),C1
237,Bullet train,B1
238,Thor: love and thunder,B2
239,Lightyear,B2


Соединим обе таблицы в одну по столбцу movie

In [238]:
# Inner-join subtitles with their labels on the movie name and keep the
# columns in a fixed order.
data = pd.merge(df_subtitles, df_excel, how="inner", on="movie")[
    ["movie", "level", "subs_raw"]
]
data

Unnamed: 0,movie,level,subs_raw
0,Suits.Episode 1- Denial,B2,You're the most amazing woman\nI have ever met...
1,Crazy4TV.com - Suits.S06E06.720p.BluRay.x265.H...,B2,(HARVEY READING) I've been after Sutter\nfor t...
2,Suits.S02E08.HDTV.x264-EVOLVE,B2,[Car horn blares] You're late. Nope. 30 second...
3,Suits.S02E04.HDTV.x264-ASAP,B2,"I want to, uh... taupe. - Is that...?\n- Justi..."
4,Suits.S02E09.HDTV.x264-ASAP,B2,"Donna... Donna. You know, in all the years\nth..."
...,...,...,...
228,Matilda(1996),B1,"<i><font color=""#808080"">(MUSIC)</font></i> <i..."
229,Her(2013),"A2/A2+, B1",Advertise your product or brand here\ncontact ...
230,The_Fundamentals_of_Caring(2016),B1,Caregiving is not just\nabout feeding and clot...
231,The_Intern(2015),B2,"Freud said, ""love and work.\nWork and love. Th..."


Проведем очистку текста от ненужных символов

In [239]:
# Patterns are written as raw strings so that regex escapes such as \( or \s
# are not interpreted as (invalid) Python string escapes.
del_n = re.compile(r"\n")  # line breaks
del_tags = re.compile(r"<[^>]*>")  # HTML tags (markup only; enclosed text stays)
del_brackets = re.compile(
    r"\([^)]*\)|\[[^]]*\]"
)  # round and square brackets together with their contents
del_spaces = re.compile(r"\s{2,}")  # runs of two or more whitespace characters
# NOTE: a stricter filter that drops everything except letters, whitespace
# and apostrophes is deliberately not applied here.


def prepare_text(text):
    """Return a lower-cased, cleaned copy of a raw subtitle string.

    The input is converted with ``str()``, lower-cased, and then processed in
    this order: line breaks become spaces, HTML tags are removed, bracketed
    spans ``(...)`` / ``[...]`` are removed, and every run of two or more
    whitespace characters collapses to one space. Leading or trailing
    single spaces are not stripped.

    Args:
        text: Raw subtitle text (any object; it is passed through ``str``).

    Returns:
        The cleaned string.
    """
    text = del_n.sub(" ", str(text).lower())
    text = del_tags.sub("", text)
    text = del_brackets.sub("", text)

    return del_spaces.sub(" ", text)

In [240]:
# Work on a copy so the merged frame keeps only the raw subtitles, and store
# the cleaned text in a new "subs" column.
data_std = data.copy()
data_std["subs"] = [prepare_text(raw) for raw in data_std["subs_raw"]]

In [241]:
data_std

Unnamed: 0,movie,level,subs_raw,subs
0,Suits.Episode 1- Denial,B2,You're the most amazing woman\nI have ever met...,you're the most amazing woman i have ever met....
1,Crazy4TV.com - Suits.S06E06.720p.BluRay.x265.H...,B2,(HARVEY READING) I've been after Sutter\nfor t...,i've been after sutter for three years now. t...
2,Suits.S02E08.HDTV.x264-EVOLVE,B2,[Car horn blares] You're late. Nope. 30 second...,you're late. nope. 30 seconds early. good. le...
3,Suits.S02E04.HDTV.x264-ASAP,B2,"I want to, uh... taupe. - Is that...?\n- Justi...","i want to, uh... taupe. - is that...? - justic..."
4,Suits.S02E09.HDTV.x264-ASAP,B2,"Donna... Donna. You know, in all the years\nth...","donna... donna. you know, in all the years tha..."
...,...,...,...,...
228,Matilda(1996),B1,"<i><font color=""#808080"">(MUSIC)</font></i> <i...","narrator: everyone is born, but not everyone ..."
229,Her(2013),"A2/A2+, B1",Advertise your product or brand here\ncontact ...,advertise your product or brand here contact w...
230,The_Fundamentals_of_Caring(2016),B1,Caregiving is not just\nabout feeding and clot...,caregiving is not just about feeding and cloth...
231,The_Intern(2015),B2,"Freud said, ""love and work.\nWork and love. Th...","freud said, ""love and work. work and love. tha..."


Посмотрим на распределение категорий

In [242]:
data_std.level.value_counts()

level
B2            97
B1            53
C1            39
A2/A2+        25
B1, B2         8
A2             6
A2/A2+, B1     5
Name: count, dtype: int64

Объединим малочисленные категории с основными

In [243]:
# Fold the rare / combined labels into the main categories, then show the
# resulting class counts.
level_aliases = {
    "A2/A2+": "A2",
    "A2/A2+, B1": "A2",
    "B1, B2": "B1",
}
data_std["level"] = data_std["level"].replace(level_aliases)
data_std["level"].value_counts()

level
B2    97
B1    61
C1    39
A2    36
Name: count, dtype: int64

Попробуем обучить модель с помощью Word2Vec

Разбиваем текст на токены

In [244]:
# Tokens are maximal runs of word characters, so punctuation is dropped and
# contractions split at the apostrophe ("you're" -> "you", "re").
tokenizer = RegexpTokenizer(r"\w+")

data_std["subs_tokens"] = data_std["subs"].map(tokenizer.tokenize)
data_std.head()

Unnamed: 0,movie,level,subs_raw,subs,subs_tokens
0,Suits.Episode 1- Denial,B2,You're the most amazing woman\nI have ever met...,you're the most amazing woman i have ever met....,"[you, re, the, most, amazing, woman, i, have, ..."
1,Crazy4TV.com - Suits.S06E06.720p.BluRay.x265.H...,B2,(HARVEY READING) I've been after Sutter\nfor t...,i've been after sutter for three years now. t...,"[i, ve, been, after, sutter, for, three, years..."
2,Suits.S02E08.HDTV.x264-EVOLVE,B2,[Car horn blares] You're late. Nope. 30 second...,you're late. nope. 30 seconds early. good. le...,"[you, re, late, nope, 30, seconds, early, good..."
3,Suits.S02E04.HDTV.x264-ASAP,B2,"I want to, uh... taupe. - Is that...?\n- Justi...","i want to, uh... taupe. - is that...? - justic...","[i, want, to, uh, taupe, is, that, justice, th..."
4,Suits.S02E09.HDTV.x264-ASAP,B2,"Donna... Donna. You know, in all the years\nth...","donna... donna. you know, in all the years tha...","[donna, donna, you, know, in, all, the, years,..."


Загружаем предобученную модель Word2Vec

In [245]:
# Load Google's pre-trained Word2Vec vectors from the binary word2vec-format
# file at `word2vec_path`.
word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [246]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    """Average the embeddings of all tokens in one document.

    Args:
        tokens_list: Sequence of tokens for a single document.
        vector: Mapping-like object supporting ``token in vector`` and
            ``vector[token]``.
        generate_missing: If true, a token missing from ``vector`` gets a
            fresh ``np.random.rand(k)`` vector; otherwise it gets zeros.
        k: Embedding dimensionality used for the fallback vectors and for
            the all-zero result of an empty document.

    Returns:
        The element-wise mean over every token. Tokens missing from
        ``vector`` still count toward the divisor. An empty document yields
        ``np.zeros(k)``.
    """
    if len(tokens_list) < 1:
        return np.zeros(k)

    # Pick the fallback factory once instead of duplicating the comprehension.
    make_fallback = np.random.rand if generate_missing else np.zeros

    token_vectors = []
    for token in tokens_list:
        if token in vector:
            token_vectors.append(vector[token])
        else:
            token_vectors.append(make_fallback(k))

    return np.sum(token_vectors, axis=0) / len(token_vectors)


def get_word2vec_embeddings(vectors, data_std, generate_missing=False):
    """Build one averaged embedding per row of ``data_std["subs_tokens"]``.

    Args:
        vectors: Word-vector lookup passed to ``get_average_word2vec``.
        data_std: Frame with a ``subs_tokens`` column of token lists.
        generate_missing: Passed on to ``get_average_word2vec``.

    Returns:
        A list with one averaged vector per row, in row order.
    """
    return [
        get_average_word2vec(tokens, vectors, generate_missing=generate_missing)
        for tokens in data_std["subs_tokens"]
    ]

In [247]:


# One averaged Word2Vec vector per movie; missing words use zero vectors
# (generate_missing is left at its default).
embeddings = get_word2vec_embeddings(word2vec, data_std)
# Hold out 20% of the movies for evaluation; the split is seeded with
# RANDOM_STATE.
X_train_word2vec, X_test_word2vec, y_train_word2vec, y_test_word2vec = train_test_split(
    embeddings, data_std["level"], test_size=0.2, random_state=RANDOM_STATE
)

In [248]:
# Fit a default-configured logistic regression on the averaged embeddings
# and predict the level of each held-out movie.
model_w2v = LogisticRegression().fit(X_train_word2vec, y_train_word2vec)
y_predicted_word2vec = model_w2v.predict(X_test_word2vec)

In [249]:
# Share of held-out movies whose predicted level equals the true level.
accuracy = metrics.accuracy_score(y_test_word2vec, y_predicted_word2vec)
accuracy

0.48936170212765956

Теперь попробуем обработать данные с помощью библиотеки SpaCy

---

In [250]:
# Features are the cleaned subtitle strings; targets are the merged levels.
X = data_std["subs"]
ylabels = data_std["level"]

# Same test_size and seed as the Word2Vec experiment's split.
X_train, X_test, y_train, y_test = train_test_split(
    X, ylabels, test_size=0.2, random_state=RANDOM_STATE
)

Удалим стоп-слова и пробелы между строками, проведем лемматизацию данных.

In [251]:
nlp = spacy.load("en_core_web_sm")

stop_words = spacy.lang.en.stop_words.STOP_WORDS


def spacy_tokenizer(sentence):
    """Lemmatise ``sentence`` and drop stop words and punctuation.

    Each token is replaced by its lower-cased, stripped lemma. A token whose
    lemma is the literal ``"-PRON-"`` keeps its lower-cased surface form
    instead. NOTE(review): presumably this is the pronoun placeholder of
    older spaCy releases; confirm against the installed version.

    Returns:
        The list of remaining normalised tokens, in document order.
    """
    kept = []
    for token in nlp(sentence):
        if token.lemma_ == "-PRON-":
            normalized = token.lower_
        else:
            normalized = token.lemma_.lower().strip()

        if normalized in stop_words:
            continue
        # `punctuation` is a plain string, so this is a substring test: it
        # rejects punctuation marks and also empty strings (whitespace-only
        # tokens after strip()).
        if normalized in punctuation:
            continue
        kept.append(normalized)

    return kept

Создадим пайплайн для векторизации данных и модели LogisticRegression

In [252]:
# Step 1: TF-IDF features built from the custom spaCy tokenizer. Setting
# token_pattern=None means the vectorizer's own token regex is not used.
tfidf_step = (
    "tfidf_vectorizer",
    TfidfVectorizer(tokenizer=spacy_tokenizer, token_pattern=None),
)
# Step 2: logistic regression with default settings.
classifier_step = ("classifier", LogisticRegression())

pipelines = [tfidf_step, classifier_step]

In [253]:
# Assemble the pipeline from the prepared steps, fit it on the training
# texts and predict levels for the held-out texts.
pipe = Pipeline(steps=pipelines).fit(X_train, y_train)
predicted = pipe.predict(X_test)

In [254]:
# Report the accuracy of the TF-IDF + logistic regression pipeline on the test split.
print("Logistic Regression Accuracy:", metrics.accuracy_score(y_test, predicted))

Logistic Regression Accuracy: 0.6808510638297872


В проекте рассматривались два подхода к обработке данных: с помощью библиотеки SpaCy (лемматизация и TF-IDF) и с помощью готовой модели Word2Vec из библиотеки Gensim. Наилучший показатель дал первый подход (SpaCy): с его помощью удалось добиться значения метрики accuracy 0.68.