In [None]:
!pip install pysrt



In [None]:
import re;
import pandas as pd;
import numpy as np;
import pysrt;
import nltk;
from pathlib import Path as path;
from sklearn.feature_extraction.text import CountVectorizer;
from sklearn.feature_extraction.text import TfidfTransformer;
from nltk.stem.porter import PorterStemmer;
from nltk.corpus import stopwords;
from sklearn.feature_extraction.text import TfidfVectorizer;
from sklearn.pipeline import Pipeline;
from sklearn.linear_model import LogisticRegression;
from sklearn.model_selection import GridSearchCV;
from sklearn.model_selection import train_test_split;

# Download the NLTK stop-word corpus that stopwords.words('english') reads.
nltk.download('stopwords')

### 1. Готовим данные.

##### Проверяем данные.

In [None]:
# Read the labelled movie spreadsheet and keep every column except the first one.
labels_raw = pd.read_excel('./datasets/movies_labels.xlsx')
language_complexity_df = labels_raw.iloc[:, 1:]

# Preview the first ten rows.
display(language_complexity_df.head(10))

##### Создаем таблицу с субтитрами и названиями фильмов.

In [None]:
# Characters stripped from subtitle text: digits and the punctuation listed below.
# Written as a raw string so the backslashes reach `re` untouched; the previous
# plain string relied on invalid escape sequences (\#, \., ...), which newer
# Python versions warn about. The matched character set is unchanged.
regex = re.compile(r'[0-9$#\[\],.*<>;!?-]')

In [None]:
folder = path('./datasets/subtitles')
subtitle_name_map = []

# Only real .srt files are parsed. iterdir() also yields directories and stray
# files, and rebuilding the path as stem + '.srt' either failed on them or
# loaded the same subtitle twice when a sibling file shared the stem.
# Sorting keeps the row order stable between runs.
for file in sorted(folder.iterdir()):
    if not file.is_file() or file.suffix.lower() != '.srt':
        continue
    movie_name = file.stem
    # Open the directory entry itself instead of re-assembling its path.
    raw_text = pysrt.open(str(file), encoding='iso-8859-1').text
    # Lower-case first, then strip the characters matched by the shared `regex`.
    subtitle = regex.sub('', raw_text.lower())
    subtitle_name_map.append([movie_name, subtitle])

# One row per subtitle file: the file stem as the movie name plus the cleaned text.
subtitles_df = pd.DataFrame(data=subtitle_name_map, columns=['Movie', 'Subtitles'])

##### Объединяем таблицы.

In [None]:
# Join labels and subtitles on the movie name (default inner join), then drop
# the join key from the result.
df = (
    language_complexity_df
    .merge(subtitles_df, on="Movie")
    .drop('Movie', axis=1)
)

In [None]:
# Preview the first ten rows of the merged frame.
display(df.head(10))

In [None]:
# решить проблему с A2/A2+ и тп.

### 2. Обучаем модель.

In [None]:
# Tokenizers and stop words reused from the NLP seminar.
def tokenizer(text):
    """Return the whitespace-separated tokens of ``text`` as a list."""
    tokens = text.split()
    return tokens

# Single stemmer instance shared by every call. `porter` was referenced by
# tokenizer_porter but never created, so the function raised NameError.
porter = PorterStemmer()


def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each token."""
    return [porter.stem(word) for word in text.split()]


# English stop-word list from NLTK, offered to the vectorizer as one grid option.
stop = stopwords.words('english')

In [None]:
# The model input is the subtitle text and the label to predict is the level.
# These two were previously swapped: the "target" held the text and the
# "features" held the level.
df_features = df['Subtitles']
df_target = df['Level']

In [None]:
# Hold out 25% of the rows for validation using a shuffled, seeded split.
# The previous label-based slices df.loc[:174] and df.loc[174:] are both
# inclusive, so row 174 ended up in the training AND the validation set, and
# the split depended on the row order of the spreadsheet.
features_train, features_valid, target_train, target_valid = train_test_split(
    df['Subtitles'].values,
    df['Level'].values,
    train_size=0.75,
    test_size=0.25,
    random_state=0)

In [None]:
# Vectorizer with accent stripping, lower-casing and preprocessing switched off.
tfidf = TfidfVectorizer(
    strip_accents=None,
    lowercase=False,
    preprocessor=None,
)

# Settings common to both halves of the search grid.
shared_grid = {
    'vect__ngram_range': [(1, 1)],
    'vect__stop_words': [stop, None],
    'vect__tokenizer': [tokenizer, tokenizer_porter],
    'clf__penalty': ['l2'],
    'clf__C': [1.0, 10.0, 100.0],
}

# First grid: vectorizer defaults. Second grid: the same options with IDF
# weighting and normalisation turned off.
param_grid = [
    dict(shared_grid),
    {**shared_grid, 'vect__use_idf': [False], 'vect__norm': [None]},
]

# Vectorizer followed by a logistic-regression classifier.
lr_tfidf = Pipeline(steps=[
    ('vect', tfidf),
    ('clf', LogisticRegression(random_state=0)),
])

# 5-fold grid search over both grids, scored by accuracy, on all available cores.
gs_lr_tfidf = GridSearchCV(
    estimator=lr_tfidf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)

In [None]:
# Print the number of training texts and labels (one per line), then run the search.
print(len(features_train), len(target_train), sep='\n')
gs_lr_tfidf.fit(features_train, target_train)

In [None]:
# Best cross-validated score found by the grid search.
best_cv_accuracy = gs_lr_tfidf.best_score_
print('CV Accuracy: %.3f' % best_cv_accuracy)

In [None]:
# Score the best pipeline from the search on the validation arrays.
clf = gs_lr_tfidf.best_estimator_
valid_accuracy = clf.score(features_valid, target_valid)
print('Test Accuracy: %.3f' % valid_accuracy)