In [1]:
!pip install pysrt



In [2]:
import re;
import pandas as pd;
import numpy as np;
import pysrt;
import nltk;
from pathlib import Path as path;
from sklearn.feature_extraction.text import CountVectorizer;
from sklearn.feature_extraction.text import TfidfTransformer;
from nltk.stem.porter import PorterStemmer;
from nltk.corpus import stopwords;
from sklearn.feature_extraction.text import TfidfVectorizer;
from sklearn.pipeline import Pipeline;
from sklearn.linear_model import LogisticRegression;
from sklearn.model_selection import GridSearchCV;
from sklearn.model_selection import train_test_split;

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### 1. Готовим данные.

##### Проверяем данные.

In [3]:
language_complexity_df = pd.read_excel('./datasets/movies_labels.xlsx').iloc[:, 1:]
display(language_complexity_df.head(10))

Unnamed: 0,Movie,Level
0,10_Cloverfield_lane(2016),B1
1,10_things_I_hate_about_you(1999),B1
2,A_knights_tale(2001),B2
3,A_star_is_born(2018),B2
4,Aladdin(1992),A2/A2+
5,All_dogs_go_to_heaven(1989),A2/A2+
6,An_American_tail(1986),A2/A2+
7,Babe(1995),A2/A2+
8,Back_to_the_future(1985),A2/A2+
9,Banking_On_Bitcoin(2016),C1


##### Создаем таблицу с субтитрами и названиями фильмов.

In [4]:
regex = re.compile('[0-9$\#\[\],\.*<>;!?-]')

In [5]:
folder = path('./datasets/subtitles')
subtitle_name_map = []

for file in folder.iterdir():
    movie_name = file.stem;
    subtitle = regex.sub('', 
                         pysrt.open('./datasets/subtitles/'+ file.stem + '.srt', encoding='iso-8859-1').text.lower())
    subtitle_name_map.append([movie_name, subtitle])
    
subtitles_df = pd.DataFrame(data=subtitle_name_map, columns=['Movie', 'Subtitles'])

##### Объединяем таблицы.

In [6]:
df = pd.merge(language_complexity_df, subtitles_df, on="Movie")
df = df.drop('Movie', axis=1)

In [7]:
display(df.head(10))

Unnamed: 0,Level,Subtitles
0,B1,"font color=""ffff""bfixed & synced by bozxphd en..."
1,B1,hey\ni'll be right with you\nso cameron here y...
2,B2,resync: xenzainef\nretail\nshould we help him\...
3,B2,"ifont color=""ffffff"" synced and corrected by/..."
4,A2/A2+,ioh i come from a land\nfrom a faraway place/i...
5,A2/A2+,captioning made possible by\nmgm home entertai...
6,A2/A2+,(indistinct conversation)\n(all laughing)\nmam...
7,A2/A2+,this is a tale aboutbr/an unprejudiced heart\n...
8,A2/A2+,october is inventory time\nso right now statle...
9,C1,downloaded from\nytsmx\nofficial yify movies s...


In [8]:
# решить проблему с A2/A2+ и тп.

### 2. Обучаем модель.

In [9]:
# Используем функции и стоп слова из nlp семинара.
def tokenizer(text):
    return text.split()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

stop = stopwords.words('english')

In [10]:
df_features = df.drop('Subtitles', axis=1)
df_target = df['Subtitles']

In [11]:
# features_train, features_valid, target_train, target_valid = train_test_split(
#     df_features,
#     df_target,
#     train_size=0.75,
#     test_size=0.25)
features_train = df.loc[:174, 'Subtitles'].values
target_train = df.loc[:174, 'Level'].values
features_valid = df.loc[174:, 'Subtitles'].values
target_valid = df.loc[174:, 'Level'].values

In [12]:
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]

lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)

In [None]:
gs_lr_tfidf.fit(features_train, target_train)

175
175
Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [None]:
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)

In [None]:
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(features_valid, target_valid))