### Подготовка текста субтитров

In [None]:
import pysrt
import pandas as pd
import re
import seaborn as sns

In [None]:
# Load the list of subtitle files, remove unwanted rows and duplicates,
# and cast the columns to their working dtypes.

csv_columns = ['imdb_id','moviename', 'data','file']
df_loaded = pd.read_csv('fil_sub_download_temp_file.csv', sep=';', names=csv_columns)

# Rows whose movie name is the literal 'ERROR' are discarded.
error_rows = df_loaded[df_loaded.moviename == 'ERROR'].index

column_types = {
    'imdb_id': 'int32',
    'moviename': 'string',
    'data': 'object',
    'file': 'object',
    'subs_text': 'string',
}
df_loaded = (
    df_loaded
    .drop(error_rows)
    .drop_duplicates(subset='imdb_id')
    .assign(subs_text='')  # empty placeholder, appended as the fifth column
    .astype(column_types)
)

# Keep only the columns used later and renumber the rows from zero.
df_loaded = df_loaded[['imdb_id','subs_text','file']].reset_index(drop=True)
df_loaded

In [None]:
# Drop the known-broken subtitle file.
# regex=False makes the file name a literal substring match; without it the
# '.' in '1038685.srt' is a regex wildcard and could match unrelated names.
broken_rows = df_loaded[df_loaded.file.str.contains('1038685.srt', regex=False)].index
df_loaded = df_loaded.drop(broken_rows)
df_loaded

In [None]:
# Ordered (pattern, replacement) rules for expanding English contractions.
# Order matters: specific forms ("won't", "can't") must run before the
# generic "n't" rule, and "n't" must run before the dropped-g rule.
_CONTRACTION_RULES = (
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"it's", "it is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"there's", "there is"),
    (r"how's", "how is"),
    (r"who's", "who is"),
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'em", " them"),
    (r"\'re", " are"),
    (r"\'d", " would"),
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"n't", " not"),
    # Dropped final "g" ("runnin'" -> "running"). The lookahead keeps the
    # rule away from possessives such as "man's" or "john's".
    (r"n'(?!\w)", "ng"),
    (r"'bout", "about"),
    (r"'til", "until"),
)


def clean_text(text):
    """Normalize raw subtitle text.

    The text is lower-cased, common English contractions are expanded,
    ``<i>``/``</i>`` tags and newlines are turned into spaces, runs of two
    or three spaces are shortened, and punctuation characters are removed.

    Args:
        text: Raw subtitle text.

    Returns:
        The cleaned text as a single lower-case line.
    """
    text = text.lower()

    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)

    # Italic markup becomes a space so adjacent words do not get glued together.
    text = re.sub(r"<i>", " ", text)
    text = re.sub(r"</i>", " ", text)
    text = text.replace('\n',' ')
    text = text.replace('   ',' ')
    text = text.replace('  ',' ')

    # Apostrophes are intentionally not in this class, so possessives survive.
    text = re.sub(r"[-()\"#/@;§:<>{}`+=~|.!?,]", "", text)

    return text

In [None]:
def load_subs(path):
    """Read one subtitle file and return its cleaned text.

    The first line decides how the file is parsed:

    * if it contains a ``{digits}`` tag, or otherwise a ``[digits]`` tag,
      the file is handled as plain lines and every such tag is stripped;
    * otherwise the file is parsed with ``pysrt`` and the path is printed.

    In every case the first and the last entry are discarded (presumably
    service/advertising entries -- confirm against the data) and the
    remaining text is passed through ``clean_text``.

    Args:
        path: Path to the subtitle file.

    Returns:
        The cleaned subtitle text. Files that are empty, or contain only one
        entry, produce the cleaned empty string instead of raising IndexError.
    """
    # NOTE(review): the file is decoded with the platform default encoding;
    # subtitle files in another encoding may fail here -- confirm.
    with open(path, 'r') as sf:
        lines = sf.readlines()

    # The handle is closed at this point, so pysrt can reopen the path freely.
    if not lines:
        return clean_text('')

    first_line = lines[0]
    if re.search(r'{\d+}', first_line) is not None:
        frame_tag = r'{\d+}'
    elif re.search(r'\[\d+\]', first_line) is not None:
        frame_tag = r'\[\d+\]'
    else:
        frame_tag = None

    if frame_tag is not None:
        # Slicing drops the first and last line without failing on short files.
        body = ''.join(lines[1:-1])
        subs_text = re.sub(frame_tag, '', body)
    else:
        subs = pysrt.open(path)
        print(path)
        # Guarded deletes: a file with fewer than two entries ends up empty.
        if len(subs) > 0:
            del subs[0]
        if len(subs) > 0:
            del subs[-1]
        subs_text = subs.text

    return clean_text(subs_text)

In [None]:
# Load the subtitle text from each file into the dataframe.

# NOTE(security): eval() executes whatever expression is stored in the 'file'
# column of the CSV. This is only acceptable while that CSV is produced by our
# own tooling; consider ast.literal_eval once the stored format is confirmed
# to be a plain dict literal.
# Rows are addressed by index label (unique after reset_index/drop above),
# which avoids rebuilding a boolean mask over the whole frame for every row.
for row_label, sub in df_loaded.file.items():
    # The cell is expected to evaluate to a mapping with exactly one value.
    path, = eval(sub).values()
    df_loaded.at[row_label, 'subs_text'] = load_subs(path)

df_subs = df_loaded[['imdb_id','subs_text']].copy(deep=True)

# Ad-hoc check for leftover service text:
# df_subs[df_subs['subs_text'].str.contains('OpenSubtitles')]

In [None]:
# Count the words in each subtitle text and keep only the rows
# with at most 15000 words.

word_counts = df_subs['subs_text'].apply(lambda text: len(text.split()))
df_subs['subs_len'] = word_counts

within_limit = df_subs['subs_len'] <= 15000
df_subs = df_subs[within_limit].set_index('imdb_id')

# Displayed sorted by length; the sorted frame itself is not stored.
df_subs.sort_values(by='subs_len')

In [None]:
# Histogram of the per-title word counts that remain after filtering.
sns.histplot(data=df_subs.subs_len)

In [None]:
# Keyword table, indexed by imdb_id so it can be joined with the subtitles.
df_kw = pd.read_csv(
    'df_keylist_fifteen.csv',
    index_col='imdb_id',
    sep=';',
)
df_kw

In [None]:
# Attach the subtitle text to the keyword table by imdb_id index.
# Left join: every keyword row is kept, with a missing subs_text where no
# subtitles were loaded for that title.
subs_text_column = df_subs['subs_text']
df = df_kw.join(subs_text_column, how='left')

In [None]:
# Display the joined keyword/subtitle table in the notebook output.
df

In [None]:
# Save the joined keyword/subtitle table as a ';'-separated CSV file.
df.to_csv(path_or_buf='df_kw_subs.csv', sep=';')