In [1]:
import pandas as pd
import codecs
import os
import re

try:
    from pypdf import PdfReader
except:
    !pip install pypdf
    from pypdf import PdfReader

try:
    import spacy
except:
    !pip install spacy
    import spacy


In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
SCORES_PATH    = 'English_level/English_scores'
SUBTITLES_PATH = 'English_level/English_scores/Subtitles_all'
OXFORD_PATH    = 'English_level/Oxford_CEFR_level'

In [4]:
ENGLISH_LEVELS = ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']

# LOAD DATA

## The Oxford core by CEFR level

In [5]:
# load all pdfs and split to lines
content = []

for dirname, _, filenames in os.walk(OXFORD_PATH):
    for filename in filenames:
        print(filename)
        reader = PdfReader(f'{OXFORD_PATH}/{filename}')
        for page in reader.pages:
            content += page.extract_text().splitlines()

# parse pdf content to words by levels
oxford_levels = {key:[] for key in ENGLISH_LEVELS}
oxford_levels['A1'] = ['a', 'an']
current_level = ''

for line in content:
    line = line.strip()
    
    if 'Oxford' in line or 'English' in line:
        pass
    elif line in oxford_levels:
        current_level = line
    elif ' ' in line:
        line = re.sub(r'\d|,', '', line.lower())
        oxford_levels[current_level] += [line.split()[0]]
    else:
        pass

# sorted unique words
print('\nOxford dictionary (English + American)')
for level in oxford_levels:
    oxford_levels[level] = list(sorted(set(oxford_levels[level])))
    print(f'\t{level} words count {len(oxford_levels[level]):>5}')  

# convert dictionary to dataframe
df_oxford_levels = pd.DataFrame(columns=['word', 'level'])
for level in oxford_levels:
    df_oxford_levels = pd.concat([df_oxford_levels,
                                  pd.DataFrame({'word' : oxford_levels[level],
                                                'level': level})
                                 ])
df_oxford_levels

American_Oxford_3000_by_CEFR_level.pdf
The_Oxford_3000_by_CEFR_level.pdf
The_Oxford_5000_by_CEFR_level.pdf
American_Oxford_5000_by_CEFR_level.pdf

Oxford dictionary (English + American)
	A1 words count   917
	A2 words count   898
	B1 words count   827
	B2 words count  1482
	C1 words count  1198
	C2 words count     0


Unnamed: 0,word,level
0,a,A1
1,about,A1
2,above,A1
3,across,A1
4,action,A1
...,...,...
1193,worthwhile,C1
1194,worthy,C1
1195,yell,C1
1196,yield,C1


## Subtitles

In [6]:
# parse and clean subtitles
def clean_text(content):
    text = []
    for line in content.splitlines():                        
        if re.search(r'[A-Za-z]',line):                      # get non empty lines with words
            line = line.lower()                              # transform to lowercase
            line = re.sub(r'(\(.+\))|(\[.+\])', ' ', line)   # remove actions in parenthesis
            line = re.sub(r'([\w#\s]+\:)', ' ', line)        # remove speaker in dialogs
            line = re.sub(r'[^a-z]', ' ', line)              # remove non-word symbols
            line = re.sub(r'\b\w{1,2}\b', '', line)          # remove single and two symbol sequences
            line = re.sub(r'\s\s+', ' ', line).strip()       # remove extra spaces longer than single
            if len(line)>0:
                text.append(line)
    return ' '.join(text)


# process file
def process_srt(dirname, filename):
    if not filename.endswith('.srt'):                        # skip non srt files
        print('посторонний файл', filename)
        return False
    fullpath = os.path.join(dirname,filename)
    with codecs.open(fullpath, 'r',                          # try read file
                     encoding='utf-8', 
                     errors='ignore') as file:
        try:
            content = file.read()
        except:
            print('не прочиталось', fullpath)
            return False
    return clean_text(content)                               # clean text and return


movies = pd.DataFrame(columns=['movie', 'filename', 'content', 'level', 'status'])


# recursive walk through dirs
for dirname, _, filenames in os.walk(SUBTITLES_PATH):
    for filename in filenames:
        level = dirname.split('/')[-1]                       # get level from dir name
        subs = process_srt(dirname, filename)                # process file
        if subs:                                             # add movie to dataframe
            movies.loc[len(movies)] = \
                {'filename': filename.replace('.srt', ''),
                 'content' : subs,
                 'level'   : level
                }
            
            
movies

посторонний файл .DS_Store
посторонний файл .DS_Store
посторонний файл .DS_Store


Unnamed: 0,movie,filename,content,level,status
0,,"Crown, The S01E01 - Wolferton Splash.en",seeking his british nationalization his royal ...,B2,
1,,Suits.Episode 1- Denial,you the most amazing woman have ever met are y...,B2,
2,,Crazy4TV.com - Suits.S06E06.720p.BluRay.x265.H...,been after sutter for three years now this guy...,B2,
3,,Suits.S02E08.HDTV.x264-EVOLVE,you late nope seconds early good learning tell...,B2,
4,,Virgin.River.S01E07.INTERNAL.720p.WEB.x264-STRiFE,are you sure can convince you stay you were ri...,B2,
...,...,...,...,...,...
257,,Matilda(1996),font color font font color font everyone born ...,Subtitles,
258,,Her(2013),advertise your product brand here contact www ...,Subtitles,
259,,The_Fundamentals_of_Caring(2016),caregiving not just about feeding and clothing...,Subtitles,
260,,The_Intern(2015),freud said love and work work and love that al...,Subtitles,


In [7]:
movie_labels = pd.read_excel(f'{SCORES_PATH}/movies_labels.xlsx')
movie_labels.info()
print('\nДубликаты полные', movie_labels.duplicated().sum())
print('Дубликаты в названиях фильмов', movie_labels.Movie.duplicated().sum())
movie_labels


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241 entries, 0 to 240
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      241 non-null    int64 
 1   Movie   241 non-null    object
 2   Level   241 non-null    object
dtypes: int64(1), object(2)
memory usage: 5.8+ KB

Дубликаты полные 0
Дубликаты в названиях фильмов 4


Unnamed: 0,id,Movie,Level
0,0,10_Cloverfield_lane(2016),B1
1,1,10_things_I_hate_about_you(1999),B1
2,2,A_knights_tale(2001),B2
3,3,A_star_is_born(2018),B2
4,4,Aladdin(1992),A2/A2+
...,...,...,...
236,236,Matilda(2022),C1
237,237,Bullet train,B1
238,238,Thor: love and thunder,B2
239,239,Lightyear,B2


In [8]:
# correct some mistakes in movies names
movie_labels.Movie = movie_labels.Movie.str.replace('.srt', '', regex=False)
movie_labels.loc[movie_labels.Movie == 'Up (2009)', 'Movie'] = 'Up(2009)'
movie_labels.loc[movie_labels.Movie == 'The Grinch', 'Movie'] = 'The.Grinch'

# удалим из level лишние символы, разделители оставим пробел
# эти level будем использовать в качестве ключей для словаря с фильмами по уровнямя
movie_labels.Level = movie_labels.Level \
                                 .str.replace(',', '', regex=False) \
                                 .str.replace('+', '', regex=False) \
                                 .str.replace('/', ' ', regex=False)

# разложим фильмы по уровням в словарик 
# фильмы с несколькими уровнями будут дублироваться в разных списках
film_levels = {key:[] for key in ENGLISH_LEVELS}

for level in ENGLISH_LEVELS:
    movie_list = list(movie_labels.loc[movie_labels.Level.str.contains(level), 'Movie'].values)
    film_levels[level] += movie_list

for level in film_levels:
    # на случай возможных неучтенных дубликатор преобразуем лист в сет, там только уники
    film_levels[level] = set(film_levels[level])
    print(f'Категория {level:8} {len(film_levels[level])} фильмов')
    

Категория A1       0 фильмов
Категория A2       37 фильмов
Категория B1       65 фильмов
Категория B2       109 фильмов
Категория C1       40 фильмов
Категория C2       0 фильмов


In [9]:
for key, value in film_levels.items():
    
    for movie in value:
        words = movie.split()
        
        n = movies.loc[movies.filename.str.contains(movie, regex=False)].shape[0]
        
        if n == 0:
            print('не найден текст', key, movie)
            
        elif n == 1:
            selected_movie_level = movies.loc[
                movies.filename.str.contains(movie, regex=False), 'level'].values[0]
            
            if selected_movie_level == 'Subtitles':    # replace Subtitles with excel level
                 movies.loc[
                     movies.filename.str.contains(movie, regex=False), 'level'] = key
                
            elif selected_movie_level != key:          # replace with min current level or excel
                 movies.loc[
                     movies.filename.str.contains(movie, regex=False), 'level'
                 ] = min(selected_movie_level, key)

        else:
            print('не единственный текст', key, movie)
            
movies

не найден текст B1 Bullet train
не найден текст B2 Thor: love and thunder
не найден текст B2 The Secret Life of Pets.en
не найден текст B2 Lightyear
не найден текст B2 Glass Onion
не найден текст C1 Suits S04E10 EngSub
не найден текст C1 Suits S04E09 EngSub
не найден текст C1 Suits S04E15 EngSub
не найден текст C1 Suits S04E08 EngSub
не найден текст C1 Suits S04E13 EngSub
не найден текст C1 Suits S04E12 EngSub
не найден текст C1 Suits S04E14 EngSub
не найден текст C1 Suits S04E03 EngSub
не найден текст C1 Suits S04E05 EngSub
не найден текст C1 Suits S04E01 EngSub
не найден текст C1 Suits S04E16 EngSub
не найден текст C1 Suits S04E11 EngSub
не найден текст C1 Suits S04E02 EngSub
не найден текст C1 Suits S04E07 EngSub
не найден текст C1 Suits S04E04 EngSub
не найден текст C1 Matilda(2022)
не найден текст C1 Suits S04E06 EngSub


Unnamed: 0,movie,filename,content,level,status
0,,"Crown, The S01E01 - Wolferton Splash.en",seeking his british nationalization his royal ...,B2,
1,,Suits.Episode 1- Denial,you the most amazing woman have ever met are y...,B2,
2,,Crazy4TV.com - Suits.S06E06.720p.BluRay.x265.H...,been after sutter for three years now this guy...,B2,
3,,Suits.S02E08.HDTV.x264-EVOLVE,you late nope seconds early good learning tell...,B2,
4,,Virgin.River.S01E07.INTERNAL.720p.WEB.x264-STRiFE,are you sure can convince you stay you were ri...,B2,
...,...,...,...,...,...
257,,Matilda(1996),font color font font color font everyone born ...,B1,
258,,Her(2013),advertise your product brand here contact www ...,A2,
259,,The_Fundamentals_of_Caring(2016),caregiving not just about feeding and clothing...,B1,
260,,The_Intern(2015),freud said love and work work and love that al...,B2,


In [10]:
# stopwords = nlp.Defaults.stop_words


In [11]:
# stopwords

In [12]:
def lemmatize(text):
    lemmatized = []
    doc = nlp(text)
    for token in doc:
        if not token.is_stop and token.pos_ != 'PROPN':
            lemmatized.append(token.lemma_)
    return ' '.join(lemmatized)


In [13]:
%%time
movies['lemmas'] = movies.content.apply(lemmatize)

CPU times: user 6min 19s, sys: 9.5 s, total: 6min 28s
Wall time: 6min 29s


In [17]:
movies['oxford_intersection']

Unnamed: 0,movie,filename,content,level,status,lemmas
161,,Gogo_Loves_English,wow hello hello name tony what your name name ...,Subtitles,,wow hello hello tony tony hello hello jenny he...
171,,Westworld_scenes_of_Dr_Robert_Ford,one complained there the lady with the white s...,Subtitles,,complain lady white shoe money life stop plan ...
180,,Breaking_Bad_The_Movie(2017),what good meet you all nice meet you sam worki...,Subtitles,,good meet nice meet work talk giant space lase...
209,,The_Ghost_Writer,hmm this angela please leave message the beep ...,Subtitles,,hmm leave message beep rise shine sweetheart k...
215,,Harry_Potter_and_the_philosophers_stone(2001),should known that you would here professor mcg...,Subtitles,,known good evening professor dumbledore rumor ...
227,,Casper,okay one picture history this what afraid you ...,Subtitles,,okay picture history afraid tell thing picture...
233,,Pride_and_Prejudice,lydia kitty dear bennett have you heard never ...,Subtitles,,hear field park let want know take wish tell d...
240,,BrenВ.Brown.The.Call.to.Courage.2019.720.NF.72...,she spent years studying courage vulnerability...,Subtitles,,spend year study courage vulnerability shame e...


In [14]:
# movies.iloc[:1].content.values

In [15]:
# movies.iloc[:1].content.apply(lemmatize).values

In [29]:
df_oxford_levels[df_oxford_levels.level <= 'A2']

Unnamed: 0,word,level
0,a,A1
1,about,A1
2,above,A1
3,across,A1
4,action,A1
...,...,...
893,worst,A2
894,wow,A2
895,yet,A2
896,yours,A2


In [30]:
df_oxford_levels.word.nunique()

4885

In [31]:
df_oxford_levels.shape

(5322, 2)

In [39]:
df_oxford_levels[df_oxford_levels.word.duplicated(keep=False)].sort_values(by='word')

Unnamed: 0,word,level
2,abroad,A2
1,abroad,B2
1,academic,B1
5,academic,B2
3,accommodation,B1
...,...,...
907,yard,A1
895,yet,A2
1480,yet,B2
825,young,B1
