In [1]:
# !pip install seaborn

In [2]:
import ast
import json
import operator

import pandas as pd
import seaborn as sns

### Подготовка списка фильмов с ключевыми словами

##### Подготовка датафрейма с метаданными фильмов (без ключевых слов)

In [3]:
# Load the raw movie metadata, drop empty/bogus records and duplicates, then cast the column types.

df_meta = pd.read_csv('./moviesdataset/movies_metadata.csv', low_memory=False)[['title', 'id', 'imdb_id']]

# Rows without an IMDb id go first; then each identifier column is de-duplicated in turn.
df_meta = (
    df_meta
    .dropna(subset=['imdb_id'])
    .drop_duplicates(subset=['imdb_id'])
    .drop_duplicates(subset=['id'])
)

# Rows whose `id` contains a dash are discarded (presumably malformed records - verify against the CSV).
dashed_id_rows = df_meta.loc[df_meta['id'].str.contains('-')].index
df_meta = df_meta.drop(index=dashed_id_rows)

# Strip the 'tt' prefix so that the IMDb id can be cast to an integer below.
df_meta['imdb_id'] = df_meta['imdb_id'].apply(lambda imdb: imdb.replace('tt', ''))

df_meta = (
    df_meta
    .astype({'title': 'string', 'id': 'int32', 'imdb_id': 'int32'})
    .set_index('id')
)

# Only the last expression of a cell is rendered, so `dtypes` is evaluated but not shown.
df_meta.dtypes
df_meta.shape

(45416, 2)

##### Подготовка датафрейма с ключевыми словами

In [4]:
# Load the raw keywords table, cast the column types, de-duplicate by movie id and index by it.

df_kw = (
    pd.read_csv('./moviesdataset/keywords.csv')
    .astype({'id': 'int32', 'keywords': 'string'})
    .drop_duplicates(subset='id')
    .set_index('id')
)
df_kw

Unnamed: 0_level_0,keywords
id,Unnamed: 1_level_1
862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."
...,...
439050,"[{'id': 10703, 'name': 'tragic love'}]"
111109,"[{'id': 2679, 'name': 'artist'}, {'id': 14531,..."
67758,[]
227506,[]


In [5]:
# kwlistofdicts = eval(df_kw.keywords.iloc[0])
# kwlistofdicts

In [6]:
# Turn the serialised keyword dicts into plain keyword lists, which are easier to work with.

def get_kw_list(kwlistofdicts):
    """Parse a serialised keyword list and return the normalised keyword names.

    Parameters
    ----------
    kwlistofdicts : str
        Python-literal text of a list of dicts, each having a 'name' key,
        e.g. "[{'id': 931, 'name': 'jealousy'}]".

    Returns
    -------
    list of str
        One entry per keyword, reduced to its lowercase ASCII letters a-z
        (spaces, digits, punctuation and every other character are removed).
        Keywords that end up empty are skipped.

    Raises
    ------
    ValueError, SyntaxError
        If the text is not a valid Python literal.
    """
    allowed_chars = frozenset('abcdefghijklmnopqrstuvwxyz')
    kw_list = []

    # literal_eval accepts literals only, so text read from the CSV can never
    # execute code (the previous `eval` call would run any expression).
    for kwdict in ast.literal_eval(kwlistofdicts):
        kw = ''.join(char for char in kwdict['name'] if char in allowed_chars)
        if kw:
            kw_list.append(kw)

    return kw_list

# Attach the parsed keyword list and the number of keywords to every movie.
df_kw['keylist'] = df_kw['keywords'].apply(get_kw_list)
df_kw['keywords_len'] = df_kw['keylist'].apply(len)

In [7]:
# Drop the movies that have no keywords at all.

df_kw = df_kw.loc[df_kw['keylist'].apply(len).gt(0)]

df_kw

Unnamed: 0_level_0,keywords,keylist,keywords_len
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[jealousy, toy, boy, friendship, friends, riva...",9
8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1...","[boardgame, disappearance, basedonchildrensboo...",6
15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[fishing, bestfriend, duringcreditsstinger, ol...",4
31357,"[{'id': 818, 'name': 'based on novel'}, {'id':...","[basedonnovel, interracialrelationship, single...",5
11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...","[baby, midlifecrisis, confidence, aging, daugh...",9
...,...,...,...
84419,"[{'id': 9748, 'name': 'revenge'}, {'id': 9826,...","[revenge, murder, serialkiller, newyorkcity, s...",7
390959,"[{'id': 224180, 'name': 'blair witch'}]",[blairwitch],1
289923,"[{'id': 616, 'name': 'witch'}, {'id': 2035, 'n...","[witch, mythology, legend, serialkiller, mocku...",5
439050,"[{'id': 10703, 'name': 'tragic love'}]",[tragiclove],1


In [8]:
# Build the keyword frequency table.
def get_kw_counts(column):
    """Count how often each keyword occurs across all rows of `column`.

    Also prints one summary line with the number (and share) of keywords
    that occur exactly once.

    Parameters
    ----------
    column : pandas.Series
        Each element is an iterable of keywords (anything exposing
        `.tolist()` that yields such iterables works).

    Returns
    -------
    pandas.DataFrame
        Columns 'kw' and 'count', indexed by keyword, sorted by descending
        count and then alphabetically by keyword. Empty when `column` holds
        no keywords.
    """
    all_keywords = [kw for keylist in column.tolist() for kw in keylist]
    kw_count = pd.Series(all_keywords).value_counts()

    # value_counts() already yields one row per distinct keyword, so no
    # de-duplication is needed. The keyword stays in the index as well as in
    # the 'kw' column.
    counts = pd.DataFrame({'kw': kw_count.index, 'count': kw_count})
    counts = counts.sort_values(by=['count', 'kw'], ascending=[False, True])

    kw_quantity = len(counts)
    will_deleted = int((counts['count'] == 1).sum())
    # Guard the percentage: an empty column used to raise ZeroDivisionError here.
    share = round(will_deleted / kw_quantity * 100, 3) if kw_quantity else 0.0

    print(f'Ключевых слов, встречающихся однажды: {will_deleted} ({share}%)')

    return counts

# Frequency table for the parsed keyword lists.
counts = get_kw_counts(column=df_kw['keylist'])
counts

Ключевых слов, встречающихся однажды: 8556 (43.326%)


Unnamed: 0,kw,count
womandirector,womandirector,3039
independentfilm,independentfilm,1914
murder,murder,1285
basedonnovel,basedonnovel,822
musical,musical,726
...,...,...
zombieanimals,zombieanimals,1
zone,zone,1
zorro,zorro,1
zumaia,zumaia,1


In [9]:
def print_kw_stat(column):
    """Print the first- and last-ranked keywords and the vocabulary size.

    `column` is forwarded unchanged to `get_kw_counts`, whose own summary
    line is printed first. The "most popular" and "rarest" keywords are the
    first and last rows of the table it returns. Returns None.
    """
    counts = get_kw_counts(column)
    top_row, bottom_row = counts.iloc[0], counts.iloc[-1]

    print(f'Самое популярное слово: {top_row["kw"]}, {top_row["count"]} раз')
    print(f'Самое редкое слово: {bottom_row["kw"]}, {bottom_row["count"]} раз')
    print(f'Всего слов: {len(counts)}')

# Summary statistics for the parsed keyword lists.
print_kw_stat(column=df_kw['keylist'])

Ключевых слов, встречающихся однажды: 8556 (43.326%)
Самое популярное слово: womandirector, 3039 раз
Самое редкое слово: zumba, 1 раз
Всего слов: 19748


In [10]:
# Remove the keywords that occur only once in the whole collection.

counts_first = get_kw_counts(df_kw['keylist'])
# Materialised as a set of keyword VALUES. `kw in <Series>` tests the index
# labels rather than the values, so the previous Series-based lookup was
# correct only because get_kw_counts happens to index its result by keyword.
onlyonemeet_first = set(counts_first.loc[counts_first['count'] == 1, 'kw'])


def kw_del_onlyonemeet(keylist):
    """Return `keylist` without the keywords listed in `onlyonemeet_first`.

    Parameters
    ----------
    keylist : iterable of str
        Keywords of a single movie.

    Returns
    -------
    list of str
        A new list with the remaining keywords in their original order.
    """
    return [kw for kw in keylist if kw not in onlyonemeet_first]


df = df_kw.copy(deep=True)
df['keylist_nolonely'] = df['keylist'].apply(kw_del_onlyonemeet)

# Recount after the removal; the printed summary line shows how many singletons are left.
counts_second = get_kw_counts(df['keylist_nolonely'])

Ключевых слов, встречающихся однажды: 8556 (43.326%)
Ключевых слов, встречающихся однажды: 0 (0.0%)


In [11]:
# Keep at most `quantity` keywords per movie by dropping the rarest ones.

counts = get_kw_counts(df['keylist_nolonely'])
quantity = 15
# keyword -> frequency, built once so that every lookup is a dict access
# instead of a boolean-mask scan over the whole `counts` frame.
kw_frequency = dict(zip(counts['kw'], counts['count']))


def get_most_common(keylist):
    """Limit a movie's keywords to the `quantity` most frequent ones.

    Parameters
    ----------
    keylist : list of str
        Keywords of a single movie; each must be present in `kw_frequency`.

    Returns
    -------
    list of str
        `keylist` itself (unchanged, original order) when it has at most
        `quantity` entries. Otherwise a new list of up to `quantity` distinct
        keywords ordered by descending frequency; keywords with equal
        frequency keep their order of first appearance.

    Raises
    ------
    KeyError
        If a keyword of an over-long list is missing from `kw_frequency`.
    """
    if len(keylist) <= quantity:
        return keylist

    # dict.fromkeys drops repeated keywords while preserving first-seen order;
    # sorted() is stable (also with reverse=True), so ties keep that order.
    unique_kws = list(dict.fromkeys(keylist))
    ranked = sorted(unique_kws, key=kw_frequency.__getitem__, reverse=True)
    return ranked[:quantity]


df['keylist_fifteen'] = df['keylist_nolonely'].apply(get_most_common)
df['keywords_len_commons'] = df['keylist_fifteen'].apply(len)
df

Ключевых слов, встречающихся однажды: 0 (0.0%)


Unnamed: 0_level_0,keywords,keylist,keywords_len,keylist_nolonely,keylist_fifteen,keywords_len_commons
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[jealousy, toy, boy, friendship, friends, riva...",9,"[jealousy, toy, boy, friendship, friends, riva...","[jealousy, toy, boy, friendship, friends, riva...",9
8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1...","[boardgame, disappearance, basedonchildrensboo...",6,"[boardgame, disappearance, basedonchildrensboo...","[boardgame, disappearance, basedonchildrensboo...",6
15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[fishing, bestfriend, duringcreditsstinger, ol...",4,"[fishing, bestfriend, duringcreditsstinger, ol...","[fishing, bestfriend, duringcreditsstinger, ol...",4
31357,"[{'id': 818, 'name': 'based on novel'}, {'id':...","[basedonnovel, interracialrelationship, single...",5,"[basedonnovel, interracialrelationship, single...","[basedonnovel, interracialrelationship, single...",5
11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...","[baby, midlifecrisis, confidence, aging, daugh...",9,"[baby, midlifecrisis, confidence, aging, daugh...","[baby, midlifecrisis, confidence, aging, daugh...",9
...,...,...,...,...,...,...
84419,"[{'id': 9748, 'name': 'revenge'}, {'id': 9826,...","[revenge, murder, serialkiller, newyorkcity, s...",7,"[revenge, murder, serialkiller, newyorkcity, s...","[revenge, murder, serialkiller, newyorkcity, s...",7
390959,"[{'id': 224180, 'name': 'blair witch'}]",[blairwitch],1,[blairwitch],[blairwitch],1
289923,"[{'id': 616, 'name': 'witch'}, {'id': 2035, 'n...","[witch, mythology, legend, serialkiller, mocku...",5,"[witch, mythology, legend, serialkiller, mocku...","[witch, mythology, legend, serialkiller, mocku...",5
439050,"[{'id': 10703, 'name': 'tragic love'}]",[tragiclove],1,[tragiclove],[tragiclove],1


In [13]:
# Keep only the movies that still have at least 10 keywords.

df = df.loc[df['keywords_len_commons'].ge(10)]

In [15]:
# Join with the metadata table to pick up the movie titles.
df_joined = df.join(other=df_meta, on='id')
df_joined[['keylist_fifteen', 'title', 'imdb_id']]

Unnamed: 0_level_0,keylist_fifteen,title,imdb_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
949,"[murder, suspense, detective, robbery, money, ...",Heat,113277
710,"[cuba, falselyaccused, secretidentity, compute...",GoldenEye,113189
4584,"[bowling, basedonnovel, servant, countrylife, ...",Sense and Sensibility,114388
5,"[hotel, newyearseve, witch, bet, hotelroom, sp...",Four Rooms,113101
8012,"[murder, basedonnovel, violence, drug, gangste...",Get Shorty,113161
...,...,...,...
36886,"[sex, shower, blackmail, virgin, nudity, highs...",Schoolgirl Report Part 4: What Drives Parents ...,69234
104308,"[sex, budapest, monster, underground, professo...",The Spider Labyrinth,95728
43085,"[murder, violence, femalenudity, zombie, infec...",Dead and Deader,770739
19307,"[holiday, nudistcamp, camping, tent, field, va...",Carry On Camping,64133


In [16]:
# df_joined.to_csv('df_to_sub.csv',sep=';')

In [None]:
# Re-index by IMDb id for the join with the downloaded-files table below.
# The guard keeps the cell re-runnable: after the first run 'imdb_id' is no
# longer a column, so an unconditional set_index would raise KeyError.
if df_joined.index.name != 'imdb_id':
    df_joined = df_joined.set_index('imdb_id')

In [None]:
# Load the ';'-separated download file (column names are supplied explicitly),
# cast the column types and keep one row per IMDb id.
df_subloaded = (
    pd.read_csv(
        'fil_sub_download_temp_file.csv',
        sep=';',
        names=['imdb_id', 'moviename', 'data', 'file'],
    )
    .astype({'imdb_id': 'int32', 'moviename': 'string', 'data': 'object', 'file': 'object'})
    .drop_duplicates(subset='imdb_id')
)

# Discard the rows whose movie name is the literal 'ERROR', then index by IMDb id.
df_subloaded = (
    df_subloaded
    .drop(index=df_subloaded[df_subloaded['moviename'] == 'ERROR'].index)
    .set_index('imdb_id')
)
df_subloaded

In [None]:
# Left-join the keyword data onto the downloaded-files table by IMDb id.
df_doned = df_subloaded.join(other=df_joined, how='left', on='imdb_id')


In [None]:
# Keep only the keyword column and drop the rows that found no match in the join.
df_doned = df_doned[['keylist_fifteen']].dropna()

In [None]:
# Display the final table (IMDb id index, 'keylist_fifteen' column) before it is exported.
df_doned

In [None]:
# Export the final keyword table as a ';'-separated CSV.
df_doned.to_csv(path_or_buf='df_keylist_fifteen.csv', sep=';')