# Этап 1. Парсинг данных

В этом разделе собираем данные для дальнейшего анализа.

[Источник](https://www.imdb.com/search/title/?title_type=feature)

In [3]:
!pip install requests
!pip install beautifulsoup4`

import requests
from bs4 import BeautifulSoup
import pandas as pd

zsh:1: unmatched `


Создадим функцию, которая будет собирать необходиимые данные с 1 страницы (на странице 50 фильмов).

In [2]:
def parse_imdb_page():
    while True:
        url = yield
        response = requests.get(url=url, headers={'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7'})
        if not response.ok:
            return None

        soup = BeautifulSoup(response.text, 'html.parser')
        all_films = soup.find_all('div', class_='lister-item mode-advanced')
        # 'div' - тег блока, под которым у нас вся инфа о фильме. class_ - класс блока

        parsed_films_list = []

        for film in all_films:
            if (film_content := film.find('div', class_='lister-item-content')) is not None:
                film_header = film_content.find('h3', class_='lister-item-header')
                film_name = film_header.find('a').text.strip()
                film_url = film_header.find('a')['href']
                film_global_rate = film_header.find('span', class_='lister-item-index unbold text-primary').text.strip()
                film_year = film_header.find('span', class_='lister-item-year text-muted unbold').text.strip()
                film_rates = film_content.find('div', class_='ratings-bar')

                try: # Не у всех фильмов есть отметки на meta score & imdb. Поэтому пробуем их найти
                    film_imdb_rate = film_rates.find('div', class_='inline-block ratings-imdb-rating')['data-value'] # rate по imdb
                    film_meta_score_rate = film_rates.find('div', class_='inline-block ratings-metascore').find('span').text.strip() # rate на meta score
                except:
                    film_imdb_rate = None
                    film_meta_score_rate = None

                film_muted_spans = film_content.find_all('p', class_='text-muted')

                for film_desc in film_muted_spans:
                    genre = film_desc.find('span', class_='genre')
                    duration = film_desc.find('span', class_='runtime')
                    if genre or duration:
                        film_duration = None if not duration else duration.text.strip()
                        if genre is not None:
                            film_genre = genre.text.strip()
                        else:
                            film_genre = None
                    else:
                        film_description = film_desc.text.strip()

                try: # Не у всех фильмов есть кол-во голосов, так что пробуем их найти
                    film_votes = film_content.find('p', class_='sort-num_votes-visible').find(
                        'span',
                        attrs={'name': 'nv'}
                    )['data-value']
                except:
                    film_votes = None

                parsed_films_list.append({
                    'name': film_name,
                    'url': film_url,
                    'global_rate': film_global_rate,
                    'release_year': film_year,
                    'imdb_rate': film_imdb_rate,
                    'metascore_rate': film_meta_score_rate,
                    'description': film_description,
                    'votes': film_votes,
                    'duration': film_duration,
                    'genre': film_genre
                })

        yield parsed_films_list


Соберем функцию, чтобы мы могли спокойно переключаться между страницами и собрать большую базу.

In [12]:
def get_result(max_count: int = 5000):
    cour = parse_imdb_page()
    next(cour)
    info = cour.send('https://www.imdb.com/search/title/?title_type=feature')
    for count in range(51, max_count, 50):
        next_url = f'https://www.imdb.com/search/title/?title_type=feature&start={count}&ref_=adv_nxt'
        next(cour)
        info += cour.send(next_url)
    else:
        cour.close()
    return info
result = get_result()


Соберем наши данные в файл:

In [13]:

df = pd.DataFrame(result, columns=list(result[0].keys()))
df.to_csv('list.csv', index=False)

Данные лежат в файле list.csv, уберем в табличке ссылки на фильмы и будет готово!

In [15]:
df_inf = pd.read_csv('/Users/fatimaalburina/Desktop/ /Андан/Andan-project-/list.csv', sep=',')
df_inf = df_inf.sort_values(by='imdb_rate', ascending=False)
df_inf

Unnamed: 0,name,url,global_rate,release_year,imdb_rate,metascore_rate,description,votes,duration,genre
15,Убийцы цветочной луны,/title/tt5537002/,16.,(2023),94,91.0,Members of the Osage tribe in the United State...,1290.0,206 min,"Crime, Drama, History"
58,Побег из Шоушенка,/title/tt0111161/,59.,(1994),93,82.0,"Over the course of several years, two convicts...",2751362.0,142 min,Drama
48,Крестный отец,/title/tt0068646/,49.,(1972),92,100.0,"Don Vito Corleone, head of a mafia family, dec...",1913852.0,175 min,"Crime, Drama"
0,Человек-паук: Паутина вселенных,/title/tt9362722/,1.,(2023),9,86.0,"Miles Morales catapults across the Multiverse,...",112055.0,140 min,"Animation, Action, Adventure"
193,Список Шиндлера,/title/tt0108052/,194.,(1993),9,95.0,"In German-occupied Poland during World War II,...",1387425.0,195 min,"Biography, Drama, History"
...,...,...,...,...,...,...,...,...,...,...
4981,Из глубины,/title/tt16253418/,4 982.,(2023),,,Characters from very different backgrounds are...,,,"Action, Adventure, Drama"
4985,Я Кристина,/title/tt0082176/,4 986.,(1981),,,A teen girl in 1970s Berlin becomes addicted t...,27180.0,138 min,"Biography, Drama"
4987,Наложница,/title/tt2544120/,4 988.,(2012),,,"A tragic love triangle story between Hwa-Yeon,...",2400.0,122 min,"Drama, History, Romance"
4989,Взаперти,/title/tt10131024/,4 990.,(I) (2022),,,When a young mother is barricaded inside a pan...,9832.0,89 min,Thriller
