In [None]:
"""Bringing data to the target form."""

## Приведение данных к целевому виду

In [None]:
# 1


import io
import os

import pandas as pd
import requests
from dotenv import load_dotenv

# Pull variables from a local .env file (if one exists) into the process
# environment so DISNEY_CSV_URL can be supplied without hard-coding it.
load_dotenv()

disney_csv_url = os.environ.get("DISNEY_CSV_URL", "")
if not disney_csv_url:
    # Fail fast with an actionable message instead of letting requests
    # complain about an empty URL.
    raise ValueError(
        "DISNEY_CSV_URL is not set; define it in the environment or in a .env file"
    )

# The timeout keeps the cell from hanging forever on an unresponsive server;
# raise_for_status() stops an HTTP error page from being parsed as CSV.
response = requests.get(disney_csv_url, timeout=30)
response.raise_for_status()

# Parse the downloaded bytes as CSV and show a structural summary of the frame.
disney_production = pd.read_csv(io.BytesIO(response.content))
disney_production.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1450 entries, 0 to 1449
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   type          1450 non-null   object
 1   title         1450 non-null   object
 2   director      977 non-null    object
 3   cast          1260 non-null   object
 4   country       1231 non-null   object
 5   release_year  1450 non-null   int64 
 6   rating        1447 non-null   object
 7   duration      1450 non-null   object
 8   listed_in     1450 non-null   object
 9   description   1450 non-null   object
 10  Date          1447 non-null   object
dtypes: int64(1), object(10)
memory usage: 124.7+ KB


In [None]:
# 2


# Convert the "Date" column to datetimes; errors="coerce" turns values that
# cannot be parsed into NaT instead of raising.
parsed_dates = pd.to_datetime(disney_production["Date"], errors="coerce")
disney_production["Date"] = parsed_dates
print(disney_production.dtypes["Date"])

datetime64[ns]


In [None]:
# 3


# Select rows whose "Date" lies in the half-open interval
# [2020-01-01, 2021-01-01), then show the first ten titles.
dates = disney_production["Date"]
within_2020 = (dates >= "2020-01-01") & (dates < "2021-01-01")
filtered = disney_production[within_2020]
print(filtered["title"].head(10))

325                                         Burrow
326                        Cosmos: Possible Worlds
327    Disney Gallery / Star Wars: The Mandalorian
328                          Max Keeble's Big Move
329                                           Soul
330                      Arendelle Castle Yule Log
331                       Buried Truth of the Maya
332                    Disney Parks Sunrise Series
333                                Dory's Reef Cam
334                                 Into the Woods
Name: title, dtype: object


In [None]:
# 4


# List the column names that remain once "release_year" is left out;
# the frame itself is not modified.
remaining_columns = disney_production.columns.drop("release_year").tolist()
print(remaining_columns)

['type', 'title', 'director', 'cast', 'country', 'rating', 'duration', 'listed_in', 'description', 'Date']


In [None]:
# 5


# Build a renamed copy: every column label goes through str.capitalize
# (first character upper-cased, the rest lower-cased). The source frame
# keeps its original labels.
renamed_data = disney_production.rename(columns=str.capitalize)
print(renamed_data.columns.tolist())

['Type', 'Title', 'Director', 'Cast', 'Country', 'Release_year', 'Rating', 'Duration', 'Listed_in', 'Description', 'Date']


In [None]:
# 6


# Derive "listed_in1" from "listed_in" with every literal "&" swapped for ",".
listed_in = disney_production["listed_in"]
disney_production["listed_in1"] = listed_in.str.replace("&", ",", regex=False)
print(disney_production["listed_in1"].tail())

1445        Action-Adventure, Family, Science Fiction
1446                 Action-Adventure, Comedy, Family
1447                      Biographical, Comedy, Drama
1448                     Buddy, Comedy, Coming of Age
1449    Action-Adventure, Animals , Nature, Animation
Name: listed_in1, dtype: object


In [None]:
# 7


# Missing values per column: total rows minus the non-null count of each column.
row_total = len(disney_production)
omitted_values_count = row_total - disney_production.count()
print(omitted_values_count)

type              0
title             0
director        473
cast            190
country         219
release_year      0
rating            3
duration          0
listed_in         0
description       0
Date              3
listed_in1        0
dtype: int64


In [None]:
# 8


# Drop every row that has at least one missing value (into a new frame),
# then print the per-column missing counts of the result.
data_cleaned = disney_production.dropna(axis="index", how="any")
remaining_gaps = data_cleaned.isna().sum()
print(remaining_gaps)

type            0
title           0
director        0
cast            0
country         0
release_year    0
rating          0
duration        0
listed_in       0
description     0
Date            0
listed_in1      0
dtype: int64


In [None]:
# 9


# Share of missing values per column, expressed as a percentage of all rows
# and rounded to two decimals for display.
row_total = len(disney_production)
missing_per_column = disney_production.isnull().sum()
omitted_percentage = missing_per_column / row_total * 100
omitted_percentage_rounded = omitted_percentage.round(2)
print(omitted_percentage_rounded)

type             0.00
title            0.00
director        32.62
cast            13.10
country         15.10
release_year     0.00
rating           0.21
duration         0.00
listed_in        0.00
description      0.00
Date             0.21
listed_in1       0.00
dtype: float64


In [None]:
# 10


# Replace missing "country" entries with an explicit placeholder label,
# writing the result back into the frame, then preview the first rows.
country_placeholder = "Country not specified"
country_filled = disney_production["country"].fillna(country_placeholder)
disney_production["country"] = country_filled
print(disney_production["country"].head())

0    Country not specified
1    Country not specified
2            United States
3    Country not specified
4    Country not specified
Name: country, dtype: object
