In [2]:
from movielens_analysis import Links
import itertools

with open("data/links.csv", "r", encoding="utf-8") as f:
    header = next(f)
    links_first_1000 = list(itertools.islice(f, 1000))

with open("data/links_subset.csv", "w", encoding="utf-8") as f:
    f.write(header)
    f.writelines(links_first_1000)

links = Links("data/links_subset.csv")

rows, cols = len(links.links), len(links.links[0])
print(f"Shape 'links_subset.csv': ({rows}, {cols})")

Shape 'links_subset.csv': (1000, 3)


In [3]:
movie_ids=[1, 2, 3, 4, 5]
fields=["Title", "Year", "Runtime", "imdbRating"]
imdb_data = links.get_imdb(movie_ids, fields)
imdb_data

[[5, 'Father of the Bride Part II', '1995', 106, 6.1],
 [4, 'Waiting to Exhale', '1995', 124, 6],
 [3, 'Grumpier Old Men', '1995', 101, 6.7],
 [2, 'Jumanji', '1995', 104, 7.1],
 [1, 'Toy Story', '1995', 81, 8.3]]

In [4]:
%timeit links.get_imdb(movie_ids, fields)

6.12 s ± 568 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
for row in imdb_data:
    print(row)

[5, 'Father of the Bride Part II', '1995', 106, 6.1]
[4, 'Waiting to Exhale', '1995', 124, 6]
[3, 'Grumpier Old Men', '1995', 101, 6.7]
[2, 'Jumanji', '1995', 104, 7.1]
[1, 'Toy Story', '1995', 81, 8.3]


In [8]:
limit=10
top_directors_data = links.top_directors(n=10, limit=limit)

In [9]:
%timeit -r 2 links.top_directors(n=10, limit=limit)

13.3 s ± 1.54 s per loop (mean ± std. dev. of 2 runs, 1 loop each)


In [None]:
for name, count in top_directors_data.items():
    print(f"{name}: {count}")

Martin Scorsese: 2
Robert Rodriguez: 2
John Lasseter: 1
Joe Johnston: 1
Howard Deutch: 1
Forest Whitaker: 1
Charles Shyer: 1
Michael Mann: 1
Sydney Pollack: 1
Peter Hewitt: 1


In [None]:
limit=100
most_expensive_data = links.most_expensive(n=10, limit=limit)

In [None]:
%timeit -r 2 links.most_expensive(n=10, limit=limit)

2min 57s ± 22.7 s per loop (mean ± std. dev. of 2 runs, 1 loop each)


In [None]:
for title, budget in most_expensive_data.items():
    print(f"{title}: ${budget:,}")

Cutthroat Island: $98,000,000
Braveheart: $72,000,000
Money Train: $68,000,000
Jumanji: $65,000,000
The American President: $62,000,000
Heat: $60,000,000
GoldenEye: $60,000,000
Sabrina: $58,000,000
Pocahontas: $55,000,000
Casino: $52,000,000


In [None]:
limit=100
most_profitable_data = links.most_profitable(n=10, limit=limit)

In [None]:
%timeit -r 2 links.most_profitable(n=10, limit=limit)

2min 26s ± 457 ms per loop (mean ± std. dev. of 2 runs, 1 loop each)


In [None]:
for title, profit in most_profitable_data.items():
    print(f"{title}: ${profit:,}")

Toy Story: $364,436,586
Se7en: $295,981,827
GoldenEye: $292,194,034
Pocahontas: $291,079,773
Babe: $224,134,910
Jumanji: $197,821,940
Ace Ventura: When Nature Calls: $182,385,533
The Bridges of Madison County: $158,016,617
Keiner liebt mich: $158,016,617
Dangerous Minds: $156,519,401


In [None]:
limit=100
longest_data = links.longest(n=10, limit=limit)

In [None]:
%timeit -r 2 links.longest(n=10, limit=limit)

Uvays Bakoev
3min 1s ± 22.9 s per loop (mean ± std. dev. of 2 runs, 1 loop each)


In [None]:
for title, runtime in longest_data.items():
    print(f"{title}: {runtime}")

Nixon: 192
Casino: 178
Braveheart: 178
Les misérables: 175
Heat: 170
Mr. Holland&apos;s Opus: 144
Sense and Sensibility: 136
The Bridges of Madison County: 135
Assassins: 133
GoldenEye: 130


In [None]:
limit = 100
top_cost_per_minute_data = links.top_cost_per_minute(n=10, limit=limit)

In [None]:
%timeit -r 2 links.top_cost_per_minute(n=10, limit=limit)

2min 40s ± 5.24 s per loop (mean ± std. dev. of 2 runs, 1 loop each)


In [None]:
for title, cost in top_cost_per_minute_data.items():
    print(f"{title}: ${cost:,.2f}")

Cutthroat Island: $790,322.58
Pocahontas: $679,012.35
Jumanji: $625,000.00
Money Train: $618,181.82
Fair Game: $549,450.55
The American President: $543,859.65
The Indian in the Cupboard: $468,750.00
Broken Arrow: $462,962.96
GoldenEye: $461,538.46
Sabrina: $456,692.91


In [None]:
from movielens_analysis import Movies
import itertools

with open("data/movies.csv", "r", encoding="utf-8") as f:
    header = next(f)
    movies_first_1000 = list(itertools.islice(f, 1000))

with open("data/movies_subset.csv", "w", encoding="utf-8") as f:
    f.write(header)
    f.writelines(movies_first_1000)

movies = Movies("data/movies_subset.csv")

rows, cols = len(movies.movies), len(movies.movies[0])
print(f"Shape 'movies_subset.csv': ({rows}, {cols})")

Shape 'movies_subset.csv': (1000, 3)


In [None]:
dist_by_release_data = movies.dist_by_release()

In [None]:
%timeit movies.dist_by_release()

705 μs ± 41.3 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [None]:
for key, value in list(dist_by_release_data.items())[:10]:
    print(f"{key}: {value}")

1995: 224
1994: 184
1996: 181
1993: 101
1992: 23
1990: 15
1991: 15
1989: 14
1986: 9
1982: 8


In [None]:
dist_by_genres_data = movies.dist_by_genres()

In [None]:
%timeit movies.dist_by_genres()

561 μs ± 5.47 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [None]:
for key, value in list(dist_by_genres_data.items())[:10]:
    print(f"{key}: {value}")

Drama: 507
Comedy: 365
Romance: 208
Thriller: 179
Action: 158
Adventure: 126
Crime: 122
Children: 100
Fantasy: 69
Sci-Fi: 69


In [None]:
title_length_distribution = movies.dist_by_title_length()

In [None]:
%timeit movies.dist_by_title_length()

380 μs ± 6.57 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [None]:
for length, count in list(title_length_distribution.items())[:10]:
    print(f"Title Length: {length} - Count: {count}")

Title Length: 19 - Count: 62
Title Length: 23 - Count: 55
Title Length: 17 - Count: 54
Title Length: 21 - Count: 50
Title Length: 16 - Count: 49
Title Length: 18 - Count: 44
Title Length: 20 - Count: 44
Title Length: 24 - Count: 43
Title Length: 25 - Count: 43
Title Length: 14 - Count: 40


In [None]:
from movielens_analysis import Ratings
import itertools

with open("data/ratings.csv", "r", encoding="utf-8") as f:
    header = next(f)
    ratings_first_1000 = list(itertools.islice(f, 1000))

with open("data/ratings_subset.csv", "w", encoding="utf-8") as f:
    f.write(header)
    f.writelines(ratings_first_1000)

ratings = Ratings("ratings_subset.csv")

rows, cols = len(ratings.data), len(ratings.data[0])
print(f"Shape 'ratings_subset.csv': ({rows}, {cols})")

Shape 'ratings_subset.csv': (1000, 4)


In [None]:
target_movie_ids = [15]

ratings = Ratings("ratings.csv")
movies = Ratings.Movies(ratings, "data/movies.csv")

for entry in ratings.data:
    if entry['movieId'] in target_movie_ids:
        movie_id = entry['movieId']
        print(movies.movie_titles.get(movie_id, "Название не найдено"))


Cutthroat Island (1995)
Cutthroat Island (1995)
Cutthroat Island (1995)
Cutthroat Island (1995)
Cutthroat Island (1995)
Cutthroat Island (1995)
Cutthroat Island (1995)
Cutthroat Island (1995)
Cutthroat Island (1995)
Cutthroat Island (1995)
Cutthroat Island (1995)
Cutthroat Island (1995)
Cutthroat Island (1995)


In [None]:
mean = movies.mean
median = movies.median

In [None]:
%timeit movies.load_movie_titles("data/movies.csv")

4.53 ms ± 189 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
values = [2,3,4,5]
mean_test = mean(values)
print(mean_test)

3.5


In [None]:
%timeit movies.mean(values)

117 ns ± 4.85 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


In [None]:
median_test = median(values)
print(median_test)

3.5


In [None]:
print(movies.dist_by_year())

{1996: 6040, 1997: 1916, 1998: 507, 1999: 2439, 2000: 10061, 2001: 3922, 2002: 3478, 2003: 4014, 2004: 3279, 2005: 5813, 2006: 4059, 2007: 7111, 2008: 4348, 2009: 4163, 2010: 2301, 2011: 1690, 2012: 4657, 2013: 1664, 2014: 1439, 2015: 6616, 2016: 6702, 2017: 8199, 2018: 6418}


In [None]:
%timeit movies.dist_by_year

33 ns ± 0.391 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


In [None]:
movies.dist_by_rating()

{0.5: 1370,
 1.0: 2811,
 1.5: 1791,
 2.0: 7551,
 2.5: 5550,
 3.0: 20047,
 3.5: 13136,
 4.0: 26818,
 4.5: 8551,
 5.0: 13211}

In [None]:
%timeit movies.dist_by_rating

31.4 ns ± 0.264 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


In [None]:
movies.top_by_num_of_ratings(10)

{'Forrest Gump (1994)': 329,
 '"Shawshank Redemption': 317,
 'Pulp Fiction (1994)': 307,
 '"Silence of the Lambs': 279,
 '"Matrix': 278,
 'Star Wars: Episode IV - A New Hope (1977)': 251,
 'Jurassic Park (1993)': 238,
 'Braveheart (1995)': 237,
 'Terminator 2: Judgment Day (1991)': 224,
 "Schindler's List (1993)": 220}

In [None]:
%timeit movies.top_by_num_of_ratings(10)

13.6 ms ± 1.17 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
movies.top_by_ratings(10,mean)

{'The Jinx: The Life and Deaths of Robert Durst (2015)': 5.0,
 'Galaxy of Terror (Quest) (1981)': 5.0,
 'Alien Contamination (1980)': 5.0,
 "I'm the One That I Want (2000)": 5.0,
 'Lesson Faust (1994)': 5.0,
 '"Assignment': 5.0,
 'Mephisto (1981)': 5.0,
 'Black Mirror': 5.0,
 'Dylan Moran: Monster (2004)': 5.0,
 'Bill Hicks: Revelations (1993)': 5.0}

In [None]:
%timeit movies.top_by_ratings(1000,mean)

22.2 ms ± 662 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
movies.top_controversial(10)

{"Ivan's Childhood (a.k.a. My Name is Ivan) (Ivanovo detstvo) (1962)": 5.06,
 'Fanny and Alexander (Fanny och Alexander) (1982)': 5.06,
 'Troll 2 (1990)': 4.5,
 'Lassie (1994)': 4.0,
 '"Zed & Two Noughts': 4.0,
 'Kwaidan (Kaidan) (1964)': 4.0,
 'Emma (2009)': 4.0,
 'Play Time (a.k.a. Playtime) (1967)': 3.72,
 '"Room': 3.56,
 'Peeping Tom (1960)': 3.5}

In [None]:
%timeit movies.top_controversial(10)

32 ms ± 979 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
users = Ratings.Users(ratings)
print(users.dist_by_num_of_ratings())

{20: 14, 21: 15, 22: 14, 23: 13, 24: 7, 25: 9, 26: 13, 27: 8, 28: 8, 29: 8, 30: 3, 31: 7, 32: 7, 33: 10, 34: 9, 35: 11, 36: 9, 37: 6, 38: 7, 39: 4, 40: 6, 41: 4, 42: 5, 43: 3, 44: 3, 45: 6, 46: 5, 47: 4, 48: 7, 50: 7, 51: 5, 52: 2, 53: 4, 54: 3, 55: 2, 56: 14, 57: 4, 58: 5, 59: 3, 60: 2, 61: 5, 62: 2, 63: 1, 64: 5, 65: 4, 66: 2, 67: 2, 68: 1, 69: 5, 70: 2, 71: 1, 72: 2, 73: 2, 74: 2, 75: 3, 76: 3, 77: 3, 78: 3, 80: 1, 81: 2, 82: 2, 83: 3, 84: 3, 85: 1, 86: 2, 87: 3, 88: 2, 89: 2, 90: 1, 92: 1, 93: 4, 94: 4, 95: 1, 97: 3, 98: 3, 100: 3, 101: 1, 102: 2, 103: 3, 105: 1, 106: 2, 107: 2, 108: 1, 109: 1, 110: 2, 111: 2, 112: 4, 113: 3, 114: 2, 115: 2, 118: 2, 119: 2, 120: 1, 121: 2, 122: 1, 123: 2, 126: 1, 127: 3, 128: 4, 129: 2, 130: 1, 131: 2, 133: 1, 134: 1, 135: 3, 138: 3, 139: 1, 140: 4, 141: 4, 148: 1, 150: 3, 152: 4, 154: 1, 155: 3, 156: 1, 157: 1, 161: 1, 163: 1, 164: 2, 165: 2, 167: 3, 168: 3, 173: 1, 174: 1, 176: 1, 177: 1, 179: 1, 180: 1, 181: 1, 185: 1, 186: 1, 187: 2, 188: 1, 18

In [None]:
%timeit users.dist_by_num_of_ratings()

80.2 μs ± 2.86 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [None]:
users.top_controversial(10)

{3: 4.26,
 461: 3.1,
 55: 3.09,
 259: 2.94,
 329: 2.92,
 502: 2.76,
 175: 2.75,
 598: 2.7,
 393: 2.61,
 138: 2.44}

In [None]:
%timeit users.top_controversial(10)

9.03 ms ± 119 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
users.dist_by_rating(median)

{0.5: 1,
 1.0: 1,
 2.0: 8,
 2.5: 7,
 2.75: 3,
 3.0: 111,
 3.25: 6,
 3.5: 94,
 3.75: 5,
 4.0: 298,
 4.25: 2,
 4.5: 40,
 4.75: 1,
 5.0: 33}

In [None]:
%timeit users.dist_by_rating(median)

6.54 ms ± 570 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
from movielens_analysis import Tags
import itertools

with open("data/tags.csv", "r", encoding="utf-8") as f:
    header = next(f)
    tags_first_1000 = list(itertools.islice(f, 1000))

with open("data/tags_subset.csv", "w", encoding="utf-8") as f:
    f.write(header)
    f.writelines(tags_first_1000)

tags = Tags("tags_subset.csv")

rows, cols = len(tags.data), len(tags.data[0])
print(f"Shape 'tags_subset.csv': ({rows}, {cols})")

Shape 'tags_subset.csv': (1000, 4)


In [None]:
tags = Tags("tags.csv")
tags.most_words(10)

{'Something for everyone in this one... saw it without and plan on seeing it with kids!': 16,
 'the catholic church is the most corrupt organization in history': 10,
 'villain nonexistent or not needed for good story': 8,
 'It was melodramatic and kind of dumb': 7,
 'stop using useless characters for filler': 6,
 'r:disturbing violent content including rape': 6,
 'Oscar (Best Music - Original Score)': 5,
 'r:sustained strong stylized violence': 5,
 'Oscar (Best Effects - Visual Effects)': 5,
 'start of a beautiful friendship': 5}

In [None]:
%timeit tags.most_words(10)

2.09 ms ± 24.2 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
tags.longest(10)

['Something for everyone in this one... saw it without and plan on seeing it with kids!',
 'the catholic church is the most corrupt organization in history',
 'villain nonexistent or not needed for good story',
 'r:disturbing violent content including rape',
 '06 Oscar Nominated Best Movie - Animation',
 'stop using useless characters for filler',
 'Academy award (Best Supporting Actress)',
 'Oscar (Best Effects - Visual Effects)',
 'r:sustained strong stylized violence',
 'It was melodramatic and kind of dumb']

In [None]:
%timeit tags.longest(10)

362 μs ± 9.34 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [None]:
tags.most_popular(10)

{'In Netflix queue': 131,
 'atmospheric': 36,
 'superhero': 24,
 'thought-provoking': 24,
 'funny': 23,
 'Disney': 23,
 'surreal': 23,
 'religion': 22,
 'sci-fi': 21,
 'dark comedy': 21}

In [None]:
%timeit tags.most_popular(10)

576 μs ± 46.6 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [None]:
tags.tags_with('black')

['black and white',
 'black comedy',
 'black hole',
 'black humor',
 'black humour',
 'black-and-white']

In [None]:
%timeit tags.tags_with('black')

259 μs ± 8.69 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
