# Visualization Test Playground

Playground for movie data exploration 

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import pandas as pd
import os

In [3]:
# Loads data (both ratings and movies) into pandas dataframes
def load_data():

    data = pd.read_csv(
        os.path.join('data', 'data.txt'), 
        sep='\t', 
        names=['user_id', 'movie_id', 'rating']
    )

    movies = pd.read_csv(
        os.path.join('data', 'movies.txt'),
        sep='\t',
        names=['movie_id', 'movie_title', 'unknown', 'action', 'adventure',
        'animation', 'childrens', 'comedy', 'crime', 'documentary', 'drama',
        'fantasy', 'film-noir', 'horror', 'musical', 'mystery', 'romance',
        'sci-fi', 'thriller', 'war', 'western'],
        encoding='latin1' # fixes bug in reading movies.txt
    )

    print('Data entries:', len(data))
    print('Movie entries:', len(movies))
    
    return data, movies


# Plot and save histogram
#   - dscr : title of plot
#   - filename : if save is True, plot will be saved as plots/filename.png
#   - save : if False, plot will pop-up but not be saved
def plot_histogram(d, dscr, filename=None, save=False):
    
    plt.hist(d, bins=[0.5, 1.5, 2.5, 3.5, 4.5, 5.5], rwidth=0.8)
    plt.title(dscr)
    plt.xlabel('Rating')
    
    if save:
        plt.savefig(os.path.join('plots', filename + '.png'))
    else:
        plt.show()


In [4]:
data, movies = load_data()

Data entries: 100000
Movie entries: 1682


In [4]:
data.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [5]:
movies.head()

Unnamed: 0,movie_id,movie_title,unknown,action,adventure,animation,childrens,comedy,crime,documentary,...,fantasy,film-noir,horror,musical,mystery,romance,sci-fi,thriller,war,western
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [10]:
data.head()

Unnamed: 0,user_id,movie_id,rating
43465,619,313,5
1816,303,176,5
48549,7,141,5
48531,309,258,5
1814,49,547,5


In [5]:
movies['num_ratings'] = 0
movies['avg_rating'] = 0
movies.head()

Unnamed: 0,movie_id,movie_title,unknown,action,adventure,animation,childrens,comedy,crime,documentary,...,horror,musical,mystery,romance,sci-fi,thriller,war,western,num_ratings,avg_rating
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Get Shorty (1995),0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [6]:
for i in range(1, 1682+1):
    
    # Get all of the ratings for that specific movie 
    movie_ratings = data[data['movie_id'] == i]
    ri = movie_ratings['rating'].values
    # Compute the number of ratings and average ratings 
    ri_avg = np.mean(ri)
    ri_num = len(ri)
    
    # Add these new values to our new 
    movies.loc[movies['movie_id'] == i, 'num_ratings']= ri_num
    movies.loc[movies['movie_id'] == i, 'avg_rating']= ri_avg

movies.head()

Unnamed: 0,movie_id,movie_title,unknown,action,adventure,animation,childrens,comedy,crime,documentary,...,horror,musical,mystery,romance,sci-fi,thriller,war,western,num_ratings,avg_rating
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,452,3.878319
1,2,GoldenEye (1995),0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,131,3.206107
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,90,3.033333
3,4,Get Shorty (1995),0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,209,3.550239
4,5,Copycat (1995),0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,86,3.302326


In [37]:
movies.sort_values('num_ratings', inplace=True, ascending=False)


movies.head(10)

Unnamed: 0,movie_id,movie_title,unknown,action,adventure,animation,childrens,comedy,crime,documentary,...,horror,musical,mystery,romance,sci-fi,thriller,war,western,num_ratings,avg_rating
49,50,Star Wars (1977),0,1,1,0,0,0,0,0,...,0,0,0,1,1,0,1,0,583,4.358491
257,258,Contact (1997),0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,509,3.803536
99,100,Fargo (1996),0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,508,4.155512
180,181,Return of the Jedi (1983),0,1,1,0,0,0,0,0,...,0,0,0,1,1,0,1,0,507,4.00789
293,294,Liar Liar (1997),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,485,3.156701
285,286,"English Patient, The (1996)",0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,481,3.656965
287,288,Scream (1996),0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,478,3.441423
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,452,3.878319
299,300,Air Force One (1997),0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,431,3.63109
120,121,Independence Day (ID4) (1996),0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,429,3.438228


In [57]:
movies.sort_values('num_ratings', inplace=True, ascending=False)
popular_movie_ids = movies[:10]['movie_id'].values
print(popular_movie_ids)
total_pop_ratings = []
for pop_id in popular_movie_ids:
    pop_ratings = data.loc[data['movie_id'] == pop_id, 'rating'].values
    total_pop_ratings.append(pop_ratings)

total_pop_ratings = pd.Series(np.array(total_pop_ratings)) 

plot_histogram(total_pop_ratings, 'Most Populare', filename='4_2',save=True)
total_pop_ratings

[ 50 258 100 181 294 286 288   1 300 121]


0    [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...
1    [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...
2    [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...
3    [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...
4    [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...
5    [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...
6    [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...
7    [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...
8    [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...
9    [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...
dtype: object

In [48]:
movies.sort_values('avg_rating', inplace=True, ascending=False)
movies.head(10)

Unnamed: 0,movie_id,movie_title,unknown,action,adventure,animation,childrens,comedy,crime,documentary,...,horror,musical,mystery,romance,sci-fi,thriller,war,western,num_ratings,avg_rating
1652,1653,Entertaining Angels: The Dorothy Day Story (1996),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,5.0
1121,1122,They Made Me a Criminal (1939),0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,5.0
1535,1536,Aiqing wansui (1994),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,5.0
1188,1189,Prefontaine (1997),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,5.0
1292,1293,Star Kid (1997),0,0,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,3,5.0
1499,1500,Santa with Muscles (1996),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,2,5.0
1466,1467,"Saint of Fort Washington, The (1993)",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,5.0
813,814,"Great Day in Harlem, A (1994)",0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,5.0
1200,1201,Marlene Dietrich: Shadow and Light (1996),0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,5.0
1598,1599,Someone Else's America (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,5.0


In [55]:
movies.sort_values('avg_rating', inplace=True, ascending=False)
highest_rated_movie_ids = movies[:10]['movie_id'].values
print(highest_rated_movie_ids)
total_high_ratings = []
for high_id in highest_rated_movie_ids:
    high_ratings = data.loc[data['movie_id'] == high_id, 'rating'].values
    total_high_ratings.append(high_ratings)

total_high_ratings = pd.Series(np.array(total_high_ratings)) 

plot_histogram(total_high_ratings, 'Highest Rated', filename='4_3',save=True)
total_high_ratings

[1189 1653 1500 1599 1201 1293 1467 1122  814 1536]


0    [5, 5, 5]
1          [5]
2       [5, 5]
3          [5]
4          [5]
5    [5, 5, 5]
6       [5, 5]
7          [5]
8          [5]
9          [5]
dtype: object

In [13]:
# Now sort ratings by the three genres chosen 
genres = ['action', 'sci-fi', 'childrens']
for genre in genres:
    g_ids = movies.loc[movies[genre] == 1, 'movie_id'].values
    total_g_ratings = []
    for g_id in g_ids:
        g_ratings = data.loc[data['movie_id'] == g_id, 'rating'].values
        total_g_ratings.append(g_ratings)
    total_g_ratings = pd.Series(np.array(total_g_ratings))
    plot_histogram(total_g_ratings, genre + ' ratings', filename='4_4_' + genre,save=True)

NameError: name 'g' is not defined