# Data Exploration

In [5]:
import pandas as pd
import numpy as np

In [6]:
def show_na(df):
    na_columns = df.isna().any(axis = 0)
    na_rows = df.isna().any(axis = 1)
    columns_nbna = df.isna().sum(axis = 0)
    print("Empty columns", na_columns.sum(), "of ", len(df.columns), "%:", na_columns.sum()/len(df.columns), "%", "\n", na_columns)
    print("Empty rows", na_rows.sum(), "of", len(df), "%:",  f"{(na_rows.sum()/len(df)):.5f}", "%")

# ratings.csv

In [7]:
df_ratings = pd.read_csv("../data/ratings.csv")
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 610.4 MB


In [8]:
df_ratings["timestamp"].describe().apply("{0:.5f}".format)

count      20000263.00000
mean     1100917921.67712
std       162169424.78273
min       789652004.00000
25%       966797745.00000
50%      1103555886.00000
75%      1225642317.50000
max      1427784002.00000
Name: timestamp, dtype: object

In [9]:
df_ratings["rating"].describe().apply("{0:.5f}".format)

count    20000263.00000
mean            3.52553
std             1.05199
min             0.50000
25%             3.00000
50%             3.50000
75%             4.00000
max             5.00000
Name: rating, dtype: object

In [10]:
show_na(df_ratings)

Empty columns 0 of  4 %: 0.0 % 
 userId       False
movieId      False
rating       False
timestamp    False
dtype: bool
Empty rows 0 of 20000263 %: 0.00000 %


# movies.csv

In [11]:
df_movies = pd.read_csv("../data/movies.csv")

all_genres = df_movies["genres"].str.split(
    '|', expand=True).stack().reset_index(drop=True)

all_genres.unique()

array(['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Mystery', 'Sci-Fi', 'IMAX', 'Documentary', 'War', 'Musical',
       'Western', 'Film-Noir', '(no genres listed)'], dtype=object)

In [12]:
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  27278 non-null  int64 
 1   title    27278 non-null  object
 2   genres   27278 non-null  object
dtypes: int64(1), object(2)
memory usage: 639.5+ KB


In [13]:
show_na(df_movies)

Empty columns 0 of  3 %: 0.0 % 
 movieId    False
title      False
genres     False
dtype: bool
Empty rows 0 of 27278 %: 0.00000 %


# tags.csv

In [14]:
df_tags = pd.read_csv("../data/tags.csv")
df_tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 465564 entries, 0 to 465563
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   userId     465564 non-null  int64 
 1   movieId    465564 non-null  int64 
 2   tag        465548 non-null  object
 3   timestamp  465564 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 14.2+ MB


In [15]:
len(df_tags["tag"].unique())

38644

In [16]:
df_tags["timestamp"].describe().apply("{0:.5f}".format)

count        465564.00000
mean     1298711076.34286
std        79208912.59511
min      1135429210.00000
25%      1245007262.50000
50%      1302291181.00000
75%      1366217861.25000
max      1427771352.00000
Name: timestamp, dtype: object

In [17]:
show_na(df_tags)

Empty columns 1 of  4 %: 0.25 % 
 userId       False
movieId      False
tag           True
timestamp    False
dtype: bool
Empty rows 16 of 465564 %: 0.00003 %


# links.csv

In [18]:
df_links = pd.read_csv("../data/links.csv")

In [19]:
df_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  27278 non-null  int64  
 1   imdbId   27278 non-null  int64  
 2   tmdbId   27026 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 639.5 KB


In [20]:
show_na(df_links)

Empty columns 1 of  3 %: 0.3333333333333333 % 
 movieId    False
imdbId     False
tmdbId      True
dtype: bool
Empty rows 252 of 27278 %: 0.00924 %


# genome-tags.csv

In [21]:
df_genometags = pd.read_csv("../data/genome-tags.csv")

In [22]:
df_genometags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1128 entries, 0 to 1127
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tagId   1128 non-null   int64 
 1   tag     1128 non-null   object
dtypes: int64(1), object(1)
memory usage: 17.8+ KB


In [23]:
show_na(df_genometags)

Empty columns 0 of  2 %: 0.0 % 
 tagId    False
tag      False
dtype: bool
Empty rows 0 of 1128 %: 0.00000 %


# genome-scores.csv

In [24]:
df_genome_scores = pd.read_csv("../data/genome-scores.csv")
df_genome_scores.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11709768 entries, 0 to 11709767
Data columns (total 3 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int64  
 1   tagId      int64  
 2   relevance  float64
dtypes: float64(1), int64(2)
memory usage: 268.0 MB


In [25]:
show_na(df_genome_scores)

Empty columns 0 of  3 %: 0.0 % 
 movieId      False
tagId        False
relevance    False
dtype: bool
Empty rows 0 of 11709768 %: 0.00000 %


In [26]:
df_genome_scores["relevance"].describe().apply("{0:.5f}".format)

count    11709768.00000
mean            0.11648
std             0.15425
min             0.00025
25%             0.02425
50%             0.05650
75%             0.14150
max             1.00000
Name: relevance, dtype: object