# Initial data exploration

In this notebook I read the raw MovieLens 100K data, apply some very basic preprocessing, and save the results as interim CSV files.

In [31]:
import pandas as pd
import sys
import os
sys.path.append("../")
from definitions import ROOT_DIR
import numpy as np

## Dataset info

In [58]:
# Raw input location and the destination for the interim CSV files.
dataset_folder = os.path.join(ROOT_DIR, 'data', 'raw', 'ml-100k')
export_folder = os.path.join(ROOT_DIR, 'data', 'interim')

# DataFrame.to_csv does not create missing parent directories, so make sure
# the export folder exists before any of the save cells run.
os.makedirs(export_folder, exist_ok=True)

In [13]:
# Print the contents of u.info from the raw dataset folder.
dataset_info_path = os.path.join(dataset_folder, 'u.info')
with open(dataset_info_path, mode='r') as info_file:
    f_contents = info_file.read()
print(f_contents)

943 users
1682 items
100000 ratings



## Dataset core

In [8]:
dataset_file = os.path.join(dataset_folder, 'u.data')

# u.data is tab-separated and has no header row, so the column names are
# supplied at read time.
dataset = pd.read_csv(
    dataset_file,
    sep="\t",
    header=None,
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)

In [9]:
# Row and column counts of the ratings table.
dataset.shape

(100000, 4)

In [10]:
# Preview the first rows to check that the assigned column names match the values.
dataset.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [26]:
# Count missing values per column.
dataset.isna().sum()

user_id      0
item_id      0
rating       0
timestamp    0
dtype: int64

In [62]:
# Write the ratings table to the interim folder without the positional index.
dataset_interim_path = os.path.join(export_folder, 'data.csv')
dataset.to_csv(path_or_buf=dataset_interim_path, index=False)

## Films data

In [65]:
films_path = os.path.join(dataset_folder, 'u.item')

# One name per column, in file order.
film_columns = [
    "movie_id",
    "movie title",
    "release_date",
    "video_release_date",
    "IMDb_URL",
    "unknown",
    "Action",
    "Adventure",
    "Animation",
    "Children's",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]

# u.item is pipe-separated, has no header row, and is read as latin-1.
films = pd.read_csv(
    films_path,
    sep="|",
    header=None,
    names=film_columns,
    encoding='latin-1',
)

In [66]:
# Row and column counts of the films table.
films.shape

(1682, 24)

In [67]:
# Preview the first rows of the films table.
films.head()

Unnamed: 0,movie_id,movie title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [68]:
# Count missing values per column to see which columns need cleaning.
films.isna().sum()

movie_id                 0
movie title              0
release_date             1
video_release_date    1682
IMDb_URL                 3
unknown                  0
Action                   0
Adventure                0
Animation                0
Children's               0
Comedy                   0
Crime                    0
Documentary              0
Drama                    0
Fantasy                  0
Film-Noir                0
Horror                   0
Musical                  0
Mystery                  0
Romance                  0
Sci-Fi                   0
Thriller                 0
War                      0
Western                  0
dtype: int64

The `video_release_date` column is empty for all 1682 films, so it is dropped.

In [69]:
# errors='ignore' keeps this cell safe to re-run after the column is already gone.
films = films.drop(columns=['video_release_date'], errors='ignore')

In [70]:
# Replace missing IMDb URLs with an empty string; other columns are untouched.
# NOTE(review): to_csv writes NaN and '' as the same empty field by default, so
# this fill may not be visible in the saved CSV - confirm how downstream reads it.
films = films.fillna({'IMDb_URL': ''})

In [71]:
# Re-check missing values after cleaning; only release_date should still have a gap.
films.isna().sum()

movie_id        0
movie title     0
release_date    1
IMDb_URL        0
unknown         0
Action          0
Adventure       0
Animation       0
Children's      0
Comedy          0
Crime           0
Documentary     0
Drama           0
Fantasy         0
Film-Noir       0
Horror          0
Musical         0
Mystery         0
Romance         0
Sci-Fi          0
Thriller        0
War             0
Western         0
dtype: int64

In [72]:
# Write the cleaned films table to the interim folder without the positional index.
films_interim_path = os.path.join(export_folder, 'films.csv')
films.to_csv(path_or_buf=films_interim_path, index=False)

## User demographic data

In [54]:
demographic_path = os.path.join(dataset_folder, 'u.user')

# u.user is pipe-separated and has no header row, so the column names are
# supplied at read time.
# NOTE(review): the zip_code dtype is inferred by pandas; if it were ever parsed
# as an integer, leading zeros would be lost - consider pinning it to str.
demographic = pd.read_csv(
    demographic_path,
    sep="|",
    header=None,
    names=["user_id", "age", "gender", "occupation", "zip_code"],
)

In [55]:
# Row and column counts of the user demographic table.
demographic.shape

(943, 5)

In [56]:
# Preview the first rows of the user demographic table.
demographic.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [57]:
# Count missing values per column.
demographic.isna().sum()

user_id       0
age           0
gender        0
occupation    0
zip_code      0
dtype: int64

In [64]:
# Write the user demographic table to the interim folder without the positional index.
demographic_interim_path = os.path.join(export_folder, 'user.csv')
demographic.to_csv(path_or_buf=demographic_interim_path, index=False)