In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt

pd.set_option('max_columns', 99)
pd.set_option('max_rows', 20)

In [2]:
movies = pd.read_csv('./datasets/movies.csv')
links = pd.read_csv('./datasets/links.csv')
ratings = pd.read_csv('./datasets/ratings.csv')
tags = pd.read_csv('./datasets/tags.csv')

### Movies and Links

In [3]:
df = pd.merge(movies, links)
df.columns = df.columns.str.lower()
df.rename(columns={'movieid':'movie_id', 'imdbid':'imdb_id', 'tmdbid':'tmdb_id'}, inplace=True)

In [4]:
# dataframe with all movies whose title has an extra space at the end
spaces = df.loc[df['title'].str[-1] == ' ']

# getting rid of that extra space
for i in spaces['title']:
    df.replace(to_replace = {'title': i},
                value = i[:-1],
                inplace = True)

# fixing an individual row that was formatted differently
df.replace(to_replace = {'title':'Death Note: Desu nôto (2006–2007)'},
           value = 'Death Note: Desu nôto (2006)',
           inplace = True)

In [5]:
# creating a new column for release year
release_years = []
for i in df['title']:
    release_years.append(i[-5:-1])
df['release_year'] = release_years

# getting rid of the year from the title column for
# the majority of movies which have it listed there

majority = df.loc[df['title'].str[-1] == ')']
anomalies = df.loc[df['title'].str[-1] != ')']

for i in majority['title']:
    df.replace(to_replace = {'title': i},
                value = i[:-7],
                inplace = True)

# fixing another individual row that is formatted differently
df.replace(to_replace = {'title':'Angst '},
           value = 'Angst',
           inplace = True)

In [6]:
# taking care of the anomalies manually.
# I looked up the release years on IMDB, because figuring out how to code a web scrape on imdb
# would have taken me more time than Dr. Strange has access to.

df['release_year'][6059] = 1993
df['release_year'][9031] = 2018
df['release_year'][9091] = 2015
df['release_year'][9138] = 1980
df['release_year'][9179] = 2016
df['release_year'][9259] = 2016
df['release_year'][9367] = 2016
df['release_year'][9448] = 2016
df['release_year'][9514] = 1980
df['release_year'][9515] = 2017
df['release_year'][9525] = 2017
df['release_year'][9611] = 2011

# changing release_year column to numerical type
df['release_year'] = df['release_year'].astype(int)

df.dtypes;

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/u

In [7]:
# dropping null values
# df.isnull().sum()
# df['tmdb_id'].loc[df['tmdb_id'].isnull()==True]
df.dropna(inplace=True)

### Ratings

In [8]:
# lowercase, renaming, and dropping timestamp column

ratings.columns = ratings.columns.str.lower()
ratings.rename(columns={'movieid':'movie_id', 'userid':'user_id'}, inplace=True)
ratings.drop(columns = ['timestamp'], inplace=True)

In [9]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [10]:
# new columns for mean, max, min, median, and number of ratings for each movie

avg_ratings = []
for i in df['movie_id']:
    avg_rating = ratings.loc[ratings['movie_id']==i]['rating'].mean()
    avg_ratings.append(avg_rating)
df['avg_rating'] = avg_ratings

max_ratings = []
for i in df['movie_id']:
    max_rating = ratings.loc[ratings['movie_id']==i]['rating'].max()
    max_ratings.append(max_rating)
df['max_rating'] = max_ratings

min_ratings = []
for i in df['movie_id']:
    min_rating = ratings.loc[ratings['movie_id']==i]['rating'].min()
    min_ratings.append(min_rating)
df['min_rating'] = min_ratings

med_ratings = []
for i in df['movie_id']:
    med_rating = ratings.loc[ratings['movie_id']==i]['rating'].median()
    med_ratings.append(med_rating)
df['med_rating'] = med_ratings

numb_ratings = []
for i in df['movie_id']:
    numb_rating = ratings.loc[ratings['movie_id']==i]['rating'].count()
    numb_ratings.append(numb_rating)
df['numb_rating'] = numb_ratings

In [11]:
df.head()

Unnamed: 0,movie_id,title,genres,imdb_id,tmdb_id,release_year,avg_rating,max_rating,min_rating,med_rating,numb_rating
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,1995,3.92093,5.0,0.5,4.0,215
1,2,Jumanji,Adventure|Children|Fantasy,113497,8844.0,1995,3.431818,5.0,0.5,3.5,110
2,3,Grumpier Old Men,Comedy|Romance,113228,15602.0,1995,3.259615,5.0,0.5,3.0,52
3,4,Waiting to Exhale,Comedy|Drama|Romance,114885,31357.0,1995,2.357143,3.0,1.0,3.0,7
4,5,Father of the Bride Part II,Comedy,113041,11862.0,1995,3.071429,5.0,0.5,3.0,49


# TO DO next:
look for correlations between:
    -reviewer and genres
    -reviewer and release_year
    -reviewer and avg_rating of movies (are they biased towareds good/bad films? This would help with the recommender system)

In [12]:
df['genres'].value_counts(normalize = True)

Drama                                      0.108075
Comedy                                     0.097082
Comedy|Drama                               0.044689
Comedy|Romance                             0.037292
Drama|Romance                              0.035854
                                             ...   
Adventure|Animation|Comedy|Fantasy|IMAX    0.000103
Horror|Thriller|Western                    0.000103
Drama|Fantasy|Horror|Romance|Thriller      0.000103
Action|Adventure|Sci-Fi|War|IMAX           0.000103
Action|Animation|Crime|Sci-Fi              0.000103
Name: genres, Length: 951, dtype: float64