In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from Netflix.data import Movies

In [3]:
# Chris dataset
df = pd.read_csv('/Users/renatoboemer/code/boemer00/Netflix/raw_data/merged_movies_by_index.csv')

In [9]:
df.shape  # 80% = 8309 |  20% = 2077  

(10386, 34)

In [12]:
df.columns

Index(['avg_review_score', 'n_reviews', 'year', 'title', 'Title', 'Year',
       'Rated', 'Released', 'Runtime', 'Genre', 'Director', 'Writer', 'Actors',
       'Plot', 'Language', 'Country', 'Awards', 'Poster', 'Metascore',
       'imdbRating', 'imdbVotes', 'imdbID', 'Type', 'totalSeasons', 'Response',
       'Internet Movie Database', 'Index_match', 'DVD', 'BoxOffice',
       'Production', 'Website', 'Rotten Tomatoes', 'Metacritic', 'Ratings'],
      dtype='object')

## Data Split

In [11]:
# df = df[:8309]
# df_test = df[8309:]

(8310, 34)

In [None]:
# df_test.to_csv('df_test.csv', index=False)

## Data Wrangling

In [13]:
def data_wrangling(df):
    """ cleaning irrelevant rows and columns """ 
    
    # drop irrelevant columns
    df = df.drop(columns=['title', 'year', 'Awards', 'Poster', 'Metascore', 'DVD',
                          'BoxOffice', 'Internet Movie Database','totalSeasons',
                          'imdbVotes','Website', 'Response', 'Production', 'Metacritic', 'Ratings'])

    ## fill nan and' min', convert to int and replace zero for the mean
    df['Runtime'] = df['Runtime'].fillna(0).apply(lambda x: str(x).replace(',', ''))
    df['Runtime'] = df['Runtime'].apply(lambda x: float(str(x).replace(' min', '')))
    df['Runtime'] = df['Runtime'].replace(0, df['Runtime'].mean())
    
    ## fill nan and remove '%', convert to float and replace zero for the mean
    df['Rotten Tomatoes'] = df['Rotten Tomatoes'].fillna(0) 
    df['Rotten Tomatoes'] = df['Rotten Tomatoes'].apply(lambda x: float(str(x).replace('%', '')))
    df['Rotten Tomatoes'] = df['Rotten Tomatoes'].replace(0, df['Rotten Tomatoes'].mean())
    
    ## replace countries and genre with most frequent values
    freq_country = df[['Country']].value_counts().reset_index()['Country'][0]
    df['Country'] = df['Country'].replace(0, freq_country).replace('United States', freq_country)
    
    freq_genre = df['Genre'].mode()[0]
    df['Genre'] = df['Genre'].replace(np.nan, freq_genre)

    # replace null values with unknown
    df['Actors'] = df['Actors'].replace(np.nan,'unknown')
    df['Director'] = df['Director'].replace(np.nan,'unknown')
    df['Writer'] = df['Writer'].replace(np.nan,'unknown')
    df['Plot'] = df['Plot'].replace(np.nan,'unknown')
    
    ## Language binary (either contains English or not)
    df["Language"] = df[["Language"]].fillna("English")
    def language_binary(x):
        if x.find("English") != -1:
            return "English Available"
        else:
            return "English N/A"
    df["Language_binary"] = df["Language"].map(language_binary)
    
    return df
    

In [14]:
# questions: index_match?
data_wrangling(df)

Unnamed: 0,avg_review_score,n_reviews,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,Language,Country,imdbRating,imdbID,Type,Index_match,Rotten Tomatoes,Language_binary
0,3.749543,547,Dinosaur Planet,2003,,14-Dec-03,50.0,"Documentary, Animation, Family",unknown,unknown,"Christian Slater, Scott Sampson",A four-episode animated series charting the ad...,English,USA,7.7,tt0389605,series,1,48.979203,English Available
1,3.641153,2012,Character,1997,R,27-Mar-98,122.0,"Crime, Drama, Mystery",Mike van Diem,"Ferdinand Bordewijk (short story ""Dreverhaven ...","Jan Decleir, Fedja van Huêt, Betty Schuurman, ...","Jacob Katadreuffe lives mute with his mother, ...","Dutch, English, German, French","Netherlands, Belgium",7.7,tt0119448,movie,3,92.000000,English Available
2,3.084396,1019,Sick,1997,Not Rated,07-Nov-97,90.0,Documentary,Kirby Dick,unknown,"Kathe Burkhart, Kirby Dick, Bob Flanagan, Sher...",Diagnosed with cystic fibrosis from a young ag...,English,USA,7.5,tt0120126,movie,6,91.000000,English Available
3,2.129032,93,8 Man,1992,,,83.0,"Action, Sci-Fi",Yasuhiro Horiuchi,"Kazumasa Hirai (comic-book), Jirô Kuwata (comi...","Kai Shishido, Etsushi Takahashi, Sachiko Ayase...",After Tokyo police officer Yokoda is killed in...,Japanese,Japan,5.4,tt0182668,movie,7,48.979203,English N/A
4,3.417582,546,My Favorite Brunette,1947,Passed,04-Apr-47,87.0,"Comedy, Crime, Mystery, Romance, Thriller",Elliott Nugent,"Edmund Beloin (original screenplay), Jack Rose...","Bob Hope, Dorothy Lamour, Peter Lorre, Lon Cha...",Shortly before his execution on the death row ...,English,USA,6.8,tt0039645,movie,12,75.000000,English Available
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10381,3.411855,1957,Interiors,1978,PG,06-Oct-78,92.0,Drama,Woody Allen,Woody Allen,"Diane Keaton, Geraldine Page, Kristin Griffith",Three sisters find their lives spinning out of...,English,USA,7.4,tt0077742,movie,17763,79.000000,English Available
10382,3.867112,64957,Shakespeare in Love,1998,R,08-Jan-99,123.0,"Comedy, Drama, History, Romance",John Madden,"Marc Norman, Tom Stoppard","Geoffrey Rush, Tom Wilkinson, Steven O'Donnell...","The world's greatest ever playwright, William ...",English,"USA, UK",7.1,tt0138097,movie,17764,92.000000,English Available
10383,2.839207,1362,Epoch,2001,PG-13,24-Nov-01,96.0,"Sci-Fi, Thriller",Matt Codd,"Jonathan Raymond (screenplay), Jonathan Raymon...","David Keith, Stephanie Niznik, Ryan O'Neal, Br...",Disaster strikes as a specialised team of inve...,English,USA,5.0,tt0233657,movie,17768,16.000000,English Available
10384,2.498592,6749,The Company,2003,PG-13,20-May-04,112.0,"Drama, Music, Romance",Robert Altman,"Neve Campbell (story), Barbara Turner (story),...","Neve Campbell, Malcolm McDowell, James Franco,...",Ensemble drama centered around a group of ball...,English,"Germany, USA",6.3,tt0335013,movie,17769,72.000000,English Available
