In [5]:
# Optional (Google Colab only): mount Drive before pointing MOVIE at the data.
# from google.colab import drive
# drive.mount('/content/drive')

import os  # imported here because this cell runs before the main import cell

# Project root that contains the `Movies/` and `Persons/` data folders.
# Set the MOVIE_DATA_DIR environment variable to run the notebook on another
# machine; the fallback is the original author's local checkout.
MOVIE = os.environ.get('MOVIE_DATA_DIR',
                       '/home/nghia/Desktop/Lab/MovieRevenuePredicter')

In [4]:
# Shell escape: print the kernel's current working directory.
!pwd

/home/nghia/Desktop/Lab/MovieRevenuePredicter


In [12]:
# import necessary libs:
import os
import gc
import re
import csv
import json
import time
import pickle
# import itertools
import numpy as np
import pandas as pd
from re import sub
from decimal import Decimal
# import seaborn as sns
# from statistics import mode
import lightgbm as lgb
import catboost as cbt
import matplotlib.pyplot as plt
# from sklearn.metrics import confusion_matrix
# from sklearn.preprocessing import LabelEncoder
from datetime import datetime, timedelta, date, timezone

### Parse functions:

In [9]:
def get_id(link):
  """
  Get id of movies/person from url

  _____
  Params:
    - link: an url to movie/person detail, containing 'id=<id>.htm'

  _____
  Returns:
    The text between 'id=' and the first following '.htm'.

  _____
  Note:
    Return '-' if input link is incorrect (no 'id=....htm' part) or is not
    a string at all (e.g. None or NaN from a missing value).
  """
  if not isinstance(link, str):
    return '-'
  # The dot is escaped so that only a literal '.htm' ends the id; with a bare
  # '.' an id such as 'ahtm' would be cut short.
  match = re.search(r'id=(.*?)\.htm', link)
  if match is None:
    return '-'
  return match.group(1)
  

def money_str2float(s):
  """
  Convert money string to float

  _____
  Params:
    - s: money string like: "$123", "$12,534,817" or "$3 million"

  _____
  Returns:
    The amount as a float; a "million" / "billion" suffix scales the number.

  _____
  Notes:
    Return None if input is 'N/A', '-', empty, None, NaN, or a string that
    contains no digit at all.
    A value that is already numeric is returned as float.
  """
  if s is None:
    return None
  if not isinstance(s, str):
    # Missing values reach this function as float NaN; NaN is the only value
    # that is not equal to itself.
    if s != s:
      return None
    return float(s)

  text = s.strip()
  if text in ('', '-', 'N/A'):
    return None

  # Keep only digits and the decimal point ("$12,534,817" -> "12534817").
  digits = sub(r'[^\d.]', '', text)
  if not any(ch.isdigit() for ch in digits):
    return None

  amount = float(digits)
  lowered = text.lower()
  if 'billion' in lowered:
    amount *= 1e9
  elif 'million' in lowered:
    amount *= 1e6
  return amount
  
  
def parse_datetime(dt, patent='%B %d, %Y'):
  """
  Convert time string to datetime datatype

  _____
  Params:
    - dt: datetime string
    - patent: strptime format of the time-string, default: '%B %d, %Y'
      (e.g. "September 24, 1982")

  _____
  Notes:
    Return None if the input does not match the format (such as 'N/A' or
    '-') or is not a string (None, NaN).
  """
  try:
    return datetime.strptime(dt, patent)
  except (TypeError, ValueError):
    # TypeError: dt is not a string; ValueError: dt does not match `patent`.
    return None
  
  
def time_str2mins(t):
  """
  Convert runtime string to number of minute

  _____
  Params:
    - t: runtime string like '1 hrs. 44 min.', '2 hrs.' or '44 min.'

  _____
  Returns:
    Total minutes as int. With two numbers the first is hours and the second
    minutes; a single number is hours when followed by an 'h' unit,
    otherwise minutes.

  _____
  Notes:
    Return None if the input holds no number (such as 'N/A' or '-') or is
    not a string (None, NaN).
  """
  if not isinstance(t, str):
    return None
  numbers = re.findall(r'\d+', t)
  if not numbers:
    return None
  if len(numbers) == 1:
    value = int(numbers[0])
    # '2 hrs.' carries hours only; without this check it would count as 2 minutes.
    if re.search(r'\d\s*h', t, re.IGNORECASE):
      return value * 60
    return value
  return int(numbers[0]) * 60 + int(numbers[1])

## 1. Movies

### Load movies data:

In [6]:
# Every file in <MOVIE>/Movies/ is expected to hold one movie record as JSON.
movies_dir = MOVIE+'/Movies/'
movies = [os.path.join(movies_dir, movie) for movie in os.listdir(movies_dir)]

movies_as_json = []
skipped_movies = []  # paths whose content could not be decoded as JSON
for movie in movies:
    with open(movie, 'r') as f:
        try:
            movies_as_json.append(json.load(f))
        except ValueError:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors;
            # keep the path so dropped files can be inspected afterwards.
            skipped_movies.append(movie)

df = pd.DataFrame(movies_as_json)

### Remove bad rows

In [7]:
# Rows whose `genre` is one of these labels are discarded. The same labels
# appear in the `mpaa_rating` column, so presumably such a row was scraped
# with shifted fields -- TODO confirm against the scraper.
rating_labels = ['Unrated', 'PG-13', 'Not Yet Rated', 'R']
genre_is_valid = ~df['genre'].isin(rating_labels)
df = df[genre_is_valid]

# Keep a row when at least one of revenue / domestic is not the '-' placeholder.
has_some_gross = (df['revenue'] != '-') | (df['domestic'] != '-')
df = df[has_some_gross]
df.head()

Unnamed: 0,actor,budget,composer,director,domestic,genre,id,lastseen,mpaa_rating,name,producer,release_date,revenue,runtime,writer
0,[],,-,[],"$12,534,817",Horror,amityville2,-,R,Amityville II:\nThe Possession,[],"September 24, 1982",-,1 hrs. 44 min.,[]
1,[],,-,[https://www.boxofficemojo.com/people/chart/?v...,"$44,722",Drama,warathome,-,R,The War at Home,[],"November 22, 1996",-,1 hrs. 59 min.,[]
2,[],,-,[],"$6,646",Foreign,darkbluealmostblack,-,Unrated,DarkBlueAlmostBlack,[],"October 19, 2007","$2,171,323",1 hrs. 45 min.,[]
3,[],,-,[],"$137,221",Horror Comedy,severance,-,R,Severance,[],"May 18, 2007","$5,515,163",1 hrs. 36 min.,[]
4,[https://www.boxofficemojo.com/people/chart/?v...,,,[https://www.boxofficemojo.com/people/chart/?v...,"$63,270,710",Comedy,dave,-,PG-13,Dave,[https://www.boxofficemojo.com/people/chart/?v...,"May 7, 1993",-,1 hrs. 50 min.,[https://www.boxofficemojo.com/people/chart/?v...


### Convert string columns to correct data type:

In [13]:
# Build a new frame with typed columns. `assign` returns a copy, so nothing is
# written through a filtered slice and the composer fill is guaranteed to land
# in `df` (a chained `df.composer.fillna(..., inplace=True)` is not).
df = df.assign(
    # money strings -> float (None/NaN when unknown)
    budget=df['budget'].apply(money_str2float),
    domestic=df['domestic'].apply(money_str2float),
    revenue=df['revenue'].apply(money_str2float),
    # 'September 24, 1982' -> datetime, '1 hrs. 44 min.' -> minutes
    release_date=df['release_date'].apply(parse_datetime),
    runtime=df['runtime'].apply(time_str2mins),
    # composer is a single link; missing values become the '-' placeholder
    composer=df['composer'].fillna('-').apply(get_id),
)

# These columns hold a list of links per movie; reduce every link to its id.
for feat in ['actor', 'director', 'producer', 'writer']:
    df[feat] = df[feat].apply(lambda x: [get_id(item) for item in x])


df.head()

Unnamed: 0,actor,budget,composer,director,domestic,genre,id,lastseen,mpaa_rating,name,producer,release_date,revenue,runtime,writer
0,[],,-,[],12534817.0,Horror,amityville2,-,R,Amityville II:\nThe Possession,[],1982-09-24,,104.0,[]
1,[],,-,[kathybates],44722.0,Drama,warathome,-,R,The War at Home,[],1996-11-22,,119.0,[]
2,[],,-,[],6646.0,Foreign,darkbluealmostblack,-,Unrated,DarkBlueAlmostBlack,[],2007-10-19,2171323.0,105.0,[]
3,[],,-,[],137221.0,Horror Comedy,severance,-,R,Severance,[],2007-05-18,5515163.0,96.0,[]
4,"[kevinkline, sigourneyweaver, franklangella, b...",,-,[ivanreitman],63270710.0,Comedy,dave,-,PG-13,Dave,"[laurenschulerdonner, joemedjuck, ivanreitman]",1993-05-07,,110.0,[garyross]


## 2. Person:

### Load person data:

In [48]:
# Every file in <MOVIE>/Persons/ is expected to hold one person record as JSON.
persons_dir = MOVIE+'/Persons/'
persons = [os.path.join(persons_dir, person) for person in os.listdir(persons_dir)]

persons_as_json = []
skipped_persons = []  # paths whose content could not be decoded as JSON
for person in persons:
    with open(person, 'r') as f:
        try:
            persons_as_json.append(json.load(f))
        except ValueError:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors;
            # keep the path so dropped files can be inspected afterwards.
            skipped_persons.append(person)
df1 = pd.DataFrame(persons_as_json)

In [49]:
# Rebuild df1 from the raw person records and preview the first rows.
# Re-running this cell resets df1 to the raw records: a later cell replaces
# `df1.movies` (a list of film dicts) with a plain list of titles.
df1 = pd.DataFrame(persons_as_json)
df1.head()

Unnamed: 0,average,id,movies,name,role
0,24993931,naomiwatts,"[{'date': '8/11/17', 'title': 'theglasscastle'...",Naomi Watts,Actor
1,22348241,andrenemec,"[{'date': '1/30/15', 'title': 'almanac', 'stud...",Andre Nemec,Producer
2,707343,leonardodicaprio,"[{'date': '8/17/07', 'title': '11thhour', 'stu...",Leonardo DiCaprio,Writer
3,1064454,jimwilson,"[{'date': '3/21/14', 'title': '50to1', 'studio...",Jim Wilson,Writer
4,73969834,margotrobbie,"[{'date': '12/7/18', 'title': 'maryqueenofscot...",Margot Robbie,Actor


### Processing persons data:

In [17]:
def _pluck(field):
    """Build a function that collects `field` from every film dict of a list."""
    def collect(films):
        return [film[field] for film in films]
    return collect


# `studio` must be extracted first: the next line overwrites `movies` (a list
# of film dicts) with the list of titles only.
df1['studio'] = df1['movies'].apply(_pluck('studio'))
df1['movies'] = df1['movies'].apply(_pluck('title'))

# average gross: money string -> float
df1['average'] = df1['average'].apply(money_str2float)
df1.head()

Unnamed: 0,average,id,movies,name,role,studio
0,24993931.0,naomiwatts,"[theglasscastle, thebookofhenry, chuck, aboutr...",Naomi Watts,Actor,"[lionsgate, focus, ifc, weinsteincompany, euro..."
1,22348241.0,andrenemec,[almanac],Andre Nemec,Producer,[paramount]
2,707343.0,leonardodicaprio,[11thhour],Leonardo DiCaprio,Writer,[wip]
3,1064454.0,jimwilson,[50to1],Jim Wilson,Writer,[tenfurlongs]
4,73969834.0,margotrobbie,"[maryqueenofscots, peterrabbit, itonya, goodby...",Margot Robbie,Actor,"[focus, sony, neon, foxsearchlight, warnerbros..."


### In progress:

In [29]:
# c_actors = df1.id.values
# actors = []
# for l_actor in df.actor.values.tolist():
#     actors += l_actor
# actors = np.unique((np.array(actors)))
# with open('more_people.txt', 'w') as f:
#     for act in np.setdiff1d(actors, c_actors):
#         f.write(act + '\n')

In [258]:
# Summary statistics computed for every window of a person's filmography.
stats = ['avg', 'max', 'min', 'med', 'std']

# Feature names, in the order the values are produced by get_person_info:
# the two counters, then for each metric (gross, opening) the five stats over
# all films, the first 3, the first 5, and the top-3 grossing films.
person_feats = ['num_film', 'years'] + [
    metric + window + stat
    for metric in ('gross_', 'opening_')
    for window in ('', 'last3_', 'last5_', 'top3_')
    for stat in stats
]



def date2str(_date):
    """Format a date as 'day/month/year' without zero padding, e.g. '11/8/2017'."""
    return '/'.join(str(part) for part in (_date.day, _date.month, _date.year))
def str2date(s, pivot_year=2018):
    """Parse a release-date string into a `datetime.date`.

    Params:
      - s: either 'month/day/year' (e.g. '8/11/17') or any other text that
        contains a year number (e.g. '2015').
      - pivot_year: latest year a two-digit year may expand to. '18' becomes
        2018 while '19' becomes 1919 with the default of 2018.

    Returns:
      - for 'month/day/year': that exact date; a two-digit year is expanded
        around `pivot_year`, a year of 100 or more is taken literally;
      - otherwise: June 1st of the first number found in `s`.

    Raises:
      - IndexError when `s` is not 'm/d/y' and contains no digits (e.g. 'N/A');
      - ValueError when the parts do not form a valid date.
    """
    date_str = s.split('/')

    if len(date_str) == 3:
        month = int(date_str[0])
        day = int(date_str[1])
        year = int(date_str[-1])
        if year < 100:
            # Two-digit year: assume 20xx unless that lies past the pivot.
            year += 2000
            if year > pivot_year:
                year -= 100
    else:
        # Only a year is known: use mid-year as a neutral placeholder.
        year = int(re.findall(r'[\d]+', s)[0])
        month = 6
        day = 1

    return date(year, month, day)


def _five_stats(values):
    """Return [mean, max, min, median, population std] of a 1-D numeric array."""
    return [values.mean(), values.max(), values.min(),
            np.median(values), np.std(values)]


def get_person_info(person_json, to_date):
    """Summarise a person's filmography released strictly before `to_date`.

    Params:
      - person_json: list of film dicts with at least the keys 'date',
        'lifetimeGross' and 'opening'.
      - to_date: `datetime.date`; only films dated before it are used.

    Returns:
      A list aligned with `person_feats`: [num_film, years], then the five
      stats (mean, max, min, median, std) of lifetimeGross over all films,
      the first 3 rows, the first 5 rows and the top-3 grossing films,
      followed by the same twenty values for `opening`.
      A list of None of the same length when no usable film remains.

    Notes:
      - Films whose gross/opening is '/a' or whose date is 'N/A' are ignored.
      - The "last3"/"last5" windows are simply the first 3/5 rows, i.e. they
        assume `person_json` is ordered newest first -- TODO confirm.
      - The top-3 window keeps every film tied with the third-highest gross.
    """
    films = pd.DataFrame(person_json)
    if films.empty:
        return [None] * len(person_feats)

    usable = ((films['lifetimeGross'] != '/a')
              & (films['opening'] != '/a')
              & (films['date'] != 'N/A'))
    # .copy() so the column assignments below do not write through a slice.
    films = films[usable].copy()
    films['date'] = films['date'].apply(str2date)

    films = films[films['date'] < to_date]
    if films.empty:
        return [None] * len(person_feats)
    films = films.assign(lifetimeGross=films['lifetimeGross'].astype('float'),
                         opening=films['opening'].astype('float'))

    num_film = films.shape[0]
    # Span of active years, independent of the row order.
    release_years = [int(released.year) for released in films['date']]
    years = 1 + max(release_years) - min(release_years)

    # Gross of the third best film, or the lowest gross with fewer than 3 films.
    gross_desc = films['lifetimeGross'].sort_values(ascending=False).values
    if len(gross_desc) >= 3:
        top3 = gross_desc[2]
    else:
        top3 = films['lifetimeGross'].min()

    gross = []
    opening = []
    windows = [films, films.head(3), films.head(5),
               films[films['lifetimeGross'] >= top3]]
    for window in windows:
        gross += _five_stats(window['lifetimeGross'].values)
        opening += _five_stats(window['opening'].values)

    return [num_film, years] + gross + opening

In [259]:
info_list = []

# Build from the raw person records rather than from df1: an earlier cell
# replaces `df1.movies` with plain title strings, and `film['date']` below
# needs the original film dicts. This keeps the cell working when the
# notebook is run top to bottom.
df2 = pd.DataFrame(persons_as_json)[['id', 'movies', 'role']]

# One row per (person, film): id, film date, film title, role, full filmography.
for person_id, films, role in df2.itertuples(index=False, name=None):
    for film in films:
        info_list.append([person_id, film['date'], film['title'], role, films])

In [260]:
# Naming the columns at construction also works when info_list is empty
# (assigning five names to a zero-column frame raises).
df_info = pd.DataFrame(
    info_list,
    columns=['person_name', 'date', 'movie', 'role', 'movie_list'])
# Films without a release date cannot be placed in time; drop them.
# .copy() makes the result an independent frame, since later cells add and
# drop columns on it.
df_info = df_info[df_info['date'] != 'N/A'].copy()
df_info.head(3)

Unnamed: 0,person_name,date,movie,role,movie_list
0,naomiwatts,8/11/17,theglasscastle,Actor,"[{'date': '8/11/17', 'title': 'theglasscastle'..."
1,naomiwatts,6/16/17,thebookofhenry,Actor,"[{'date': '8/11/17', 'title': 'theglasscastle'..."
2,naomiwatts,5/5/17,chuck,Actor,"[{'date': '8/11/17', 'title': 'theglasscastle'..."


In [261]:
# For every (person, film) row compute the person's statistics over the films
# released before this one. Columns are addressed by label: positional access
# (x[-1], x[1]) on a labelled row is deprecated in pandas and would pick
# `more_feats` instead of `movie_list` if this cell were run a second time.
df_info['more_feats'] = df_info.apply(
    lambda row: get_person_info(row['movie_list'], str2date(row['date'])),
    axis=1)
df_info.head()

Unnamed: 0,person_name,date,movie,role,movie_list,more_feats
0,naomiwatts,8/11/17,theglasscastle,Actor,"[{'date': '8/11/17', 'title': 'theglasscastle'...","[41, 26, 23290240.17073171, 218080025.0, 20444..."
1,naomiwatts,6/16/17,thebookofhenry,Actor,"[{'date': '8/11/17', 'title': 'theglasscastle'...","[40, 26, 23759871.825, 218080025.0, 20444.0, 7..."
2,naomiwatts,5/5/17,chuck,Actor,"[{'date': '8/11/17', 'title': 'theglasscastle'...","[38, 25, 24997847.684210528, 218080025.0, 2044..."
3,naomiwatts,5/5/17,aboutray,Actor,"[{'date': '8/11/17', 'title': 'theglasscastle'...","[38, 25, 24997847.684210528, 218080025.0, 2044..."
4,naomiwatts,11/11/16,shutin,Actor,"[{'date': '8/11/17', 'title': 'theglasscastle'...","[37, 25, 25486969.64864865, 218080025.0, 20444..."


In [262]:
# tmp = pd.DataFrame(df_info.movie_list.values[0])
# tmp = tmp[tmp['lifetimeGross'] != '/a']
# tmp = tmp[tmp['opening'] != '/a']
# tmp['date'] = tmp['date'].apply(str2date)

# tmp[tmp['date'] < str2date('6/16/17')]

In [263]:
# Spread the per-row feature list into one column per name in person_feats.
for position, feat_name in enumerate(person_feats):
    df_info[feat_name] = df_info['more_feats'].apply(
        lambda feats, position=position: feats[position])
# The helper columns are no longer needed once the features are expanded.
df_info.drop(columns=['date', 'movie_list', 'more_feats'], inplace=True)

# NOTE(review): save_obj is not defined in the cells shown here; presumably it
# persists the object under the given path -- confirm.
save_obj(df_info, 'obj/people_feats')
df_info.head()

Unnamed: 0,person_name,movie,role,num_film,years,gross_avg,gross_max,gross_min,gross_med,gross_std,...,opening_last5_avg,opening_last5_max,opening_last5_min,opening_last5_med,opening_last5_std,opening_top3_avg,opening_top3_max,opening_top3_min,opening_top3_med,opening_top3_std
0,naomiwatts,theglasscastle,Actor,41.0,26.0,23290240.0,218080025.0,20444.0,7220243.0,43370340.0,...,1035952.8,3613567.0,1877.0,105215.0,1395239.0,39136406.0,52263680.0,15015393.0,50130145.0,17078360.0
1,naomiwatts,thebookofhenry,Actor,40.0,26.0,23759870.0,218080025.0,20444.0,7403864.0,43806030.0,...,971053.2,3613567.0,1877.0,105215.0,1383149.0,39136406.0,52263680.0,15015393.0,50130145.0,17078360.0
2,naomiwatts,chuck,Actor,38.0,25.0,24997850.0,218080025.0,20444.0,7823986.0,44601730.0,...,6794104.4,29027348.0,1877.0,1100042.0,11190240.0,39136406.0,52263680.0,15015393.0,50130145.0,17078360.0
3,naomiwatts,aboutray,Actor,38.0,25.0,24997850.0,218080025.0,20444.0,7823986.0,44601730.0,...,6794104.4,29027348.0,1877.0,1100042.0,11190240.0,39136406.0,52263680.0,15015393.0,50130145.0,17078360.0
4,naomiwatts,shutin,Actor,37.0,25.0,25486970.0,218080025.0,20444.0,8060487.0,45099760.0,...,16524127.0,52263680.0,1877.0,1100042.0,21024310.0,39136406.0,52263680.0,15015393.0,50130145.0,17078360.0


In [264]:
# Read back what was stored under 'obj/people_feats'.
# NOTE(review): load_obj is not defined in the cells shown here; presumably it
# is the counterpart of save_obj -- confirm.
df_tmp = load_obj('obj/people_feats')

In [267]:
# Show the first 20 rows of the reloaded object.
df_tmp.head(20)

Unnamed: 0,person_name,movie,role,num_film,years,gross_avg,gross_max,gross_min,gross_med,gross_std,...,opening_last5_avg,opening_last5_max,opening_last5_min,opening_last5_med,opening_last5_std,opening_top3_avg,opening_top3_max,opening_top3_min,opening_top3_med,opening_top3_std
0,naomiwatts,theglasscastle,Actor,41.0,26.0,23290240.0,218080025.0,20444.0,7220243.0,43370340.0,...,1035952.8,3613567.0,1877.0,105215.0,1395239.0,39136410.0,52263680.0,15015393.0,50130145.0,17078360.0
1,naomiwatts,thebookofhenry,Actor,40.0,26.0,23759870.0,218080025.0,20444.0,7403864.0,43806030.0,...,971053.2,3613567.0,1877.0,105215.0,1383149.0,39136410.0,52263680.0,15015393.0,50130145.0,17078360.0
2,naomiwatts,chuck,Actor,38.0,25.0,24997850.0,218080025.0,20444.0,7823986.0,44601730.0,...,6794104.4,29027348.0,1877.0,1100042.0,11190240.0,39136410.0,52263680.0,15015393.0,50130145.0,17078360.0
3,naomiwatts,aboutray,Actor,38.0,25.0,24997850.0,218080025.0,20444.0,7823986.0,44601730.0,...,6794104.4,29027348.0,1877.0,1100042.0,11190240.0,39136410.0,52263680.0,15015393.0,50130145.0,17078360.0
4,naomiwatts,shutin,Actor,37.0,25.0,25486970.0,218080025.0,20444.0,8060487.0,45099760.0,...,16524127.0,52263680.0,1877.0,1100042.0,21024310.0,39136410.0,52263680.0,15015393.0,50130145.0,17078360.0
5,naomiwatts,theseaoftrees,Actor,36.0,25.0,26194370.0,218080025.0,34410.0,8450470.0,45518930.0,...,16608631.0,52263680.0,227688.0,1100042.0,20958470.0,39136410.0,52263680.0,15015393.0,50130145.0,17078360.0
6,naomiwatts,demolition,Actor,35.0,25.0,26886250.0,218080025.0,34410.0,8840453.0,45977590.0,...,16410598.2,52263680.0,109878.0,424397.0,21108220.0,39136410.0,52263680.0,15015393.0,50130145.0,17078360.0
7,naomiwatts,allegiant,Actor,34.0,24.0,25730440.0,218080025.0,34410.0,8450470.0,46144950.0,...,10618679.4,52263680.0,67754.0,227688.0,20822870.0,39136410.0,52263680.0,15015393.0,50130145.0,17078360.0
8,naomiwatts,whilewereyoung,Actor,33.0,24.0,26280220.0,218080025.0,34410.0,8840453.0,46729060.0,...,10597103.0,52263680.0,67754.0,119806.0,20833680.0,39136410.0,52263680.0,15015393.0,50130145.0,17078360.0
9,naomiwatts,insurgent,Actor,32.0,23.0,23033380.0,218080025.0,34410.0,8450470.0,43634370.0,...,1105542.6,4805878.0,67754.0,119806.0,1854533.0,33403590.0,50130145.0,15015393.0,35065237.0,14383610.0
