In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from collections import Counter

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel

TRAIN_COLUMNS = dict()

def nan_to_tuple(x):
    return x if x else tuple()

def add_dummies_test(df: pd.DataFrame):
    mlb = MultiLabelBinarizer()

    # Keywords: (10003 unique)
    dummy_keywords = pd.DataFrame(mlb.fit_transform(df['Keywords.id'].apply(nan_to_tuple)),
                                    columns=[f"keyword_{kw_id}" for kw_id in mlb.classes_], 
                                    index=df.index)
    # keyword_hist = dummy_keywords.sum()
    # dummy_keywords = dummy_keywords.loc[:, keyword_hist > keyword_hist.quantile(0.99) ] # Top 1% keywords
    dummy_keywords = dummy_keywords[TRAIN_COLUMNS['dummy_keywords']]
    df = pd.concat([df, dummy_keywords], axis=1)

    # Genres:
    dummy_genres = pd.DataFrame(mlb.fit_transform(df.genres.apply(nan_to_tuple)),
                            columns=[f"genre_{cl}" for cl in mlb.classes_], 
                            index=df.index)
    df = pd.concat([df, dummy_genres], axis=1)

    # Companies:
    dummy_companies = pd.DataFrame(mlb.fit_transform(df['production_companies.id'].apply(nan_to_tuple)),
                                columns=[f"company_{cl}" for cl in mlb.classes_], 
                                index=df.index)
    dummy_companies = dummy_companies[TRAIN_COLUMNS['dummy_companies']]
    df = pd.concat([df, dummy_companies], axis=1) # Maybe biggest company size is enough...   

    # Production countries:
    dummy_countries =pd.DataFrame(mlb.fit_transform(df.production_countries.apply(nan_to_tuple)),
                                columns=[f"country_{cl}" for cl in mlb.classes_], 
                                index=df.index)
    dummy_countries = dummy_countries[TRAIN_COLUMNS['dummy_countries']]                                
    df = pd.concat([df, dummy_countries], axis=1)

    # Spoken Languages:
    dummy_lang = pd.DataFrame(mlb.fit_transform(df.spoken_languages.apply(nan_to_tuple)),
                            columns=[f"spoken_lang_{cl}" for cl in mlb.classes_], 
                            index=df.index)
    dummy_lang = dummy_lang[TRAIN_COLUMNS['dummy_lang']]                            
    df = pd.concat([df, dummy_lang], axis=1)

    # Cast:
    # dummy_cast = pd.DataFrame(mlb.fit_transform(df['cast.id'].apply(nan_to_tuple)),
    #                             columns=[f"cast_{cl}" for cl in mlb.classes_], 
    #                             index=df.index)
    # cast_hist = dummy_cast.sum()                                
    # dummy_cast = dummy_cast.loc[:, cast_hist > 20] # Removes cast with lower movies than 20  
    # df = pd.concat([df, dummy_cast], axis=1)

    # Original Language dummy:
    dummy_orig_lang = pd.get_dummies(df.original_language, prefix="original_lang")
    dummy_orig_lang = dummy_orig_lang[TRAIN_COLUMNS['dummy_orig_lang']]
    df = pd.concat([df, dummy_orig_lang], axis=1)

    return df


def add_dummies_train(df: pd.DataFrame):
    mlb = MultiLabelBinarizer()

    # Keywords: (10003 unique)
    dummy_keywords = pd.DataFrame(mlb.fit_transform(df['Keywords.id'].apply(nan_to_tuple)),
                                    columns=[f"keyword_{kw_id}" for kw_id in mlb.classes_], 
                                    index=df.index)
    # keyword_hist = dummy_keywords.sum()
    # dummy_keywords = dummy_keywords.loc[:, keyword_hist > keyword_hist.quantile(0.99) ] # Top 1% keywords
    dummy_keywords = dummy_keywords[dummy_keywords.sum().nlargest(20).index]
    df = pd.concat([df, dummy_keywords], axis=1)
    TRAIN_COLUMNS['dummy_keywords'] = dummy_keywords.columns

    # Genres:
    dummy_genres = pd.DataFrame(mlb.fit_transform(df.genres.apply(nan_to_tuple)),
                            columns=[f"genre_{cl}" for cl in mlb.classes_], 
                            index=df.index)
    df = pd.concat([df, dummy_genres], axis=1)

    # Companies:
    dummy_companies = pd.DataFrame(mlb.fit_transform(df['production_companies.id'].apply(nan_to_tuple)),
                                columns=[f"company_{cl}" for cl in mlb.classes_], 
                                index=df.index)
    dummy_companies = dummy_companies[dummy_companies.sum().nlargest(10).index]
    df = pd.concat([df, dummy_companies], axis=1) # Maybe biggest company size is enough...   
    TRAIN_COLUMNS['dummy_companies'] = dummy_companies.columns

    # Production countries:
    dummy_countries =pd.DataFrame(mlb.fit_transform(df.production_countries.apply(nan_to_tuple)),
                                columns=[f"country_{cl}" for cl in mlb.classes_], 
                                index=df.index)
    dummy_countries = dummy_countries[dummy_countries.sum().nlargest(10).index]                                
    df = pd.concat([df, dummy_countries], axis=1)
    TRAIN_COLUMNS['dummy_countries'] = dummy_countries.columns

    # Spoken Languages:
    dummy_lang = pd.DataFrame(mlb.fit_transform(df.spoken_languages.apply(nan_to_tuple)),
                            columns=[f"spoken_lang_{cl}" for cl in mlb.classes_], 
                            index=df.index)
    dummy_lang = dummy_lang[dummy_lang.sum().nlargest(10).index]                            
    df = pd.concat([df, dummy_lang], axis=1)
    TRAIN_COLUMNS['dummy_lang'] = dummy_lang.columns

    # Cast:
    # dummy_cast = pd.DataFrame(mlb.fit_transform(df['cast.id'].apply(nan_to_tuple)),
    #                             columns=[f"cast_{cl}" for cl in mlb.classes_], 
    #                             index=df.index)
    # cast_hist = dummy_cast.sum()                                
    # dummy_cast = dummy_cast.loc[:, cast_hist > 20] # Removes cast with lower movies than 20  
    # df = pd.concat([df, dummy_cast], axis=1)

    # Original Language dummy:
    dummy_orig_lang = pd.get_dummies(df.original_language, prefix="original_lang")
    dummy_orig_lang = dummy_orig_lang[dummy_orig_lang.sum().nlargest(10).index]
    df = pd.concat([df, dummy_orig_lang], axis=1)
    TRAIN_COLUMNS['dummy_orig_lang'] = dummy_orig_lang.columns

    return df


def map_and_max(collection, mapping_dict):
    return max(map(mapping_dict.get, collection)) if collection else None

def eval_or_nan(obj):
    if obj and pd.notnull(obj) and isinstance(obj, str):
        return eval(obj)
    return None

def map_attribute(obj, attribute_name: str):
    if obj:
        iterable = eval(obj) if isinstance(obj, str) else obj
        return tuple(map(lambda x: x.get(attribute_name, None), iterable))
    return None

def smart_len(x, split_char= None):
    if split_char:
        return len(x.split(" ")) if pd.notnull(x) else 0
    return len(x) if pd.notnull(x) else 0

def features_flattening(df: pd.DataFrame):
    df['belongs_to_collection'] = df.belongs_to_collection.apply(eval_or_nan)
    df['belongs_to_collection.id'] = df.belongs_to_collection\
                                            .apply(lambda x: None if pd.isna(x) else x['id']).astype('Int64')


    df['genres'] = df.genres.apply(lambda gs: tuple(g['name'] for g in eval(gs)))

    df['production_companies'] = df.production_companies.apply(eval_or_nan)
    df['production_companies.id'] = df.production_companies\
                                            .apply(lambda companies: map_attribute(companies, 'id'))
    df['production_companies.origin_country'] = df.production_companies\
                                            .apply(lambda companies: map_attribute(companies, 'origin_country'))

    df['production_countries'] = df.production_countries.apply(lambda countries: map_attribute(countries, 'iso_3166_1'))

    df['release_date'] = pd.to_datetime(df.release_date)
    df['release_month'] = df.release_date.dt.month
    df['release_quarter'] = df.release_date.dt.quarter
    df['release_year'] = df.release_date.dt.year

    df['spoken_languages'] = df.spoken_languages.apply(lambda langs: map_attribute(langs, 'iso_639_1'))

    df['Keywords'] = df.Keywords.apply(eval_or_nan)
    df['Keywords.id'] =df.Keywords.apply(lambda keywords: map_attribute(keywords, 'id')) # TODO: Maybe keep words?

    df['cast'] = df.cast.apply(eval_or_nan)
    df['cast.id'] = df.cast.apply(lambda actors: map_attribute(actors, 'id'))
    df['cast.gender'] = df.cast.apply(lambda actors: map_attribute(actors, 'gender')) # Gender ratio

    df['crew'] = df.crew.apply(eval)
    df['crew.id'] = df.crew.apply(lambda crew: map_attribute(crew, 'id'))
    df['crew.gender'] = df.crew.apply(lambda crew: map_attribute(crew, 'gender')) # Gender ratio
    df['crew.department'] = df.crew.apply(lambda crew: map_attribute(crew, 'department')) # Dept size
    
    df.drop(['crew', 'cast', 'Keywords', 'belongs_to_collection', 'release_date'], axis=1, inplace=True)

    return df

def missing_value_imputation(df: pd.DataFrame):
    from sklearn.impute import KNNImputer

    df.budget.fillna(0, inplace=True)
    df.budget.replace(0, -1, inplace= True)
    
    df.runtime.fillna(0, inplace=True)
    df.runtime.replace(0, -1, inplace= True)

    imputer = KNNImputer(missing_values= -1)
    imputed = imputer.fit_transform(df)

    return pd.DataFrame(imputed, columns=df.columns,index=df.index)

def get_element_frequency(df, attribute):
    return Counter(df[attribute].dropna().sum())

# Gender actor ratio: 0 is unspecified, 1 is female, and 2 is male
def genders_ratio(genders):
    arr = np.array(genders)
    males = (arr == 1).sum()
    females = (arr == 2).sum()
    if males or females:
        return males / (females + males)
    return 0

def logarithmic_scaling(df):
    df['budget'] = df.budget.transform(np.log1p)
    df['revenue'] = df.revenue.transform(np.log1p)
    return df

def feature_extraction(df: pd.DataFrame, test: bool):
    removed_columns = ['backdrop_path', 'homepage', 'poster_path', 'imdb_id', 'video']
    X = df[[col for col in df.columns if col not in removed_columns]].copy().set_index('id')

    # Log scale big numbers:
    X = logarithmic_scaling(X)

    Y = X['revenue'].copy()
    X.drop('revenue', axis=1, inplace=True)

    # Flatten and extract:
    X = features_flattening(X)
    
    # Collection size:
    X['collection_size'] = X.groupby('belongs_to_collection.id')['belongs_to_collection.id']\
                                    .transform('count').fillna(0).astype(int).copy()

    # Company with most productions: (In data)
    company_size_dict = get_element_frequency(X, 'production_companies.id') # {company_id : company_size}
    X['biggest_production_company_size'] = X['production_companies.id']\
                                        .apply(lambda companies: map_and_max(companies, company_size_dict))\
                                        .fillna(0).astype(int)

    # Country with most production companies
    id_country_set = set(X.production_companies
                        .apply(lambda xs: [(x['id'], x['origin_country']) for x in xs if x['origin_country']])
                        .sum())
    company_per_country = Counter(country for comp_id, country in id_country_set)
    company_per_country[''] = 0 # Update no-countries to 0
    X['most_companies_country_size'] = X['production_companies.origin_country']\
                                    .apply(lambda companies: map_and_max(companies, company_per_country))\
                                    .fillna(0).astype(int)
    
    # Largest production country size:
    country_size_dict = get_element_frequency(X, 'production_countries') # {country : movie_count}
    X['most_productions_country_size'] = X['production_countries']\
                                        .apply(lambda countries: map_and_max(countries, country_size_dict))\
                                        .fillna(0).astype(int)
    # Males/ Females+Males ratio:
    X['cast.gender_ratio'] = X['cast.gender'].apply(genders_ratio)

    # Num of spoken languages
    X['spoken_lang_num'] = X.spoken_languages.apply(len)

    # Word\Char count:
    X['overview_word_count'] = X.overview.apply(lambda x: smart_len(x, ' ')) # Overview word-count
    X['tagline_char_count'] = X.tagline.apply(smart_len) # tagline character-count
    X['title_char_count'] = X.title.apply(smart_len) # title character-count

    # Cast size:
    X['cast_size'] = X['cast.id'].apply(smart_len)

    # Crew size:
    X['crew_size'] = X['crew.id'].apply(smart_len)

    # Dept. size:
    dept_size_df = X['crew.department'].apply(lambda x: pd.Series(Counter(x)))\
                        .add_suffix('_depart_size')\
                        .astype('Int64')
    dept_size_df.dropna(axis=1, thresh= dept_size_df.shape[0] * 0.20, inplace=True) # Drop columns with less than 20% data
    dept_size_df.fillna(0, inplace=True) # Missing value imputation with 0
    X = pd.concat([X, dept_size_df], axis=1)

    # Mean by years:
    mean_by_year = X.groupby("release_year")[['runtime', 'budget', 'popularity']]\
                        .aggregate('mean')\
                        .rename(columns= {  'runtime' : 'avg_runtime_by_year',
                                            'budget' : 'avg_budget_by_year',
                                            'popularity' : 'avg_popularity_by_year'})
    X = X.join(mean_by_year, how='left', on='release_year')

    # Original title changed:
    X['title_changed'] = (X['original_title'] != X['title'])

    # Add dummies:
    if test:
        X = add_dummies_test(X)
    else:
        X = add_dummies_train(X)

    # Drop unneccesry fields:
    tuple_fields = ['genres', 'spoken_languages', 'production_countries', 'production_companies.id', 'Keywords.id', 'cast.id', 'cast.gender', 'crew.id', 'crew.gender', 'crew.department', 'belongs_to_collection.id', 'production_companies', 'production_companies.origin_country']
    text_fields = ['original_language', 'original_title', 'overview', 'status', 'tagline', 'title']

    X.drop(tuple_fields+text_fields, axis=1, inplace=True)

    X = missing_value_imputation(X)

    return X, Y


In [2]:
train_raw = pd.read_csv('data/train.tsv',delimiter='\t')
train_X, train_Y = feature_extraction(train_raw, False)

In [3]:
test_raw = pd.read_csv('data/test.tsv',delimiter='\t')
test_X, test_Y = feature_extraction(test_raw, True)

In [4]:
train_X.columns[34:]

Index(['keyword_818', 'keyword_187056', 'keyword_9826', 'keyword_179431',
       'keyword_242', 'keyword_14819', 'keyword_5565', 'keyword_9672',
       'keyword_9663', 'keyword_10183', 'keyword_9748', 'keyword_6054',
       'keyword_9673', 'keyword_970', 'keyword_6149', 'keyword_6075',
       'keyword_380', 'keyword_4565', 'keyword_13130', 'keyword_179430',
       'genre_Action', 'genre_Adventure', 'genre_Animation', 'genre_Comedy',
       'genre_Crime', 'genre_Documentary', 'genre_Drama', 'genre_Family',
       'genre_Fantasy', 'genre_History', 'genre_Horror', 'genre_Music',
       'genre_Mystery', 'genre_Romance', 'genre_Science Fiction',
       'genre_TV Movie', 'genre_Thriller', 'genre_War', 'genre_Western',
       'company_174', 'company_33', 'company_4', 'company_5', 'company_25',
       'company_21', 'company_12', 'company_104', 'company_9195', 'company_2',
       'country_US', 'country_GB', 'country_FR', 'country_DE', 'country_CA',
       'country_IN', 'country_JP', 'country_IT

In [5]:
 set( test_X.columns ) - set( train_X.columns )

set()

In [6]:
diff = set( train_X.columns ) - set( test_X.columns )
diff

set()

In [7]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingClassifier

clf = RandomForestRegressor().fit(train_X, train_Y)
pred = clf.predict(test_X)

from sklearn.metrics import mean_squared_log_error
np.sqrt(mean_squared_log_error(test_Y, pred))

0.16380213633010557

In [8]:
# HyperParameters:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

# Number of trees in random forest
n_estimators = list(range(100, 4000, 100))

criterion = ['mse', 'mae']
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2', 0.2, 0.4, 0.6, 0.8]
# Maximum number of levels in tree
max_depth = list(range(10, 200, 10)) + [None]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 20]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 8, 16]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = { 'n_estimators': n_estimators,
                'criterion': criterion,
                'max_features': max_features,
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf,
                'bootstrap': bootstrap}

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(train_X, train_Y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


In [None]:
rf_random.best_params_

In [None]:

pred = rf_random.best_estimator_.predict(test_X)
from sklearn.metrics import mean_squared_log_error
np.sqrt(mean_squared_log_error(test_Y, pred))

In [3]:
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor

from sklearn.model_selection import GridSearchCV

loss = ['least_squares', 'least_absolute_deviation']
learning_rate = [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 1]
max_iter = [100, 200, 500, 1000, 1500, 1800, 2000, 3000]
max_leaf_nodes = [None, 31, 41, 53, 67, 79, 89, 97]
max_depth = [None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110]
min_samples_leaf = [2, 4, 16, 32, 64, 128, 256]
l2_regularization = [0, 0.001,  0.01, 0.1, 0.2, 0.4, 0.6, 1]
max_bins = [255, 2**7 - 1, 2**6 - 1, 2**5 - 1, 2**4 - 1, 2**3 - 1]

random_grid = {'loss': loss,
               'learning_rate': learning_rate,
               'max_iter': max_iter,
               'max_leaf_nodes': max_leaf_nodes,
               'max_depth': max_depth,
               'min_samples_leaf': min_samples_leaf,
               'l2_regularization': l2_regularization,
               'max_bins': max_bins}

# Use the random grid to search for best hyperparameters
# First create the base model to tune
hgb = HistGradientBoostingRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = GridSearchCV(estimator = hgb, param_grid = random_grid, cv = 3, verbose=2, n_jobs = 1, pre_dispatch='2*n_jobs')
# Fit the random search model
rf_random.fit(train_X, train_Y)

Fitting 3 folds for each of 4128768 candidates, totalling 12386304 fits
[CV] l2_regularization=0, learning_rate=0.001, loss=least_squares, max_bins=255, max_depth=None, max_iter=100, max_leaf_nodes=None, min_samples_leaf=2 
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV]  l2_regularization=0, learning_rate=0.001, loss=least_squares, max_bins=255, max_depth=None, max_iter=100, max_leaf_nodes=None, min_samples_leaf=2, total=  58.2s
[CV] l2_regularization=0, learning_rate=0.001, loss=least_squares, max_bins=255, max_depth=None, max_iter=100, max_leaf_nodes=None, min_samples_leaf=2 
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   58.2s remaining:    0.0s
[CV]  l2_regularization=0, learning_rate=0.001, loss=least_squares, max_bins=255, max_depth=None, max_iter=100, max_leaf_nodes=None, min_samples_leaf=2, total=   0.7s
[CV] l2_regularization=0, learning_rate=0.001, loss=least_squares, max_bins=255, max_depth=None, max_iter=100, max_leaf_nodes=Non

MemoryError: Unable to allocate 47.3 MiB for an array with shape (3565, 1738) and data type float64

In [None]:
from sklearn.impute import KNNImputer

train_X.budget.replace(0, -1, inplace= True)

In [None]:
imputer = KNNImputer(missing_values= -1)
imputed = imputer.fit_transform(train_X)

In [None]:
train_X.shape

In [None]:
imputed_train_X = pd.DataFrame(imputed, columns=train_X.columns,index=train_X.index)

In [None]:
train_X.budget

In [None]:
imputed_train_X.budget