In [6]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from collections import Counter

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel


def add_dummies(df: pd.DataFrame, Y: pd.Series):
    
    mlb = MultiLabelBinarizer()
    # Keywords: (10003 unique)
    dummy_keywords =pd.DataFrame(mlb.fit_transform(df['Keywords.id'].apply(lambda x: x if x else tuple())),
                                    columns=[f"keyword_{kw_id}" for kw_id in mlb.classes_], 
                                    index=df.index)

    dummy_keywords = dummy_keywords.loc[:, dummy_keywords.sum() > 1] # Removes keyword with 1 appearence

    tree_model = ExtraTreesClassifier(n_estimators=50)
    tree_model = tree_model.fit(dummy_keywords, Y)
    
    model = SelectFromModel(tree_model, prefit=True)
    X_new = model.transform(dummy_keywords)
    important_keywords = dummy_keywords.loc[:, model.get_support()] # Returns only selected features only
    df = pd.concat([df, important_keywords], axis=1)

    dummy_genres = pd.DataFrame(mlb.fit_transform(df.genres),
                            columns=[f"genre_{cl}" for cl in mlb.classes_], 
                            index=df.index)
    df = pd.concat([df, dummy_genres], axis=1)

    dummy_companies = pd.DataFrame(mlb.fit_transform(df['production_companies.id'].apply(lambda x: x if x else tuple())),
                                columns=[f"company_{cl}" for cl in mlb.classes_], 
                                index=df.index)
    dummy_companies = dummy_companies.loc[:, dummy_companies.sum() > 1] # Removes companies with 1 appearence     
    df = pd.concat([df, dummy_companies], axis=1) # Maybe biggest company size is enough...   

    dummy_countries =pd.DataFrame(mlb.fit_transform(df.production_countries),
                                columns=[f"country_{cl}" for cl in mlb.classes_], 
                                index=df.index)
    df = pd.concat([df, dummy_countries], axis=1)

    dummy_lang = pd.DataFrame(mlb.fit_transform(df.spoken_languages),
                            columns=[f"spoken_lang_{cl}" for cl in mlb.classes_], 
                            index=df.index)
    df = pd.concat([df, dummy_lang], axis=1)

    dummy_cast = pd.DataFrame(mlb.fit_transform(df['cast.id'].apply(lambda x: x if x else tuple())),
                                columns=[f"cast_{cl}" for cl in mlb.classes_], 
                                index=df.index)
    dummy_cast = dummy_cast.loc[:, dummy_cast.sum() > 20] # Removes cast with lower movies than 20  
    df = pd.concat([df, dummy_cast], axis=1)

    # Original Language dummy:
    dummy_orig_lang = pd.get_dummies(df.original_language, prefix="original_lang_")
    dummy_orig_lang = dummy_orig_lang.loc[:, dummy_orig_lang.sum() > 1]

    df = pd.concat([df, dummy_orig_lang], axis=1)     


def map_and_max(collection, mapping_dict):
    return max(map(mapping_dict.get, collection)) if collection else None

def eval_or_nan(obj):
    if obj and pd.notnull(obj) and isinstance(obj, str):
        return eval(obj)
    return None

def map_attribute(obj, attribute_name: str):
    if obj:
        iterable = eval(obj) if isinstance(obj, str) else obj
        return tuple(map(lambda x: x.get(attribute_name, None), iterable))
    return None

def features_flattening(df: pd.DataFrame):
    df['belongs_to_collection'] = df.belongs_to_collection.apply(eval_or_nan)
    df['belongs_to_collection.id'] = df.belongs_to_collection\
                                            .apply(lambda x: None if pd.isna(x) else x['id']).astype('Int64')


    df['genres'] = df.genres.apply(lambda gs: tuple(g['name'] for g in eval(gs)))

    df['production_companies'] = df.production_companies.apply(eval_or_nan)
    df['production_companies.id'] = df.production_companies\
                                            .apply(lambda companies: map_attribute(companies, 'id'))
    df['production_companies.origin_country'] = df.production_companies\
                                            .apply(lambda companies: map_attribute(companies, 'origin_country'))

    df['production_countries'] = df.production_countries.apply(lambda countries: map_attribute(countries, 'iso_3166_1'))

    df['release_date'] = pd.to_datetime(df.release_date)
    df['release_month'] = df.release_date.dt.month
    df['release_year'] = df.release_date.dt.year

    df['spoken_languages'] = df.spoken_languages.apply(lambda langs: map_attribute(langs, 'iso_639_1'))

    df['Keywords'] = df.Keywords.apply(eval_or_nan)
    df['Keywords.id'] =df.Keywords.apply(lambda keywords: map_attribute(keywords, 'id')) # TODO: Maybe keep words?

    df['cast'] = df.cast.apply(eval_or_nan)
    df['cast.id'] = df.cast.apply(lambda actors: map_attribute(actors, 'id'))
    df['cast.gender'] = df.cast.apply(lambda actors: map_attribute(actors, 'gender')) # Gender ratio

    df['crew'] = df.crew.apply(eval)
    df['crew.id'] = df.crew.apply(lambda crew: map_attribute(crew, 'id'))
    df['crew.gender'] = df.crew.apply(lambda crew: map_attribute(crew, 'gender')) # Gender ratio
    df['crew.department'] = df.crew.apply(lambda crew: map_attribute(crew, 'department')) # Dept size
    
    df.drop(['crew', 'cast', 'Keywords', 'belongs_to_collection', 'release_date'], axis=1, inplace=True)

def missing_value_imputation(df: pd.DataFrame):
    df.runtime.fillna(0, inplace=True)

def get_element_frequency(df, attribute):
    return Counter(df[attribute].dropna().sum())

# Gender actor ratio: 0 is unspecified, 1 is female, and 2 is male
def genders_ratio(genders):
    arr = np.array(genders)
    males = (arr == 1).sum()
    females = (arr == 2).sum()
    if males or females:
        return males / (females+males)
    return 0

def feature_extraction(df: pd.DataFrame):
    removed_columns = ['backdrop_path', 'homepage', 'poster_path', 'imdb_id', 'video']
    X = df[[col for col in df.columns if col not in removed_columns]].copy().set_index('id')

    Y = X['revenue'].copy()
    X.drop('revenue', axis=1, inplace=True)

    features_flattening(X)
    missing_value_imputation(X)
    
    X['collection_size'] = X.groupby('belongs_to_collection.id')['belongs_to_collection.id']\
                                    .transform('count').fillna(0).astype(int).copy()

    company_size_dict = get_element_frequency(X, 'production_companies.id') # {company_id : company_size}

    X['biggest_company_size'] = X['production_companies.id']\
                                        .apply(lambda companies: map_and_max(companies, company_size_dict))\
                                        .fillna(0).astype(int)

    id_country_set = set(X.production_companies
                        .apply(lambda xs: [(x['id'], x['origin_country']) for x in xs if x['origin_country']])
                        .sum()
                    )
                    
    company_per_country = Counter(country for comp_id, country in id_country_set)

    company_per_country[''] = 0 # Update no-countries to 0

    X['biggest_country'] = X['production_companies.origin_country']\
                                    .apply(lambda companies: map_and_max(companies, company_per_country))\
                                    .fillna(0).astype(int)

    country_size_dict = get_element_frequency(X, 'production_countries') # {country : movie_count}

    X['biggest_country_size'] = X['production_countries']\
                                        .apply(lambda countries: map_and_max(countries, country_size_dict))\
                                        .fillna(0).astype(int)


    X['cast.gender_ratio'] = X['cast.gender'].apply(genders_ratio)

    X['spoken_lang_num'] = X.spoken_languages.apply(len)

    X['overview_word_count'] = X.overview.apply(lambda x: len(x.split(" ")) if pd.notnull(x) else 0) # Overview word-count
    X['tagline_char_count'] = X.tagline.apply(lambda x: len(x) if pd.notnull(x) else 0) # tagline character-count
    X['title_char_count'] = X.title.apply(lambda x: len(x) if pd.notnull(x) else 0) # title character-count
                                        
    # Dept. size:

    dept_size_df = X['crew.department'].apply(lambda x: pd.Series(Counter(x)))\
                        .add_suffix('_depart_size')\
                        .astype('Int64')
    dept_size_df.dropna(axis=1, thresh= dept_size_df.shape[0] * 0.20, inplace=True) # Drop columns with less than 20% data
    dept_size_df.fillna(0, inplace=True) # Missing value imputation with 0
    X = pd.concat([X, dept_size_df], axis=1)

    add_dummies(X, Y)

    tuple_fields = ['genres', 'spoken_languages', 'production_countries', 'production_companies.id', 'Keywords.id', 'cast.id', 'cast.gender', 'crew.id', 'crew.gender', 'crew.department', 'belongs_to_collection.id', 'production_companies', 'production_companies.origin_country']
    text_fields = ['original_language', 'original_title', 'overview', 'status', 'tagline', 'title']

    X.drop(tuple_fields+text_fields, axis=1, inplace=True)

    return X, Y



In [7]:
train_raw = pd.read_csv('data/train.tsv',delimiter='\t')
test_raw = pd.read_csv('data/test.tsv',delimiter='\t')

In [8]:
train_X, train_Y = feature_extraction(train_raw)

In [9]:
test_X, test_Y = feature_extraction(test_raw)

In [10]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingClassifier

clf = GradientBoostingClassifier().fit(train_X, train_Y)
pred = clf.predict(test_X)

from sklearn.metrics import mean_squared_log_error
np.sqrt(mean_squared_log_error(test_Y, pred))

2.4260436358907187