In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
from collections import Counter
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
import pickle

In [2]:
def preprocess(df):
    '''
    Convert the stringified list columns of `df` into real Python lists.

    The eight columns named below hold Python-literal text (for example
    "[{'id': 1, 'name': 'x'}]"). Every cell is parsed with
    ast.literal_eval; missing cells (NaN / None) and blank strings become
    an empty list.

    The function is safe to run more than once on the same frame: cells
    that are already lists are left untouched, so re-executing the
    notebook cell does not fail.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains all of the columns listed in `column_names`.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with the listed columns replaced in place.

    Raises
    ------
    KeyError
        If one of the expected columns is absent.
    ValueError, SyntaxError
        If a cell holds something that is not a valid Python literal.
    '''
    print("preprocessing...")
    column_names = ['belongs_to_collection',
                    'genres',
                    'production_companies',
                    'production_countries',
                    'spoken_languages',
                    'Keywords',
                    'cast',
                    'crew'
                   ]

    def _parse(value):
        # Already parsed on a previous run: nothing to do.
        if isinstance(value, list):
            return value
        # Missing value -> empty list.
        if value is None or (isinstance(value, float) and np.isnan(value)):
            return []
        if isinstance(value, str):
            text = value.strip()
            return literal_eval(text) if text else []
        # Anything else is unexpected; let literal_eval report it.
        return literal_eval(value)

    for name in column_names:
        df[name] = df[name].apply(_parse)

    return df

In [3]:
def _values_of(items, key):
    '''Return item[key] for every dict in the list `items`.'''
    return [item[key] for item in items]


def _top_values(column, key, limit):
    '''Return the `limit` most frequent `key` values over a column of dict lists.'''
    counts = Counter(value for items in column for value in _values_of(items, key))
    return [value for value, _ in counts.most_common(limit)]


def _add_substring_flags(df, text, values, prefix, suffix):
    '''Add a 0/1 column per value: 1 where the value is a substring of `text`.'''
    for value in values:
        df[prefix + value + suffix] = text.apply(
            lambda s, v=value: 1 if v in s else 0)


def _add_equal_counts(df, source, key, values, name_of):
    '''Add a column per value counting the dicts in each row with dict[key] == value.'''
    for value in values:
        df[name_of(value)] = source.apply(
            lambda items, v=value: sum(1 for item in items if item[key] == v))


def feature_extraction(df, drop_constant=False):
    '''
    Turn the parsed movie frame into a purely numeric feature frame.

    Expects the list columns (belongs_to_collection, genres,
    production_companies, production_countries, spoken_languages, Keywords,
    cast, crew) to already hold lists of dicts, plus the scalar columns
    budget, homepage, release_date, imdb_id, poster_path, status,
    original_language, title, tagline, overview, original_title, popularity.

    What is produced:
      * collection_name / has_collection;
      * per list column: an item count and 0/1 flags for the most frequent
        names found in *this* frame (substring match on the joined names,
        or on str(list) for cast and crew);
      * gender counts for cast and crew, job and department counts for crew;
      * log1p of budget and popularity;
      * has_homepage, release month/day/year, day of week and quarter;
      * label-encoded original_language, collection_name and genre_list;
      * len_* / words_* for the four text columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Parsed input frame. It is copied first, so the caller's frame is
        not modified.
    drop_constant : bool, default False
        When True, columns with exactly one distinct value are removed.
        This is off by default because on a small scoring batch it removes
        real features (a one-row frame would lose every column), which
        are then re-created as 0 when the frame is aligned to the model's
        column list.

    Returns
    -------
    pandas.DataFrame
        The feature frame; columns not consumed here (e.g. id) pass through.

    NOTE(review): the top-N vocabularies and the label encodings are derived
    from the frame passed in, not from stored training artefacts - confirm
    this matches how the pickled model's inputs were built.
    '''
    print("feature extraction..")
    df = df.copy()

    # --- collection -------------------------------------------------------
    collection = df['belongs_to_collection']
    df['collection_name'] = collection.apply(lambda x: x[0]['name'] if x else 0)
    df['has_collection'] = collection.apply(lambda x: 1 if x else 0)
    df = df.drop('belongs_to_collection', axis=1)

    # --- genres (genre_list is kept and label-encoded further down) --------
    genres = df['genres']
    df['genre_count'] = genres.apply(len)
    df['genre_list'] = genres.apply(
        lambda x: ' '.join(sorted(_values_of(x, 'name'))))
    _add_substring_flags(df, df['genre_list'],
                         _top_values(genres, 'name', 16), '', '_genre')
    df = df.drop(['genres'], axis=1)

    # --- companies / countries / languages / keywords ----------------------
    for column, count_col, limit, prefix in (
            ('production_companies', 'num_companies', 40, 'production_company_'),
            ('production_countries', 'num_countries', 25, 'production_country_'),
            ('spoken_languages', 'num_languages', 15, 'language_'),
            ('Keywords', 'num_keywords', 30, 'keyword_')):
        items = df[column]
        df[count_col] = items.apply(len)
        joined = items.apply(lambda x: ' '.join(sorted(_values_of(x, 'name'))))
        _add_substring_flags(df, joined,
                             _top_values(items, 'name', limit), prefix, '')
        df = df.drop([column], axis=1)

    # --- cast ---------------------------------------------------------------
    cast = df['cast']
    cast_text = cast.apply(lambda x: str(x))  # flags match against str(list)
    df['num_cast'] = cast.apply(len)
    _add_substring_flags(df, cast_text,
                         _top_values(cast, 'name', 15), 'cast_name_', '')
    _add_equal_counts(df, cast, 'gender', (0, 1, 2),
                      lambda v: 'genders_{}_cast'.format(v))
    _add_substring_flags(df, cast_text,
                         _top_values(cast, 'character', 15), 'cast_character_', '')
    df = df.drop(['cast'], axis=1)

    # --- crew ---------------------------------------------------------------
    crew = df['crew']
    crew_text = crew.apply(lambda x: str(x))
    df['num_crew'] = crew.apply(len)
    _add_substring_flags(df, crew_text,
                         _top_values(crew, 'name', 15), 'crew_name_', '')
    _add_equal_counts(df, crew, 'gender', (0, 1, 2),
                      lambda v: 'genders_{}_crew'.format(v))
    _add_equal_counts(df, crew, 'job',
                      _top_values(crew, 'job', 15), lambda v: 'jobs_' + v)
    _add_equal_counts(df, crew, 'department',
                      _top_values(crew, 'department', 15),
                      lambda v: 'departments_' + v)
    df = df.drop(['crew'], axis=1)

    # --- budget -------------------------------------------------------------
    df['log_budget'] = np.log1p(df['budget'])
    df = df.drop('budget', axis=1)

    # --- homepage -----------------------------------------------------------
    # notna() instead of an identity test against np.NaN: identity is not
    # guaranteed for every NaN object, and the np.NaN alias no longer exists
    # in NumPy 2.
    df['has_homepage'] = df['homepage'].notna().astype(int)

    # --- release date ("month/day/2-digit-year" strings) ---------------------
    df[['release_month', 'release_day', 'release_year']] = (
        df['release_date'].str.split('/', expand=True).replace(np.nan, -1).astype(int))
    # Two-digit years: <= 19 is read as 20xx, 20..99 as 19xx.
    # NOTE(review): the -1 placeholder used for a missing date also satisfies
    # "<= 19" and therefore becomes 1999. Kept unchanged so feature values do
    # not shift - confirm whether that is intended.
    df.loc[df['release_year'] <= 19, "release_year"] += 2000
    df.loc[(df['release_year'] > 19) & (df['release_year'] < 100), "release_year"] += 1900
    release_date = pd.to_datetime(df['release_date'])
    df['release_day_of_week'] = release_date.dt.dayofweek
    df['release_quarter'] = release_date.dt.quarter

    df = df.drop(['homepage', 'imdb_id', 'poster_path', 'release_date', 'status'], axis=1)

    # --- categorical text -> integer codes ----------------------------------
    for col in ['original_language', 'collection_name', 'genre_list']:
        # fit_transform fits on its own; a separate fit() call is not needed.
        df[col] = LabelEncoder().fit_transform(df[col].fillna('').astype(str))

    # --- free-text columns ----------------------------------------------------
    for col in ['title', 'tagline', 'overview', 'original_title']:
        text = df[col].fillna('')
        df['len_' + col] = text.apply(lambda x: len(str(x)))
        # NOTE(review): this is the character length of the *printed list* of
        # tokens, not the number of words. Left as is because changing the
        # value would alter a feature the saved model may rely on - confirm
        # and fix together with the training code.
        df['words_' + col] = text.apply(lambda x: len(str(x.split(' '))))
        df = df.drop(col, axis=1)

    # Transform popularity before the optional drop so the column is
    # guaranteed to exist at this point.
    df['popularity'] = np.log1p(df['popularity'])

    if drop_constant:
        constant = [col for col in df.columns if df[col].nunique() == 1]
        df = df.drop(constant, axis=1)

    return df


In [4]:
def predict(df):
    '''
    Score a feature frame with the pickled XGBoost model.

    Steps:
      1. Load the expected column list from "columns.column".
      2. Align `df` to that list: extra columns are discarded, missing
         ones are created and filled with 0, and the order follows the list.
      3. Remove 'revenue' and 'id' if they are present.
      4. Load the model from "xgboost.model", predict, and undo the log1p
         scaling of the output with expm1.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame. It is not modified.

    Returns
    -------
    numpy.ndarray
        expm1 of the model output, one value per row of `df`.

    NOTE(security): pickle.load can execute arbitrary code. Only use
    "columns.column" and "xgboost.model" files from a trusted source.
    '''
    with open("columns.column", "rb") as column_file:
        columns = list(pickle.load(column_file))

    # One call instead of dropping / adding columns one by one; it also
    # returns a new frame, so the caller's frame is never written to.
    aligned = df.reindex(columns=columns, fill_value=0)
    # errors='ignore': neither column is guaranteed to be in the saved list.
    pred_x = aligned.drop(columns=['revenue', 'id'], errors='ignore')

    print("loading model..")
    with open("xgboost.model", "rb") as model_file:
        model = pickle.load(model_file)
    print("model load successful..")

    # NOTE(review): ntree_limit / best_ntree_limit belong to the older
    # xgboost API - confirm against the xgboost version the model was
    # pickled with before upgrading.
    pred_y = model.predict(xgb.DMatrix(pred_x), ntree_limit=model.best_ntree_limit)
    predictions = np.expm1(pred_y)
    return predictions

Path to the test file (the CSV to run predictions on)

In [5]:
# Location of the CSV file to read; change this one value to use another file.
TEST_FILE_PATH = 'mini_project/data.csv'
df = pd.read_csv(TEST_FILE_PATH)

In [6]:
# Parse the stringified columns and build the feature frame in one step.
df = feature_extraction(preprocess(df))

# Score the feature frame with the pickled model.
predictions = predict(df)

preprocessing...
feature extraction..
loading model..
model load successful..
