## Categorical Feature Engineering

Importing packages 

In [2]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

Loading pickled dataframe

In [3]:
earliestrelease_notdomestic_movies2 = pd.read_pickle('earliestrelease_notdomestic_movies2.pickle')

In [4]:
smaller_df = earliestrelease_notdomestic_movies2.loc[:,['lifetime_gross', 'max_theaters', 'domestic_opening', 
                      'num_opening_theaters','domestic_total_gross', 'international_total_gross', 
                      'earliest_release_location_opening_gross', 'earliest_release_location_original_gross']]

In [5]:
X = smaller_df.loc[:,['max_theaters','num_opening_theaters','international_total_gross', 'earliest_release_location_original_gross']]
y = smaller_df['domestic_total_gross']

In [6]:
def split_and_validate(X, y):
    '''
    For a set of features and target X, y, perform a 80/20 train/val split, 
    fit and validate a linear regression model, and report results
    '''
    
    # perform train/val split
    X_train, X_val, y_train, y_val = \
        train_test_split(X, y, test_size=0.2, random_state=42)
    
    # fit linear regression to training data
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)
    
    # score fit model on validation data
    val_score = lr_model.score(X_val, y_val)
    
    # report results
    print('\nValidation R^2 score was:', val_score)
    print('Feature coefficient results: \n')
    for feature, coef in zip(X.columns, lr_model.coef_):
        print(feature, ':', f'{coef:.2f}') 

In [7]:
earliestrelease_notdomestic_movies2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1659 entries, Hero to News from Planet Mars
Data columns (total 15 columns):
 #   Column                                    Non-Null Count  Dtype         
---  ------                                    --------------  -----         
 0   lifetime_gross                            1659 non-null   float64       
 1   max_theaters                              1659 non-null   float64       
 2   domestic_opening                          1659 non-null   float64       
 3   num_opening_theaters                      1659 non-null   float64       
 4   release_date                              1659 non-null   datetime64[ns]
 5   domestic_distributor                      1659 non-null   object        
 6   domestic_total_gross                      1659 non-null   float64       
 7   international_total_gross                 1659 non-null   float64       
 8   earliest_release_location                 1659 non-null   object        
 9   runtime        

In [8]:
earliestrelease_notdomestic_movies2['genres'].value_counts()

Drama                                    263
Drama Romance                            132
Comedy Drama                             110
Comedy Drama Romance                      86
Comedy Romance                            39
                                        ... 
Animation Crime Drama Music Romance        1
Adventure Drama Family                     1
Action Adventure Animation Fantasy         1
Comedy Drama Fantasy Romance Thriller      1
Comedy Family Sci-Fi                       1
Name: genres, Length: 381, dtype: int64

Creating Domestic Distributor Dummy Feature

In [9]:
dd_counts = earliestrelease_notdomestic_movies2['domestic_distributor'].value_counts()
dd_counts

Sony Pictures Classics                 109
Eros International                     108
Strand Releasing                        99
IFC Films                               87
Well Go USA Entertainment               84
                                      ... 
Entertainment One                        1
Walt Disney Studios Motion Pictures      1
International Film Circuit               1
Odeon                                    1
Arc Entertainment                        1
Name: domestic_distributor, Length: 155, dtype: int64

In [10]:
other_dd = list(dd_counts[dd_counts <= 10].index)

In [11]:
earliestrelease_notdomestic_movies2['domestic_distributor'] = earliestrelease_notdomestic_movies2['domestic_distributor'].replace(other_dd, 'Other')

In [12]:
split_and_validate(pd.get_dummies(earliestrelease_notdomestic_movies2['domestic_distributor']), y)


Validation R^2 score was: -0.30929326384313605
Feature coefficient results: 

CJ Entertainment : -1773348070910782464.00
China Lion Film Distribution : -1773348070910823424.00
Cohen Media Group : -1773348070910710016.00
Distrib Films : -1773348070911072000.00
Eros International : -1773348070910324480.00
FIP : -1773348070909558784.00
Film Movement : -1773348070911050240.00
First Run : -1773348070911078912.00
IDP Distribution : -1773348070909699840.00
IFC Films : -1773348070910569472.00
Kino International : -1773348070910961408.00
Kino Lorber : -1773348070911009792.00
Lionsgate : -1773348070909059328.00
Lorber Films : -1773348070911060992.00
Magnolia Pictures : -1773348070910624256.00
Miramax : -1773348070902957312.00
Missing : -1773348070910351104.00
Music Box Films : -1773348070910126336.00
New Yorker Films : -1773348070911039744.00
Oscilloscope : -1773348070910802944.00
Other : -1773348070909676544.00
Palm Pictures : -1773348070910854912.00
Regent Releasing : -1773348070910902528.00
