For this exercises we have to build a processing pipeline that processes the movies dataset. Its not a matter of copy pasting code, but of taking decisions on how to deal with each variable.

### We load the dataset

In [1]:
import pandas as pd
import numpy as np

movies = pd.read_csv("data/movies.1.initial_process.csv")
movies = movies[movies.status=="Released"]
del movies["status"]
movies.head()

Unnamed: 0,belongs_to_collection,budget,genre,original_language,popularity,production_company,production_country,release_date,revenue,runtime,title,vote_average,vote_count
0,Father of the Bride Collection,,Comedy,en,8.387519,Sandollar Productions,United States of America,1995-02-10,76578911.0,106.0,Father of the Bride Part II,5.7,173.0
1,,,Drama,en,0.894647,Miramax,South Africa,1995-12-15,676525.0,106.0,"Cry, the Beloved Country",6.7,13.0
2,Friday Collection,3500000.0,Comedy,en,14.56965,New Line Cinema,United States of America,1995-04-26,28215918.0,91.0,Friday,7.0,513.0
3,,,Comedy,en,8.963037,Paramount Pictures,United States of America,1996-02-01,32.0,87.0,Black Sheep,6.0,124.0
4,,12000000.0,Comedy,en,9.592265,Universal Pictures,United States of America,1996-02-16,41205099.0,92.0,Happy Gilmore,6.5,767.0


In [43]:
len(movies)

1342

### Create a Pipeline that process the dataset. You have to make sure you deal accordingly with numerical, categorical and text variables.

In [2]:
numerical_df = movies.select_dtypes(np.number)
numerical_df.head()

Unnamed: 0,budget,popularity,revenue,runtime,vote_average,vote_count
0,,8.387519,76578911.0,106.0,5.7,173.0
1,,0.894647,676525.0,106.0,6.7,13.0
2,3500000.0,14.56965,28215918.0,91.0,7.0,513.0
3,,8.963037,32.0,87.0,6.0,124.0
4,12000000.0,9.592265,41205099.0,92.0,6.5,767.0


In [3]:
import warnings
warnings.filterwarnings("ignore")

In [47]:
target_variable = 'revenue'
numerical_cols =  movies.drop(columns=target_variable).select_dtypes(np.number).columns
categorical_col = ['genre','original_language','production_company','production_country']
ordinal_col = ["ordinal_col"]
text_col = ['title']

# Note to self:

Can do fillnan in pipeline

In [57]:
for num_col in numerical_cols:
    movies[[num_col]] = movies[[num_col]].fillna(
        movies[movies[num_col].notna()][num_col].median()
    )
movies[['revenue']] = movies[['revenue']].fillna(
        movies[movies['revenue'].notna()]['revenue'].median()
    )
for cate_col in categorical_col:
    movies[[cate_col]] = movies[[cate_col]].fillna(
        'Other'
    )  

In [6]:
from sklearn import preprocessing, feature_extraction

imputer = preprocessing.Imputer(strategy="mean")
scaler = preprocessing.StandardScaler()

In [7]:
from sklearn.pipeline import make_pipeline

In [8]:
numerical_pipeline = make_pipeline(
    imputer,
    scaler
)

In [9]:
from sklearn.base import BaseEstimator

class ColumnSelector(BaseEstimator):
    def __init__(self, cols=None, drop_axis=False):
        self.cols = cols
        self.drop_axis = drop_axis

    def fit_transform(self, X, y=None):
        return self.transform(X=X, y=y)

    def transform(self, X, y=None):
        if hasattr(X, 'loc'):
            #only pandas dataframes have the method loc
            t = X.loc[:, self.cols].values
        else:
            # its a numpy array
            t = X[:, self.cols]

        if t.shape[-1] == 1 and self.drop_axis:
            t = t.reshape(-1)
        if len(t.shape) == 1 and not self.drop_axis:
            t = t[:, np.newaxis]
        return t

    def fit(self, X, y=None):
        return self

In [10]:
numerical_col_selector = ColumnSelector(cols=numerical_cols)

In [11]:
numerical_col_selector.fit_transform(movies)

array([[1.040000e+07, 8.387519e+00, 1.060000e+02, 5.700000e+00,
        1.730000e+02],
       [1.040000e+07, 8.946470e-01, 1.060000e+02, 6.700000e+00,
        1.300000e+01],
       [3.500000e+06, 1.456965e+01, 9.100000e+01, 7.000000e+00,
        5.130000e+02],
       ...,
       [1.040000e+07, 5.605020e-01, 0.000000e+00, 3.700000e+00,
        6.000000e+00],
       [1.040000e+07, 6.742610e-01, 1.140000e+02, 5.500000e+00,
        8.000000e+00],
       [1.040000e+07, 3.800000e-05, 6.500000e+01, 0.000000e+00,
        0.000000e+00]])

In [12]:
numerical_pipeline = make_pipeline(
    numerical_col_selector,
    imputer,
    scaler
)

In [13]:
numerical_pipeline.fit_transform(movies)[:5]

array([[-0.24803204,  0.48543258,  0.18242354, -0.20276605, -0.17842234],
       [-0.24803204, -0.93272362,  0.18242354,  0.44775282, -0.50242877],
       [-0.66445288,  1.65550827, -0.46971388,  0.64290848,  0.51009133],
       [-0.24803204,  0.59435936, -0.64361719, -0.00761039, -0.27764931],
       [-0.15147068,  0.71345169, -0.42623805,  0.31764904,  1.02445154]])

Trying to make 'Other' category for countries here

Jk going to do it in the transformer

In [28]:
countries = movies.production_country.unique()
for country in countries:
    if movies.production_country.value_count()[country] < 8:
        movies.replace()

928

In [28]:
movies['genre'].unique()

array(['Comedy', 'Drama', 'Documentary', 'Science Fiction', 'Thriller',
       'Animation', 'Adventure', 'Horror', 'Fantasy', 'War', 'Western',
       'Music', 'Romance', 'Action', 'Mystery', nan, 'Crime', 'Family'],
      dtype=object)

In [16]:
movies.original_language.value_counts()

en    1122
fr      52
es      25
ru      24
it      23
hi      16
de      15
ja      12
ko       6
sv       5
pt       5
nl       4
fi       3
bn       3
fa       3
zh       3
cn       3
ta       2
tr       2
he       1
sq       1
hu       1
te       1
ro       1
sr       1
da       1
pl       1
mr       1
bs       1
el       1
vi       1
tl       1
th       1
Name: original_language, dtype: int64

In [19]:
movies.production_company.value_counts()

Paramount Pictures                        68
Universal Pictures                        54
Twentieth Century Fox Film Corporation    35
New Line Cinema                           34
Columbia Pictures                         33
Warner Bros.                              26
United Artists                            22
TriStar Pictures                          15
Columbia Pictures Corporation             15
Touchstone Pictures                       14
Fox Searchlight Pictures                  13
Walt Disney Pictures                      11
Miramax Films                             11
Imagine Entertainment                     11
Metro-Goldwyn-Mayer (MGM)                  9
BBC Films                                  9
Hollywood Pictures                         9
DreamWorks SKG                             9
Lions Gate Films                           7
Dimension Films                            7
The Weinstein Company                      6
Killer Films                               5
Orion Pict

# Todo:

make categorical transformer

In [41]:
class CategoryOther(BaseEstimator):
    def __init__(self, cols=None, drop_axis=False):
        self.cols = cols
        self.drop_axis = drop_axis

    def fit_transform(self, X, y=None):
        return self.transform(X=X, y=y)
    
    # X is the dataset here
    def transform(self, X, y=None):
        for col in self.cols:
            if col == 'production_company':
                for company in X[col].unique():
                    if X[col].value_counts()[company] < 20:
                        X.replace(company, 'Other')
            elif col == 'original_language':
                for language in X[col].unique():
                    if X[col].value_counts()[language] < 20:
                        X.replace(language, 'Other')
            elif col == 'genre':
                for genre in X[col].unique():
                    if X[col].value_counts()[genre] < 20:
                        X.replace(genre, 'Other')
            elif col == 'production_country':
                for country in X[col].unique():
                    if X[col].value_counts()[country] < 20:
                        X.replace(country, 'Other')
                        #print(X.production_country.value_counts()['Other'])
        
        if hasattr(X, 'loc'):
            #only pandas dataframes have the method loc
            t = X.loc[:, self.cols].values
        else:
            # its a numpy array
            t = X[:, self.cols]

        if t.shape[-1] == 1 and self.drop_axis:
            t = t.reshape(-1)
        if len(t.shape) == 1 and not self.drop_axis:
            t = t[:, np.newaxis]
        
        return t

    def fit(self, X, y=None):
        return self

In [32]:
from category_encoders import OneHotEncoder

In [42]:
categorical_pipeline = make_pipeline(
     CategoryOther(cols=categorical_col),
     OneHotEncoder()
)

categorical_pipeline.fit_transform(movies)[:5]

Unnamed: 0,0_1,0_2,0_3,0_4,0_5,0_6,0_7,0_8,0_9,0_10,...,3_34,3_35,3_36,3_37,3_38,3_39,3_40,3_41,3_42,3_-1
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer 
from mlxtend.preprocessing import DenseTransformer

text_pipeline = make_pipeline(
    ColumnSelector(cols=text_col, drop_axis=True),
    TfidfVectorizer(),
    DenseTransformer()
)

In [50]:
text_pipeline.fit_transform(movies)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [52]:
from sklearn.pipeline import make_union

In [53]:
processing_pipeline = make_union(
    numerical_pipeline,
    categorical_pipeline,
    text_pipeline
)

### Transform the dataset

In [54]:
processing_pipeline.fit_transform(movies)

array([[-0.24803204,  0.48543258,  0.18242354, ...,  0.        ,
         0.        ,  0.        ],
       [-0.24803204, -0.93272362,  0.18242354, ...,  0.        ,
         0.        ,  0.        ],
       [-0.66445288,  1.65550827, -0.46971388, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.24803204, -0.99596636, -4.42601422, ...,  0.        ,
         0.        ,  0.        ],
       [-0.24803204, -0.9744355 ,  0.53023016, ...,  0.        ,
         0.        ,  0.        ],
       [-0.24803204, -1.10204391, -1.60008541, ...,  0.        ,
         0.        ,  0.        ]])

In [55]:
from sklearn.linear_model import LinearRegression

estimator = LinearRegression()
estimator_pipeline = make_pipeline(
    processing_pipeline,
    estimator
)

In [58]:
estimator_pipeline.fit(movies, movies[target_variable])

Pipeline(memory=None,
     steps=[('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('pipeline-1', Pipeline(memory=None,
     steps=[('columnselector', ColumnSelector(cols=Index(['budget', 'popularity', 'runtime', 'vote_average', 'vote_count'], dtype='object'),
        drop_axis=False)), ('imputer', Imputer(axi... ('linearregression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))])

In [59]:
estimator_pipeline.predict(movies)[:5]

array([76462080.,   929792., 28033024.,   378880., 41041920.])

### Create a Ridge estimator to predict a movies revenue based on the other features. What is the optimal value of alpha to minimize the RMSE? *Hint*: You can use validation curves to figure it out.

In [61]:
from sklearn.linear_model import RidgeCV

In [65]:
alphas=[0.1,1,5,10,50,100]

In [66]:
X_train = movies.drop(columns=['revenue','belongs_to_collection','release_date'])
y_train = movies['revenue']

In [70]:
import numpy as np
from sklearn import metrics
rmserrors = []
for a in alphas:
    ridge = Ridge(alpha=a, tol=0.01, max_iter=5000)
    ridge_pipeline = make_pipeline(
    processing_pipeline,
    ridge
    )
    ridge_pipeline.fit(movies, movies[target_variable])
    y_pred = ridge_pipeline.predict(movies)
    rmse = np.sqrt(metrics.mean_squared_error(y_train, y_pred))
    rmserrors.append(rmse)
rmserrors

[5926885.090583885,
 18243760.83673205,
 29799796.500147942,
 33216236.33269177,
 37393250.21385112,
 38250631.25158873]

### Remember when we did exploratory data analyses and we groupd the numerical variables into quintiles? That is a valid technique used in Machine Learning to expand a dataset, it is called [Binning or Bucketing](http://blog.yhat.com/tutorials/5-Feature-Engineering.html).

### Create your own transformer that given a numerical variable and a number of buckets returns the specificed quartile (so if we choose buckets = 4, it would return 1, 2,3 or 4 depending on each observation being on the 1st, 2nd, 3rd or 4th quartile).

### Try putting your bucket transformer into a pipeline to make sure it works, and check if it improves the performance of your model.

**Hint**: You can use `ColumnSelector` as a template, and you can check pandas `qcut` for the actual binning.