For this exercise we have to build a processing pipeline that processes the movies dataset. It's not a matter of copy-pasting code, but of making decisions on how to deal with each variable.

### We load the dataset

In [223]:
import pandas as pd
import numpy as np

# Read the pre-processed movies file and keep only the released titles.
# After that filter `status` holds a single value, so the column is removed.
movies = (
    pd.read_csv("data/movies.1.initial_process.csv")
    .loc[lambda frame: frame["status"] == "Released"]
    .drop("status", axis=1)
)
movies.head()

Unnamed: 0,belongs_to_collection,budget,genre,original_language,popularity,production_company,production_country,release_date,revenue,runtime,title,vote_average,vote_count
0,Father of the Bride Collection,,Comedy,en,8.387519,Sandollar Productions,United States of America,1995-02-10,76578911.0,106.0,Father of the Bride Part II,5.7,173.0
1,,,Drama,en,0.894647,Miramax,South Africa,1995-12-15,676525.0,106.0,"Cry, the Beloved Country",6.7,13.0
2,Friday Collection,3500000.0,Comedy,en,14.56965,New Line Cinema,United States of America,1995-04-26,28215918.0,91.0,Friday,7.0,513.0
3,,,Comedy,en,8.963037,Paramount Pictures,United States of America,1996-02-01,32.0,87.0,Black Sheep,6.0,124.0
4,,12000000.0,Comedy,en,9.592265,Universal Pictures,United States of America,1996-02-16,41205099.0,92.0,Happy Gilmore,6.5,767.0


### Create a Pipeline that processes the dataset. You have to make sure you deal appropriately with numerical, categorical and text variables.

In [224]:
# Keep only rows with a strictly positive revenue; rows whose revenue is
# missing are removed as well, because NaN > 0 evaluates to False.
movies = movies.loc[movies["revenue"] > 0]

In [225]:
# Column roles used by the rest of the notebook.
# `target` is a one-element list, so `movies[target]` is a one-column
# DataFrame rather than a Series.
target = ["revenue"]
# Columns handled by the numerical pipeline.
numerical_columns = ["budget", "popularity", "runtime", "vote_average", "vote_count"]
# Columns handled by the categorical pipeline.
categorical_columns = ["genre", "original_language", "production_country"]
# Columns grouped as "text"; they are removed from `movies` in the next cell.
text_columns = ["release_date", "title", "belongs_to_collection", "production_company"]

In [226]:
# Remove the text columns, which are not part of the final feature union.
# The labels are passed as a plain list (not as a DataFrame slice), and
# errors="ignore" keeps the cell re-runnable once the columns are already gone.
movies = movies.drop(text_columns, axis=1, errors="ignore")

In [227]:
# Check that only the numerical, categorical and target columns remain.
movies.head()

Unnamed: 0,budget,genre,original_language,popularity,production_country,revenue,runtime,vote_average,vote_count
0,,Comedy,en,8.387519,United States of America,76578911.0,106.0,5.7,173.0
1,,Drama,en,0.894647,South Africa,676525.0,106.0,6.7,13.0
2,3500000.0,Comedy,en,14.56965,United States of America,28215918.0,91.0,7.0,513.0
3,,Comedy,en,8.963037,United States of America,32.0,87.0,6.0,124.0
4,12000000.0,Comedy,en,9.592265,United States of America,41205099.0,92.0,6.5,767.0


In [228]:
from sklearn import preprocessing, feature_extraction

# `preprocessing.Imputer` was deprecated in scikit-learn 0.20 and removed in
# 0.22 in favour of `sklearn.impute.SimpleImputer`. Prefer the new class and
# fall back to the old one, so the cell runs on either side of that change.
# Both fill missing values with the column mean.
try:
    from sklearn.impute import SimpleImputer
except ImportError:  # scikit-learn < 0.20
    imputer = preprocessing.Imputer(strategy="mean")
else:
    imputer = SimpleImputer(strategy="mean")

# Standardise every column with StandardScaler's default settings.
scaler = preprocessing.StandardScaler()

In [229]:
from sklearn.pipeline import make_pipeline

In [230]:
# Numerical features: fill in missing values first, then standardise.
numerical_pipeline = make_pipeline(imputer, scaler)

In [231]:
# Display the pipeline to inspect its steps and their parameters.
numerical_pipeline

Pipeline(memory=None,
     steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True))])

In [232]:
# Try the pipeline on the numerical columns only. Rows with a missing budget
# come out as 0 in the first column: they are filled with the column mean and
# the scaler then subtracts that same mean.
numerical_pipeline.fit_transform(movies[numerical_columns])

array([[ 0.        ,  0.3814959 ,  0.10139724, -0.3267635 , -0.22703182],
       [ 0.        , -1.03915849,  0.10139724,  0.43720626, -0.53866313],
       [-0.84087746,  1.55363278, -0.63069   ,  0.66639719,  0.43518473],
       ...,
       [ 0.        , -1.12276277, -0.28904929, -1.54911512, -0.54450622],
       [-0.1064169 ,  0.65775164,  0.15020305,  0.28441231,  3.12495252],
       [ 0.        ,  0.41578691, -0.38666092,  0.36080929,  0.04564558]])

### Transform the dataset

In [233]:
from sklearn.base import BaseEstimator

class ColumnSelector(BaseEstimator):
    """Pipeline step that selects a subset of columns and returns a NumPy array.

    Parameters
    ----------
    cols : list, optional
        Column labels (when the input is a pandas DataFrame) or column
        indices (when it is a NumPy array) to keep. ``None`` keeps every
        column.
    drop_axis : bool, default False
        If True and exactly one column is selected, the result is flattened
        to shape ``(n_samples,)`` instead of ``(n_samples, 1)``.
    """

    def __init__(self, cols=None, drop_axis=False):
        self.cols = cols
        self.drop_axis = drop_axis

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` for chaining."""
        return self

    def fit_transform(self, X, y=None):
        """Equivalent to ``transform``, since fitting is a no-op."""
        return self.transform(X=X, y=y)

    def transform(self, X, y=None):
        """Return the selected columns of ``X`` as a NumPy array.

        ``y`` is accepted for pipeline compatibility and is not used.
        """
        # With the default cols=None, indexing by None would fail on a
        # DataFrame and would insert a new axis on an ndarray; select every
        # column instead.
        cols = slice(None) if self.cols is None else self.cols

        if hasattr(X, 'loc'):
            # Only pandas objects expose `.loc`; select by label and unwrap.
            selected = X.loc[:, cols].values
        else:
            # Otherwise treat X as a NumPy array and select by position.
            selected = X[:, cols]

        # A single selected column is flattened only when explicitly requested.
        if selected.shape[-1] == 1 and self.drop_axis:
            selected = selected.reshape(-1)
        # A 1-D selection (e.g. a single label instead of a list) is promoted
        # back to a column unless the caller asked for a flat array.
        if len(selected.shape) == 1 and not self.drop_axis:
            selected = selected[:, np.newaxis]
        return selected

In [234]:
# Selector that extracts the numerical columns from the full DataFrame.
numerical_col_selector = ColumnSelector(cols=numerical_columns)

In [235]:
# The selector returns the raw values as a NumPy array; missing budgets are
# still NaN at this stage because nothing has been imputed yet.
numerical_col_selector.fit_transform(movies)

array([[         nan, 8.387519e+00, 1.060000e+02, 5.700000e+00,
        1.730000e+02],
       [         nan, 8.946470e-01, 1.060000e+02, 6.700000e+00,
        1.300000e+01],
       [3.500000e+06, 1.456965e+01, 9.100000e+01, 7.000000e+00,
        5.130000e+02],
       ...,
       [         nan, 4.536980e-01, 9.800000e+01, 4.100000e+00,
        1.000000e+01],
       [1.600000e+07, 9.844558e+00, 1.070000e+02, 6.500000e+00,
        1.894000e+03],
       [         nan, 8.568378e+00, 9.600000e+01, 6.600000e+00,
        3.130000e+02]])

In [236]:
# Rebuild the numerical pipeline with the selector as its first step, so it
# can be given the whole DataFrame instead of a pre-selected slice.
numerical_pipeline = make_pipeline(numerical_col_selector, imputer, scaler)

In [237]:
# Preview the first five transformed rows; the leading rows match the earlier
# run on the pre-selected numerical columns.
numerical_pipeline.fit_transform(movies)[:5]

array([[ 0.        ,  0.3814959 ,  0.10139724, -0.3267635 , -0.22703182],
       [ 0.        , -1.03915849,  0.10139724,  0.43720626, -0.53866313],
       [-0.84087746,  1.55363278, -0.63069   ,  0.66639719,  0.43518473],
       [ 0.        ,  0.49061457, -0.82591326, -0.09757257, -0.32246891],
       [-0.34144428,  0.60991669, -0.58188419,  0.28441231,  0.92989944]])

In [238]:
from sklearn.feature_extraction.text import TfidfVectorizer 
from mlxtend.preprocessing import DenseTransformer

# TF-IDF pipeline for the text columns. It is not included in the feature
# union built further down.
# NOTE(review): `drop_axis=True` only flattens the selection when a single
# column is selected (see ColumnSelector.transform). With the four
# `text_columns` the vectorizer would receive a 2-D array; presumably it
# expects a 1-D sequence of strings -- confirm before re-enabling.
# NOTE(review): the text columns were dropped from `movies` in an earlier
# cell, so this pipeline cannot be run on `movies` as it stands.
text_pipeline = make_pipeline(
    ColumnSelector(cols=text_columns, drop_axis=True),
    TfidfVectorizer(),
    DenseTransformer()
)

In [212]:
#text_pipeline.fit_transform(movies)
# Disabled for now: the text columns were dropped from `movies` above, so the text pipeline is not used.

In [239]:
from category_encoders import OneHotEncoder

In [240]:
# Categorical features: pick the columns, then one-hot encode them.
categorical_pipeline = make_pipeline(ColumnSelector(cols=categorical_columns), OneHotEncoder())

# Quick check on the first five rows only.
categorical_pipeline.fit_transform(movies.head(5))

Unnamed: 0,0_1,0_2,0_-1,1_1,1_-1,2_1,2_2,2_-1
0,1,0,0,1,0,1,0,0
1,0,1,0,1,0,0,1,0
2,1,0,0,1,0,1,0,0
3,1,0,0,1,0,1,0,0
4,1,0,0,1,0,1,0,0


In [241]:
from sklearn.pipeline import make_union

In [242]:
# Concatenate the outputs of both sub-pipelines into one feature matrix.
processing_pipeline = make_union(numerical_pipeline, categorical_pipeline)

In [243]:
# Display the union to inspect the two sub-pipelines it combines.
processing_pipeline

FeatureUnion(n_jobs=1,
       transformer_list=[('pipeline-1', Pipeline(memory=None,
     steps=[('columnselector', ColumnSelector(cols=['budget', 'popularity', 'runtime', 'vote_average', 'vote_count'],
        drop_axis=False)), ('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('standar...e_unknown='impute',
       impute_missing=True, return_df=True, use_cat_names=False, verbose=0))]))],
       transformer_weights=None)

In [244]:
# Fit both sub-pipelines on the full frame and show the combined matrix:
# the scaled numerical columns come first, followed by the one-hot columns.
processing_pipeline.fit_transform(movies)

array([[ 0.        ,  0.3814959 ,  0.10139724, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        , -1.03915849,  0.10139724, ...,  0.        ,
         0.        ,  0.        ],
       [-0.84087746,  1.55363278, -0.63069   , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        , -1.12276277, -0.28904929, ...,  0.        ,
         0.        ,  0.        ],
       [-0.1064169 ,  0.65775164,  0.15020305, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.41578691, -0.38666092, ...,  0.        ,
         0.        ,  0.        ]])

In [245]:
from sklearn.linear_model import LinearRegression

# Full model: feature processing followed by a linear regression.
estimator = LinearRegression()
estimator_pipeline = make_pipeline(processing_pipeline, estimator)

In [246]:
# `movies` still contains the revenue column, but the column selectors only
# forward the numerical and categorical columns to the model.
# `movies[target]` is a one-column DataFrame because `target` is a list.
estimator_pipeline.fit(movies, movies[target])

Pipeline(memory=None,
     steps=[('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('pipeline-1', Pipeline(memory=None,
     steps=[('columnselector', ColumnSelector(cols=['budget', 'popularity', 'runtime', 'vote_average', 'vote_count'],
        drop_axis=False)), ('imputer', Imputer(axis=0, copy=True, missing... ('linearregression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))])

In [253]:
# Predictions for the same rows the model was fitted on (not a held-out set).
# The output has one column per row -- presumably because the target was
# passed as a one-column DataFrame; confirm if a flat array is needed.
estimator_pipeline.predict(movies)

array([[3.52911360e+07],
       [1.81207040e+07],
       [3.96328960e+07],
       ...,
       [2.47070720e+07],
       [1.58007296e+08],
       [4.46627840e+07]])

### Create a Ridge estimator to predict a movie's revenue based on the other features. What is the optimal value of alpha to minimize the RMSE? *Hint*: You can use validation curves to figure it out.

In [247]:
from sklearn.model_selection import cross_val_score

In [248]:
# 5-fold cross-validation of the linear-regression pipeline, scored with the
# negated mean absolute error (values closer to 0 are better).
# NOTE(review): the heading above asks for a Ridge estimator and RMSE; this
# cell evaluates the LinearRegression pipeline with MAE instead.
cross_val_score(
    estimator_pipeline,
    movies.drop(target, axis=1),
    movies[target],
    scoring='neg_mean_absolute_error',
    cv=5,
)

array([-3.18953954e+07, -1.65239211e+19, -1.34511897e+17, -1.55019747e+19,
       -6.03019832e+19])

### Remember when we did exploratory data analysis and we grouped the numerical variables into quintiles? That is a valid technique used in Machine Learning to expand a dataset; it is called [Binning or Bucketing](http://blog.yhat.com/tutorials/5-Feature-Engineering.html).

### Create your own transformer that, given a numerical variable and a number of buckets, returns the quantile bucket each observation falls into (so if we choose buckets = 4, it would return 1, 2, 3 or 4 depending on each observation being in the 1st, 2nd, 3rd or 4th quartile).

### Try putting your bucket transformer into a pipeline to make sure it works, and check if it improves the performance of your model.

**Hint**: You can use `ColumnSelector` as a template, and you can check pandas `qcut` for the actual binning.