For this exercises we have to build a processing pipeline that processes the movies dataset. Its not a matter of copy pasting code, but of taking decisions on how to deal with each variable.

### We load the dataset

In [3]:
import pandas as pd
import numpy as np

movies = pd.read_csv("movies.1.initial_process.csv")
movies = movies[movies.status=="Released"]
del movies["status"]
del movies["release_date"]
movies.head()

Unnamed: 0,belongs_to_collection,budget,genre,original_language,popularity,production_company,production_country,revenue,runtime,title,vote_average,vote_count
0,Father of the Bride Collection,,Comedy,en,8.387519,Sandollar Productions,United States of America,76578911.0,106.0,Father of the Bride Part II,5.7,173.0
1,,,Drama,en,0.894647,Miramax,South Africa,676525.0,106.0,"Cry, the Beloved Country",6.7,13.0
2,Friday Collection,3500000.0,Comedy,en,14.56965,New Line Cinema,United States of America,28215918.0,91.0,Friday,7.0,513.0
3,,,Comedy,en,8.963037,Paramount Pictures,United States of America,32.0,87.0,Black Sheep,6.0,124.0
4,,12000000.0,Comedy,en,9.592265,Universal Pictures,United States of America,41205099.0,92.0,Happy Gilmore,6.5,767.0


In [4]:
null_counts = movies.isnull().sum()
null_counts[null_counts > 0].sort_values(ascending=False)

belongs_to_collection    1155
budget                    580
production_company        200
revenue                   137
production_country         98
genre                      46
runtime                     7
dtype: int64

In [5]:
movies = movies.dropna(axis = 0, subset =['revenue'])

In [6]:
null_counts = movies.isnull().sum()
null_counts[null_counts > 0].sort_values(ascending=False)

belongs_to_collection    1024
budget                    452
production_company        136
production_country         63
genre                      22
runtime                     6
dtype: int64

### Create a Pipeline that process the dataset. You have to make sure you deal accordingly with numerical, categorical and text variables.

In [7]:
from sklearn import preprocessing, feature_extraction

imputer = preprocessing.Imputer(strategy="mean")
scaler = preprocessing.StandardScaler()

In [8]:
from sklearn.pipeline import make_pipeline

In [9]:
target_variable = "revenue"
numerical_cols =  movies.drop(columns=target_variable).select_dtypes(np.number).columns
categorical_col = ['belongs_to_collection', 'genre', 'original_language', 'production_company', 'production_country', 'title']

In [10]:
from sklearn.base import BaseEstimator

class ColumnSelector(BaseEstimator):
    def __init__(self, cols=None, drop_axis=False):
        self.cols = cols
        self.drop_axis = drop_axis

    def fit_transform(self, X, y=None):
        return self.transform(X=X, y=y)

    def transform(self, X, y=None):
        if hasattr(X, 'loc'):
            #only pandas dataframes have the method loc
            t = X.loc[:, self.cols].values
        else:
            # its a numpy array
            t = X[:, self.cols]

        if t.shape[-1] == 1 and self.drop_axis:
            t = t.reshape(-1)
        if len(t.shape) == 1 and not self.drop_axis:
            t = t[:, np.newaxis]
        return t

    def fit(self, X, y=None):
        return self

In [11]:
numerical_col_selector = ColumnSelector(cols=numerical_cols)

In [12]:
numerical_pipeline = make_pipeline(
    numerical_col_selector,
    imputer,
    scaler
)

In [13]:
numerical_pipeline.fit_transform(movies)[:5]

array([[ 0.        ,  0.3814959 ,  0.10139724, -0.3267635 , -0.22703182],
       [ 0.        , -1.03915849,  0.10139724,  0.43720626, -0.53866313],
       [-0.84087746,  1.55363278, -0.63069   ,  0.66639719,  0.43518473],
       [ 0.        ,  0.49061457, -0.82591326, -0.09757257, -0.32246891],
       [-0.34144428,  0.60991669, -0.58188419,  0.28441231,  0.92989944]])

In [14]:
from category_encoders import OneHotEncoder

In [15]:
categorical_pipeline = make_pipeline(
     ColumnSelector(cols=categorical_col),
     OneHotEncoder()
)

In [16]:
from sklearn.pipeline import make_union

In [17]:
processing_pipeline = make_union(
    numerical_pipeline,
    categorical_pipeline,
)

### Transform the dataset

In [18]:
imputed = processing_pipeline.fit_transform(movies)
imputed

array([[ 0.        ,  0.3814959 ,  0.10139724, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        , -1.03915849,  0.10139724, ...,  0.        ,
         0.        ,  0.        ],
       [-0.84087746,  1.55363278, -0.63069   , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        , -1.12276277, -0.28904929, ...,  0.        ,
         0.        ,  0.        ],
       [-0.1064169 ,  0.65775164,  0.15020305, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.41578691, -0.38666092, ...,  0.        ,
         0.        ,  0.        ]])

### Create a Ridge estimator to predict a movies revenue based on the other features. What is the optimal value of alpha to minimize the RMSE? *Hint*: You can use validation curves to figure it out.

In [20]:
from sklearn.linear_model import LinearRegression

estimator = LinearRegression()
estimator_pipeline = make_pipeline(
    processing_pipeline,
    estimator
)

In [21]:
estimator_pipeline.fit(movies, movies[target_variable])

Pipeline(memory=None,
     steps=[('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('pipeline-1', Pipeline(memory=None,
     steps=[('columnselector', ColumnSelector(cols=Index(['budget', 'popularity', 'runtime', 'vote_average', 'vote_count'], dtype='object'),
        drop_axis=False)), ('imputer', Imputer(axi... ('linearregression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))])

In [23]:
estimator_pipeline.predict(movies)[:5]


array([7.65789110e+07, 6.76525000e+05, 2.82159180e+07, 3.20000001e+01,
       4.12050990e+07])

In [21]:
from sklearn.model_selection import cross_val_score

In [23]:
cross_val_score(estimator_pipeline, 
                X=movies.drop(target_variable, axis=1), 
                y=movies[target_variable],
                scoring='neg_mean_absolute_error', 
                cv=3
).mean()

-25136543.756815344

alpha value 

In [1]:
from sklearn.model_selection import learning_curve

In [25]:
imputed_df = pd.DataFrame(imputed,
                         index = movies.index,
                         columns = movies.columns)
imputed_df.head(10)

ValueError: Shape of passed values is (1966, 1205), indices imply (12, 1205)

In [None]:
train_sizes, train_scores, test_scores = learning_curve(
        estimator_ols,
        X,
        y,
        cv=5,
        n_jobs=-1, 
        scoring="neg_mean_squared_error",
        train_sizes=np.linspace(0.01, 1., 10))

### Remember when we did exploratory data analyses and we groupd the numerical variables into quintiles? That is a valid technique used in Machine Learning to expand a dataset, it is called [Binning or Bucketing](http://blog.yhat.com/tutorials/5-Feature-Engineering.html).

### Create your own transformer that given a numerical variable and a number of buckets returns the specificed quartile (so if we choose buckets = 4, it would return 1, 2,3 or 4 depending on each observation being on the 1st, 2nd, 3rd or 4th quartile).

### Try putting your bucket transformer into a pipeline to make sure it works, and check if it improves the performance of your model.

**Hint**: You can use `ColumnSelector` as a template, and you can check pandas `qcut` for the actual binning.