In [7]:
!pip install sklearn-pandas

Collecting sklearn-pandas
  Downloading https://files.pythonhosted.org/packages/1f/48/4e1461d828baf41d609efaa720d20090ac6ec346b5daad3c88e243e2207e/sklearn_pandas-1.8.0-py2.py3-none-any.whl
Collecting numpy>=1.6.1 (from sklearn-pandas)
  Using cached https://files.pythonhosted.org/packages/1f/c7/198496417c9c2f6226616cff7dedf2115a4f4d0276613bab842ec8ac1e23/numpy-1.16.4-cp27-cp27mu-manylinux1_x86_64.whl
Collecting scikit-learn>=0.15.0 (from sklearn-pandas)
  Using cached https://files.pythonhosted.org/packages/f7/bb/52a01390c1dbb2c65d3072bc687271aa9ddf6964141ce7e03304820138f4/scikit_learn-0.20.3-cp27-cp27mu-manylinux1_x86_64.whl
Collecting pandas>=0.11.0 (from sklearn-pandas)
  Using cached https://files.pythonhosted.org/packages/db/83/7d4008ffc2988066ff37f6a0bb6d7b60822367dcb36ba5e39aa7801fda54/pandas-0.24.2-cp27-cp27mu-manylinux1_x86_64.whl
Collecting scipy>=0.14 (from sklearn-pandas)
  Using cached https://files.pythonhosted.org/packages/1d/f6/7c16d60aeb3694e5611976cb4f1eaf1c6b7f1e7c55

In [13]:
import pandas
import sklearn
from sklearn.linear_model import LinearRegression

from sklearn_pandas import DataFrameMapper
import sklearn

from sklearn.metrics import mean_squared_error

import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.externals import joblib
from io import StringIO

# Tiny inline CSV sample so the notebook runs without any external data file.
my_data = u"""film_id,year,length,budget,votes,genre,rating
1,1971,121,11.6,348,Comedy,5.2
2,1939,71,13.4,20,Comedy,7.9
3,1941,7,13.7,5,Animation,8.1
4,1996,70,14.8,6,Comedy,7.8
5,1975,71,13.5,17,Action,4.6
"""

# Expose the text through a file-like buffer and parse it into a DataFrame.
my_csv = StringIO(my_data)
movies = pandas.read_csv(my_csv)

# Last expression of the cell: renders the parsed frame.
movies

Unnamed: 0,film_id,year,length,budget,votes,genre,rating
0,1,1971,121,11.6,348,Comedy,5.2
1,2,1939,71,13.4,20,Comedy,7.9
2,3,1941,7,13.7,5,Animation,8.1
3,4,1996,70,14.8,6,Comedy,7.8
4,5,1975,71,13.5,17,Action,4.6


In [17]:
from sklearn.preprocessing import FunctionTransformer

def star_year_from_min(df, min_year=None):
    """Return a copy of ``df`` with ``year`` rewritten as an offset from a baseline.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``year`` column. It is not modified.
    min_year : number, optional
        Baseline subtracted from every year. When omitted (the default), the
        smallest non-missing year found in ``df`` itself is used. Pass the
        training baseline explicitly when scoring new rows so that offsets
        stay on the same scale the model was fitted on.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose ``year`` column holds ``year - min_year``.
        Missing years stay missing; an empty frame is returned unchanged
        apart from being copied.
    """
    # simple example, but any dataprep can be done here
    new_df = df.copy()
    years = new_df["year"]
    if min_year is None:
        # Series.min() skips NaN and yields NaN for an empty column, whereas
        # the builtin min() returns NaN whenever the first value is NaN and
        # raises ValueError on an empty sequence.
        min_year = years.min()
    # Vectorized subtraction instead of a per-element Python lambda.
    new_df["year"] = years - min_year
    return new_df
    
# Import the classes explicitly: a bare ``import sklearn`` does not load the
# ``pipeline`` submodule on its own, so ``sklearn.pipeline.Pipeline`` only
# resolves when some other import happens to have loaded it first.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer

# Wrap the dataprep function as a pipeline step. validate=False skips input
# validation so the function receives the DataFrame as-is.
# NOTE(review): star_year_from_min recomputes the minimum year from whatever
# frame it is handed, so at predict time the offset is relative to the scoring
# batch rather than to the training data (a single-row batch always maps to
# year offset 0). Confirm whether that is intended.
dataprep_transformer = FunctionTransformer(star_year_from_min, validate=False)

# Select the model features: 'year' is passed through untouched, 'genre' is
# expanded into one indicator column per category.
mapper = DataFrameMapper([
    ('year', None),
    ('genre', LabelBinarizer())
])

# The final step is registered under the key 'elastic_net' although it holds a
# plain LinearRegression; the key is kept because later cells look the
# estimator up via named_steps['elastic_net'].
pipe = Pipeline([
    ('dataprep', dataprep_transformer),
    ('featurize', mapper),
    ('elastic_net', LinearRegression())
])

# The whole frame is passed as X; the mapper keeps only 'year' and 'genre',
# and 'rating' is supplied separately as the regression target.
model = pipe.fit(X=movies, y=movies.rating.values)


In [34]:
# Pair each generated feature name with its fitted coefficient.
# list(...) materializes the pairs so the cell also displays them on Python 3,
# where zip() returns a lazy iterator instead of a list.
list(zip(
    model.named_steps['featurize'].transformed_names_,
    model.named_steps['elastic_net'].coef_
))

[('year', -0.005532870559412022),
 ('genre_Action', -1.8811691847012386),
 ('genre_Animation', 1.4307132162787515),
 ('genre_Comedy', 0.4504559684224858)]

In [18]:
# Persist the fitted pipeline to disk; the call returns the list of files written.
# NOTE: pickling stores the dataprep function by reference, so the process that
# loads this file must also have star_year_from_min defined/importable.
joblib.dump(value=model, filename='movies_model_v1.pkl')


['movies_model_v1.pkl']

In [21]:
## Example scoring of new data
from pandas import DataFrame

# Feature values for the single film to score.
year = 2010
genre = "Comedy"

# One-row frame carrying the two columns the pipeline's mapper selects.
test_data = DataFrame({'year': [year], 'genre': [genre]})
test_data

Unnamed: 0,genre,year
0,Comedy,2010


In [22]:
# Restore the pipeline saved earlier in this notebook and score the new row.
model_v1 = joblib.load(filename='movies_model_v1.pkl')
predicted_rating = model_v1.predict(test_data)
print(predicted_rating)

[7.13080849]


In [35]:
# Inspect the reloaded pipeline: both the feature names and the coefficients
# are read from model_v1, so this cell verifies the persisted artifact rather
# than the model still held in memory.
# list(...) materializes the pairs so the cell also displays them on Python 3.
list(zip(
    model_v1.named_steps['featurize'].transformed_names_,
    model_v1.named_steps['elastic_net'].coef_
))

[('year', -0.005532870559412022),
 ('genre_Action', -1.8811691847012386),
 ('genre_Animation', 1.4307132162787515),
 ('genre_Comedy', 0.4504559684224858)]