In [1]:
import warnings
# Suppress every warning for the rest of the session so library warnings
# do not clutter the cell outputs.
warnings.simplefilter("ignore")

## Escalamiento de los datos

In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
from sklearn.model_selection import train_test_split

# Load the feature table. 'worldwide_gross' is the regression target; every
# other column is used as a predictor.
X = pd.read_csv('../datasets/X.csv')
y = X['worldwide_gross']
X = X.drop('worldwide_gross', axis=1)

# Fix the seed: without random_state the split (and therefore every score
# below) changes on each run, so Restart & Run All would not be reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [4]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only. fit() returns the fitted
# estimator, which is echoed as the cell output.
scaler = StandardScaler().fit(X_train)
scaler

In [5]:
# Per-column means the scaler learned from X_train.
scaler.mean_

array([3.30754303e+07, 2.58476835e+05, 4.08569242e+05, 2.95943139e+05,
       1.06190279e+04, 3.51669331e+07, 6.47141001e+00])

In [6]:
# Per-column scaling factors (standard deviations) the scaler learned from X_train.
scaler.scale_

array([4.12691987e+07, 1.07192805e+07, 1.19691080e+07, 1.18941361e+07,
       1.97832408e+04, 7.49018393e+07, 1.06450867e+00])

In [7]:
# Unscaled feature matrix as a NumPy array, for comparison with the scaled values.
X.values

array([[4.25000000e+08, 2.00900000e+03, 1.78000000e+00, ...,
        4.83400000e+03, 2.37000000e+08, 7.90000000e+00],
       [3.06000000e+08, 5.91165594e+08, 5.91165594e+08, ...,
        1.43000000e+02, 5.91165594e+08, 7.10000000e+00],
       [3.00000000e+08, 2.00700000e+03, 2.35000000e+00, ...,
        4.83500000e+04, 3.00000000e+08, 7.10000000e+00],
       ...,
       [7.00000000e+03, 2.00500000e+03, 1.90568571e+03, ...,
        9.30000000e+01, 3.25000000e+03, 7.80000000e+00],
       [3.96700000e+03, 2.01200000e+03, 2.35000000e+00, ...,
        2.38600000e+03, 3.66995625e+03, 6.30000000e+00],
       [1.10000000e+03, 2.00400000e+03, 1.85000000e+00, ...,
        1.63000000e+02, 1.10000000e+03, 6.60000000e+00]])

In [8]:
# Training features after standardization with the fitted scaler.
scaler.transform(X_train)

array([[ 2.6393672 , -0.0239265 , -0.03413512, ..., -0.40847847,
         1.42630766, -0.34890275],
       [-0.55914413, -0.02392678, -0.03413512, ..., -0.46847875,
        -0.33599887,  1.43595823],
       [-0.7529933 , -0.02392668, -0.03413516, ..., -0.47833558,
        -0.46418536,  0.77837787],
       ...,
       [ 0.77356893, -0.02392556, -0.03413512, ...,  0.3686945 ,
         0.39829552, -1.00648312],
       [-0.56156725, -0.02392575, -0.03413512, ..., -0.3913933 ,
        -0.33599887,  0.02685745],
       [-0.51068184, -0.02392584, -0.03413512, ..., -0.51488166,
        -0.30929725,  0.40261766]])

In [None]:
# Apply the scaler fitted on the training split to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import Lasso

# Two identically configured Lasso regressors: `model` is trained on the raw
# features, `model_scaled` on the standardized ones. fit() returns the
# estimator itself, so construction and training can be chained.
model = Lasso().fit(X_train, y_train)
model_scaled = Lasso().fit(X_train_scaled, y_train)
model_scaled

In [None]:
# Held-out score (R^2 for scikit-learn regressors) of each model:
# raw features first, standardized features second.
print(model.score(X_test,y_test))
print(model_scaled.score(X_test_scaled,y_test))

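Los modelos de regresión lineal sin regularización no se ven afectados por el escalamiento de las features; en modelos regularizados como Lasso, el escalamiento sí puede cambiar el resultado, por lo que conviene comparar ambos scores.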

## Simplificar las transformaciones con pipelines

In [None]:
from sklearn.pipeline import make_pipeline

# Bundle standardization and the regressor into one estimator, so the scaler
# is fitted and applied automatically whenever the pipeline is trained or used.
model_scaled = make_pipeline(StandardScaler(), Lasso()).fit(X_train, y_train)
model_scaled

In [None]:
# The pipeline takes the raw test features; scaling happens inside it.
print(model_scaled.score(X_test,y_test))

## Crear nuevas features de forma automática

In [None]:
# Small 3x2 example matrix holding the integers 0..5, filled row by row.
A = np.arange(6).reshape(-1, 2)
A

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the toy matrix with all polynomial terms up to degree 2.
transformer = PolynomialFeatures(degree=2)
transformer.fit_transform(A)

In [None]:
# (rows, columns) of the original feature table, before any expansion.
X.shape

In [None]:
# Shape after the degree-2 expansion of the real features, to see how many
# columns the transformer generates.
transformer = PolynomialFeatures(degree=2)
transformer.fit_transform(X).shape

In [25]:
# Degree-2 polynomial expansion of the raw features followed by Lasso,
# trained on the training split and scored on the held-out split.
model_poly = make_pipeline(PolynomialFeatures(degree=2), Lasso())
model_poly.fit(X_train, y_train)
model_poly.score(X_test, y_test)



-7401097975.623044

In [27]:
# Baseline for comparison: plain Lasso on the original features.
# fit() returns the estimator, so it can be chained with the constructor.
model = Lasso().fit(X_train, y_train)
model.score(X_test, y_test)



0.57593747144784135

## Crear features categóricas

In [31]:
# Load the intermediate movies_obj table (columns such as color, language,
# genres, and cast/director names).
movies_obj = pd.read_csv('../vol/intermediate_results/movies_obj.csv')

In [33]:
# Number of distinct values per column, from lowest to highest cardinality.
movies_obj.nunique().sort_values()

color                2
content_rating      18
language            47
country             65
genres             914
actor_1_name      2097
director_name     2398
actor_2_name      3032
actor_3_name      3521
plot_keywords     4760
movie_title       4917
dtype: int64

## Encoding Binario

In [34]:
!pip install category_encoders



In [35]:
# Read the categorical columns and use the saved 'Unnamed: 0' column as the index.
categoricals = pd.read_csv('../vol/intermediate_results/categoricals.csv')
categoricals = categoricals.set_index('Unnamed: 0')

In [36]:
# Preview the first two rows of the categorical table.
categoricals.head(2)

Unnamed: 0_level_0,actor_1_name,director_name
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,CCH Pounder,James Cameron
1,Doug Walker,Doug Walker


In [37]:
# Replace the loaded index with a fresh 0..n-1 index and fill missing
# values with 0.
categoricals = (
    categoricals
    .reset_index(drop=True)
    .fillna(0)
)

In [38]:
# Place the categorical columns next to the numeric features, aligned on the index.
X_binenc = pd.concat([X, categoricals], axis='columns')

In [39]:
# Preview the combined numeric + categorical frame.
X_binenc.head()

Unnamed: 0,production_budget,title_year,aspect_ratio,duration.1,cast_total_facebook_likes,budget,imdb_score,actor_1_name,director_name
0,425000000.0,2009.0,1.78,178.0,4834.0,237000000.0,7.9,CCH Pounder,James Cameron
1,306000000.0,591165600.0,591165600.0,591165600.0,143.0,591165600.0,7.1,Doug Walker,Doug Walker
2,300000000.0,2007.0,2.35,169.0,48350.0,300000000.0,7.1,Johnny Depp,Gore Verbinski
3,300000000.0,2015.0,2.35,148.0,11700.0,245000000.0,6.8,Christoph Waltz,Sam Mendes
4,275000000.0,2012.0,2.35,164.0,106759.0,250000000.0,8.5,Tom Hardy,Christopher Nolan


In [40]:
import category_encoders as ce
# Binary encoder restricted to the two name columns; the other columns are not listed for encoding.
encoder = ce.BinaryEncoder(cols=['actor_1_name','director_name'])

In [42]:
# Shape after binary encoding, to see how many columns the two encoded features expand into.
encoder.fit_transform(X_binenc).shape

(4104, 29)

In [43]:
# Encode a frame rebuilt from X and categoricals instead of overwriting
# X_binenc with its own encoding. After one run X_binenc no longer holds the
# raw 'actor_1_name' / 'director_name' columns, so re-running the original
# self-referential cell would not reproduce this result.
X_binenc = encoder.fit_transform(pd.concat([X, categoricals], axis=1))

In [44]:
# Re-split using the encoded frame; y_train / y_test are replaced so they
# match the new row selection. The seed is fixed so the split and the
# scores computed from it are reproducible across runs.
Xb_train, Xb_test, y_train, y_test = train_test_split(X_binenc, y, random_state=42)

In [45]:
# Numeric-only views of the same split rows, so both models are compared on
# identical train/test rows.
X_train = Xb_train[X.columns]
X_test = Xb_test[X.columns]

In [46]:
# Two fresh Lasso estimators: model_binenc for the binary-encoded features,
# model for the numeric-only baseline.
model_binenc = Lasso()
model = Lasso()

In [47]:
# Train both models on the same rows: encoded features vs. numeric-only features.
model_binenc.fit(Xb_train,y_train)
model.fit(X_train,y_train)



Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [48]:
# Held-out scores: binary-encoded model first, numeric-only baseline second.
print(model_binenc.score(Xb_test,y_test))
print(model.score(X_test,y_test))

0.623169426585
0.624091664736


No aumentamos el rendimiento de nuestro algoritmo de forma significativa: el score con encoding binario es prácticamente igual al del modelo base. Mantengamos entonces la dimensionalidad de nuestro espacio de features baja, y vamos a buscar modelos más complejos.

## Más datos de calidad

Contamos con la base de datos de ganancias de las películas en el primer fin de semana de exhibición, así como la cantidad de cines en los que fueron estrenadas.

In [49]:
# Preview the opening-weekend dataset; the result is only displayed, not stored.
pd.read_csv('../vol/datasets/opening_df.csv').head()

Unnamed: 0.1,Unnamed: 0,movie_title,opening_gross,screens
0,0,10 Days in a Madhouse,2451.0,10.0
1,1,10 Things I Hate About You,8330681.0,2271.0
2,2,102 Dalmatians,19883351.0,2704.0
3,3,12 Rounds,5329240.0,2331.0
4,4,12 Years a Slave,923715.0,19.0
