In [1]:
import numpy as np
import pandas as pd
import csv
import sklearn as skl
import tensorflow as tf

In [2]:
# Read in the demonstration data and keep only a few variables
input_df=pd.read_csv("Resources/segment1_input.csv", low_memory=False)
print(input_df.describe())
print(input_df.columns)


             budget             id    popularity       revenue       runtime  \
count  1.290600e+04   12906.000000  12906.000000  1.290600e+04  12902.000000   
mean   1.254855e+07  107530.588951      5.411736  3.309889e+07     96.528058   
std    3.020407e+07  123459.390616      9.775813  1.138658e+08     31.869326   
min    0.000000e+00       5.000000      0.000000  0.000000e+00      0.000000   
25%    0.000000e+00   14913.250000      1.013951  0.000000e+00     87.000000   
50%    0.000000e+00   45188.500000      3.372783  0.000000e+00     95.000000   
75%    1.000000e+07  173995.000000      7.888230  5.376946e+06    105.000000   
max    3.800000e+08  464819.000000    547.488298  2.787965e+09    877.000000   

       vote_average    vote_count  release_year  
count  12906.000000  12906.000000  12906.000000  
mean       5.639648    291.211917   2006.070820  
std        1.558221    850.228648      7.619696  
min        0.000000      0.000000   1990.000000  
25%        5.000000      8.00

In [3]:
# select columns to keep for model
df=input_df[["belongs_to_collection", "homepage", "runtime", "vote_average", "release_year"]]
df.dtypes

belongs_to_collection     object
homepage                  object
runtime                  float64
vote_average             float64
release_year             float64
dtype: object

In [4]:
df.head(20)

Unnamed: 0,belongs_to_collection,homepage,runtime,vote_average,release_year
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",http://toystory.disney.com/toy-story,81.0,7.7,1995.0
1,,,104.0,6.9,1995.0
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",,101.0,6.5,1995.0
3,,,127.0,6.1,1995.0
4,"{'id': 96871, 'name': 'Father of the Bride Col...",,106.0,5.7,1995.0
5,,,170.0,7.7,1995.0
6,,,127.0,6.2,1995.0
7,,,97.0,5.4,1995.0
8,,,106.0,5.5,1995.0
9,"{'id': 645, 'name': 'James Bond Collection', '...",http://www.mgm.com/view/movie/757/Goldeneye/,130.0,6.6,1995.0


In [5]:
# convert belongs_to_collection, homepage and success to binary(0,1)
print(df["belongs_to_collection"].notnull().sum())
collection=df["belongs_to_collection"].notnull().replace([True, False], [1, 0])
collection.name="collection"
print(collection.value_counts())
print(df["homepage"].notnull().sum())
website=df["homepage"].notnull().replace([True, False], [1, 0])
website.name="website"
print(website.value_counts())
success=df["vote_average"].apply(lambda z: 1 if z > 5 else 0)
success.name="success"
print(success.value_counts())
clean_df = pd.concat([df, collection, website, success], axis=1, ignore_index=False, join="inner")
clean_df.drop(["belongs_to_collection", "homepage", "vote_average"], axis=1, inplace=True)
print(clean_df.columns)

1797
0    11109
1     1797
Name: collection, dtype: int64
3928
0    8978
1    3928
Name: website, dtype: int64
1    9563
0    3343
Name: success, dtype: int64
Index(['runtime', 'release_year', 'collection', 'website', 'success'], dtype='object')


In [6]:
from sklearn.preprocessing import OneHotEncoder
# Create a OneHotEncoder instance
enc =OneHotEncoder(sparse=False)
# Fit and transform the OneHotEncoder using the categorical variable list
encode_df =pd.DataFrame(enc.fit_transform(clean_df[["collection", "website"]]))
# Add the encoded variable names to the DataFrame
encode_df.columns =enc.get_feature_names(["collection", "website"])
encode_df.head()

Unnamed: 0,collection_0,collection_1,website_0,website_1
0,0.0,1.0,0.0,1.0
1,1.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0
3,1.0,0.0,1.0,0.0
4,0.0,1.0,1.0,0.0


In [7]:
# Merge one-hot encoded features and drop the originals
clean_df =clean_df.merge(encode_df,left_index=True,right_index=True)
clean_df.head()

Unnamed: 0,runtime,release_year,collection,website,success,collection_0,collection_1,website_0,website_1
0,81.0,1995.0,1,1,1,0.0,1.0,0.0,1.0
1,104.0,1995.0,0,0,1,1.0,0.0,1.0,0.0
2,101.0,1995.0,1,0,1,0.0,1.0,1.0,0.0
3,127.0,1995.0,0,0,1,1.0,0.0,1.0,0.0
4,106.0,1995.0,1,0,1,0.0,1.0,1.0,0.0


In [25]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
# Split our preprocessed data into our features and target arrays
y =clean_df["success"].values
X =clean_df.drop(["success"],1).values
# Split the preprocessed data into a training and testing dataset
X_train,X_test,y_train,y_test =train_test_split(X,y,random_state=78)

In [26]:
# Use sklearn to split dataset
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test =train_test_split(X,y,random_state=78)



In [27]:
from sklearn.preprocessing import StandardScaler
# Create scaler instance
X_scaler =skl.preprocessing.StandardScaler()
# Fit the scaler
X_scaler.fit(X_train)
# Scale the data
X_train_scaled =pd.DataFrame(X_scaler.transform(X_train))
X_test_scaled =pd.DataFrame(X_scaler.transform(X_test))
print(X_train_scaled.head())
print(X_test_scaled.head())
print(type(X_train_scaled))


          0         1         2         3         4         5         6  \
0 -0.934456 -1.712505  2.493541 -0.659859 -2.493541  2.493541  0.659859   
1  0.078325 -1.712505 -0.401036 -0.659859  0.401036 -0.401036  0.659859   
2 -0.143221  0.121920 -0.401036 -0.659859  0.401036 -0.401036  0.659859   
3 -0.269818 -1.974566 -0.401036 -0.659859  0.401036 -0.401036  0.659859   
4 -0.301468  0.515011 -0.401036  1.515474  0.401036 -0.401036 -1.515474   

          7  
0 -0.659859  
1 -0.659859  
2 -0.659859  
3 -0.659859  
4  1.515474  
          0         1         2         3         4         5         6  \
0  0.109975  0.252950 -0.401036  1.515474  0.401036 -0.401036 -1.515474   
1 -0.111571  1.432223 -0.401036 -0.659859  0.401036 -0.401036  0.659859   
2  0.679665 -0.140141 -0.401036 -0.659859  0.401036 -0.401036  0.659859   
3 -0.206519  0.515011  2.493541 -0.659859 -2.493541  2.493541  0.659859   
4  0.015027  0.121920 -0.401036 -0.659859  0.401036 -0.401036  0.659859   

          7  


In [40]:
# Create the Keras Sequential model
number_input_features =len(X_train_scaled[0])
hidden_nodes_layer1 = 20
nn_model =tf.keras.models.Sequential()

In [41]:
# Add our first Dense layer, including the input layer
nn_model.add(tf.keras.layers.Dense(units=hidden_nodes_layer1,activation="relu",input_dim=number_input_features))
# Add the output layer that uses a probability activation function
nn_model.add(tf.keras.layers.Dense(units=1,activation="sigmoid"))

In [42]:
# Check the structure of the Sequential model
nn_model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 20)                193600    
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 21        
Total params: 193,621
Trainable params: 193,621
Non-trainable params: 0
_________________________________________________________________


In [43]:
# Compile the Sequential model together and customize metrics
nn_model.compile(loss="binary_crossentropy",optimizer="adam",metrics=["accuracy"])

In [44]:
# Fit the model to the training data
y_train_df=pd.DataFrame(y_train)
fit_model =nn_model.fit(X_train_scaled,y_train_df,epochs=10)

Epoch 1/10


ValueError: in user code:

    C:\Users\kathy\anaconda3\envs\py37nbext\lib\site-packages\tensorflow\python\keras\engine\training.py:806 train_function  *
        return step_function(self, iterator)
    C:\Users\kathy\anaconda3\envs\py37nbext\lib\site-packages\tensorflow\python\keras\engine\training.py:796 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\kathy\anaconda3\envs\py37nbext\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\kathy\anaconda3\envs\py37nbext\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\kathy\anaconda3\envs\py37nbext\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\kathy\anaconda3\envs\py37nbext\lib\site-packages\tensorflow\python\keras\engine\training.py:789 run_step  **
        outputs = model.train_step(data)
    C:\Users\kathy\anaconda3\envs\py37nbext\lib\site-packages\tensorflow\python\keras\engine\training.py:747 train_step
        y_pred = self(x, training=True)
    C:\Users\kathy\anaconda3\envs\py37nbext\lib\site-packages\tensorflow\python\keras\engine\base_layer.py:976 __call__
        self.name)
    C:\Users\kathy\anaconda3\envs\py37nbext\lib\site-packages\tensorflow\python\keras\engine\input_spec.py:216 assert_input_compatibility
        ' but received input with shape ' + str(shape))

    ValueError: Input 0 of layer sequential_3 is incompatible with the layer: expected axis -1 of input shape to have value 9679 but received input with shape [None, 8]
