In [1]:
import numpy as np
import pandas as pd
import csv
import sklearn as skl
import tensorflow as tf

In [2]:
# Read in the demonstration data and keep only a few variables
input_df=pd.read_csv("Resources/segment1_input.csv", low_memory=False)
print(input_df.describe())
print(input_df.columns)


             budget             id    popularity       revenue       runtime  \
count  1.290600e+04   12906.000000  12906.000000  1.290600e+04  12902.000000   
mean   1.254855e+07  107530.588951      5.411736  3.309889e+07     96.528058   
std    3.020407e+07  123459.390616      9.775813  1.138658e+08     31.869326   
min    0.000000e+00       5.000000      0.000000  0.000000e+00      0.000000   
25%    0.000000e+00   14913.250000      1.013951  0.000000e+00     87.000000   
50%    0.000000e+00   45188.500000      3.372783  0.000000e+00     95.000000   
75%    1.000000e+07  173995.000000      7.888230  5.376946e+06    105.000000   
max    3.800000e+08  464819.000000    547.488298  2.787965e+09    877.000000   

       vote_average    vote_count  release_year  
count  12906.000000  12906.000000  12906.000000  
mean       5.639648    291.211917   2006.070820  
std        1.558221    850.228648      7.619696  
min        0.000000      0.000000   1990.000000  
25%        5.000000      8.00

In [3]:
# select columns to keep for model
df=input_df[["belongs_to_collection", "homepage", "runtime", "vote_average", "release_year"]]
df.dtypes

belongs_to_collection     object
homepage                  object
runtime                  float64
vote_average             float64
release_year             float64
dtype: object

In [4]:
df.head(20)

Unnamed: 0,belongs_to_collection,homepage,runtime,vote_average,release_year
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",http://toystory.disney.com/toy-story,81.0,7.7,1995.0
1,,,104.0,6.9,1995.0
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",,101.0,6.5,1995.0
3,,,127.0,6.1,1995.0
4,"{'id': 96871, 'name': 'Father of the Bride Col...",,106.0,5.7,1995.0
5,,,170.0,7.7,1995.0
6,,,127.0,6.2,1995.0
7,,,97.0,5.4,1995.0
8,,,106.0,5.5,1995.0
9,"{'id': 645, 'name': 'James Bond Collection', '...",http://www.mgm.com/view/movie/757/Goldeneye/,130.0,6.6,1995.0


In [41]:
# convert belongs_to_collection, homepage and success to binary(0,1)
print(df["belongs_to_collection"].notnull().sum())
collection=df["belongs_to_collection"].notnull().replace([True, False], ["yes", "no"])
collection.name="collection"
print(collection.value_counts())
print(df["homepage"].notnull().sum())
website=df["homepage"].notnull().replace([True, False], ["yes", "no"])
website.name="website"
print(website.value_counts())
success=df["vote_average"].apply(lambda z: "yes" if z > 5 else "no")
success.name="success"
print(success.value_counts())
clean_df = pd.concat([df, collection, website, success], axis=1, ignore_index=False, join="inner")
clean_df.drop(["belongs_to_collection", "homepage", "vote_average"], axis=1, inplace=True)
print(clean_df.columns)
print(clean_df.dtypes)
print(clean_df.head())

1797
no     11109
yes     1797
Name: collection, dtype: int64
3928
no     8978
yes    3928
Name: website, dtype: int64
yes    9563
no     3343
Name: success, dtype: int64
Index(['runtime', 'release_year', 'collection', 'website', 'success'], dtype='object')
runtime         float64
release_year    float64
collection       object
website          object
success          object
dtype: object
   runtime  release_year collection website success
0     81.0        1995.0        yes     yes     yes
1    104.0        1995.0         no      no     yes
2    101.0        1995.0        yes      no     yes
3    127.0        1995.0         no      no     yes
4    106.0        1995.0        yes      no     yes


In [42]:
clean_cat=clean_df.dtypes[clean_df.dtypes == "object"].index.tolist()
print(clean_cat)

['collection', 'website', 'success']


In [44]:
from sklearn.preprocessing import OneHotEncoder
# Create a OneHotEncoder instance
enc =OneHotEncoder(sparse=False)
# Fit and transform the OneHotEncoder using the categorical variable list
encode_df =pd.DataFrame(enc.fit_transform(clean_df[clean_cat]))
# Add the encoded variable names to the DataFrame
encode_df.columns =enc.get_feature_names(clean_cat)
encode_df.head()

Unnamed: 0,collection_no,collection_yes,website_no,website_yes,success_no,success_yes
0,0.0,1.0,0.0,1.0,0.0,1.0
1,1.0,0.0,1.0,0.0,0.0,1.0
2,0.0,1.0,1.0,0.0,0.0,1.0
3,1.0,0.0,1.0,0.0,0.0,1.0
4,0.0,1.0,1.0,0.0,0.0,1.0


In [45]:
# Merge one-hot encoded features and drop the originals
clean_df =clean_df.merge(encode_df,left_index=True,right_index=True)
print(clean_df.head())
clean_df=clean_df.drop(clean_cat,axis=1, inplace=True)
print(clean_df.head())

   runtime  release_year collection website success  collection_no  \
0     81.0        1995.0        yes     yes     yes            0.0   
1    104.0        1995.0         no      no     yes            1.0   
2    101.0        1995.0        yes      no     yes            0.0   
3    127.0        1995.0         no      no     yes            1.0   
4    106.0        1995.0        yes      no     yes            0.0   

   collection_yes  website_no  website_yes  success_no  success_yes  
0             1.0         0.0          1.0         0.0          1.0  
1             0.0         1.0          0.0         0.0          1.0  
2             1.0         1.0          0.0         0.0          1.0  
3             0.0         1.0          0.0         0.0          1.0  
4             1.0         1.0          0.0         0.0          1.0  
   runtime  release_year  collection_no  collection_yes  website_no  \
0     81.0        1995.0            0.0             1.0         0.0   
1    104.0       

In [51]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# Split our preprocessed data into our features and target arrays
y =clean_df["success_yes"].values
X =clean_df.drop(["success_yes","success_no"],1).values
# Split the preprocessed data into a training and testing dataset
X_train,X_test,y_train,y_test =train_test_split(X,y,random_state=78)

In [52]:
print(type(X_train))
print(X_train)
print(y_train)

<class 'numpy.ndarray'>
[[6.700e+01 1.993e+03 0.000e+00 1.000e+00 1.000e+00 0.000e+00]
 [9.900e+01 1.993e+03 1.000e+00 0.000e+00 1.000e+00 0.000e+00]
 [9.200e+01 2.007e+03 1.000e+00 0.000e+00 1.000e+00 0.000e+00]
 ...
 [1.160e+02 2.011e+03 1.000e+00 0.000e+00 1.000e+00 0.000e+00]
 [9.400e+01 1.996e+03 1.000e+00 0.000e+00 1.000e+00 0.000e+00]
 [9.800e+01 2.012e+03 1.000e+00 0.000e+00 1.000e+00 0.000e+00]]
[0. 1. 1. ... 1. 0. 1.]


In [54]:
from sklearn.preprocessing import StandardScaler
# Create scaler instance
X_scaler =skl.preprocessing.StandardScaler()
# Fit the scaler
X_scaler.fit(X_train)
# Scale the data
X_train_scaled =X_scaler.transform(X_train)
X_test_scaled =X_scaler.transform(X_test)
print(X_train_scaled)
print(X_test_scaled)
print(type(X_train_scaled))

[[-0.93445643 -1.7125053  -2.49354125  2.49354125  0.65985946 -0.65985946]
 [ 0.07832548 -1.7125053   0.40103608 -0.40103608  0.65985946 -0.65985946]
 [-0.14322056  0.12191955  0.40103608 -0.40103608  0.65985946 -0.65985946]
 ...
 [ 0.61636587  0.64604093  0.40103608 -0.40103608  0.65985946 -0.65985946]
 [-0.07992169 -1.31941426  0.40103608 -0.40103608  0.65985946 -0.65985946]
 [ 0.04667605  0.77707128  0.40103608 -0.40103608  0.65985946 -0.65985946]]
[[ 0.10997492  0.25294989  0.40103608 -0.40103608 -1.51547422  1.51547422]
 [-0.11157113  1.43222301  0.40103608 -0.40103608  0.65985946 -0.65985946]
 [ 0.67966474 -0.14014115  0.40103608 -0.40103608  0.65985946 -0.65985946]
 ...
 [-1.78899116 -1.18838391  0.40103608 -0.40103608  0.65985946 -0.65985946]
 [-0.11157113  1.30119266  0.40103608 -0.40103608 -1.51547422  1.51547422]
 [-0.14322056  1.03913197  0.40103608 -0.40103608  0.65985946 -0.65985946]]
<class 'numpy.ndarray'>


In [73]:
# Create the Keras Sequential model
number_input_features =len(X_train_scaled[0])
hidden_nodes_layer1 = 10 
nn =tf.keras.models.Sequential()

In [74]:
# Add our first Dense layer, including the input layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1,activation="relu",input_dim=number_input_features))
# Add the output layer that uses a probability activation function
nn.add(tf.keras.layers.Dense(units=1,activation="sigmoid"))

In [75]:
# Check the structure of the Sequential model
nn.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 10)                70        
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 11        
Total params: 81
Trainable params: 81
Non-trainable params: 0
_________________________________________________________________


In [76]:
# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy",optimizer="adam",metrics=["accuracy"])

In [77]:
# Fit the model to the training data
fit_model =nn.fit(X_train_scaled,y_train,epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [78]:
# Evaluate the model using the test data
model_loss,model_accuracy =nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

101/101 - 0s - loss: nan - accuracy: 0.2727
Loss: nan, Accuracy: 0.2726990878582001
