# Learning Model Segment 3

## This version of the model looks at incorporating director data as a feature:

### new features:
* count of movies by director
* success rate by director

In [1]:
# Import our dependencies
import numpy as np
import pandas as pd
import re
import csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import accuracy_score
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Choose type of model to run:
model_type = 'LR'  # LR = logistic regression, NN = neural net, SV = support vector machine
# SVM is not running for some reason???? doesn't finish fit step?
def select_model(mtype):
    if mtype == 'LR':
        return 'U'
    elif mtype == 'NN':
        return 'S'
    elif mtype == 'SV':
        return 'S'

dataset = select_model(model_type)
print(model_type, dataset)

LR U


## Select the dataframe

In [3]:
# input merged dataset and select the appropriate dataframe
df=pd.read_csv("Resources/merged_movies.csv", low_memory=False)

# drop genre variables from TMDB 
df.drop(['g_Action_TM', 'g_Adventure_TM', 'g_Animation_TM', 'g_Comedy_TM', 'g_Crime_TM', 
       'g_Documentary_TM', 'g_Drama_TM', 'g_Family_TM', 'g_Fantasy_TM', 'g_History_TM', 'g_Horror_TM',
       'g_Music_TM', 'g_Mystery_TM', 'g_News', 'g_Reality_TV', 'g_Romance_TM',
       'g_Sci_Fi', 'g_Sport', 'g_Thriller_TM','g_War_TM', 'g_Western_TM'], axis=1, inplace=True)

# drop columns with high percentage of missing values, and not needed
df.drop(["budget_IM", "budget_TM", "collection", "g_Foreign", "genre_name", "metascore", "orig_lang_cd", 
              "original_language", "popularity", "release_year_TM", "revenue", "runtime", "usa_gross_income", 
              "website", "worlwide_gross_income", "year"], axis=1, inplace=True)

# drop columns with too many unique values 
df.drop(["_merge", "country", "genre_list", "imdb_id", "language", "title"], axis=1, inplace=True)

# Drop any rows with missing values. Reassess whether the nulls can/should be recoded and kept.
df.dropna(axis=0, how="any", inplace=True)

# Choose dependent = predicted value:
# Options: avg_vote, reviews_from_critics, reviews_from_users, votes
# For this model pass, use avg_vote >= 7 as success
df["success"] = df["avg_vote"].map(lambda x: 1 if x>= 7.0 else 0)

# Drop unneeded dependent variables
df.drop(["avg_vote", "reviews_from_users", "reviews_from_critics", "votes"], axis=1, inplace=True)

# Categorical variables for encoding, if needed
cat_vars = ['g_Action_IM', 'g_Adult', 'g_Adventure_IM',
       'g_Animation_IM', 'g_Biography', 'g_Comedy_IM', 'g_Crime_IM',
       'g_Documentary_IM', 'g_Drama_IM', 'g_Family_IM', 'g_Fantasy_IM',
       'g_History_IM', 'g_Horror_IM', 'g_Music_IM', 'g_Musical',
       'g_Mystery_IM', 'g_Romance_IM', 'g_Thriller_IM', 'g_War_IM',
       'g_Western_IM']

print(df.columns)

Index(['director_list', 'duration', 'g_Action_IM', 'g_Adult', 'g_Adventure_IM',
       'g_Animation_IM', 'g_Biography', 'g_Comedy_IM', 'g_Crime_IM',
       'g_Documentary_IM', 'g_Drama_IM', 'g_Family_IM', 'g_Fantasy_IM',
       'g_History_IM', 'g_Horror_IM', 'g_Music_IM', 'g_Musical',
       'g_Mystery_IM', 'g_Romance_IM', 'g_Thriller_IM', 'g_War_IM',
       'g_Western_IM', 'release_year_IM', 'success'],
      dtype='object')


## Feature Extraction

In [4]:
# pd.set_option("max_rows", 20)

# extract info from director feature
# there are 52,754 movies with 1 director, and 3,777 movies with 2 directors
# there are 26,593 unique values of "director" (counting a pair of directors as a single value)

# hypothesis: more movies associated with a director indicates the director is considered more worthy of investment, 
# so considered by investors to be more successful?
# 58% of directors have 1 movie
# 17% of directors have 2 movies
#  7% of directors have 3 movies

# hypothesis: can calculate success ratio for each director and use as a feature in the model     


In [5]:
# Extract director names from director_list and put in two columns

# Issue = names with a single quote are inside double quotes
# in merged file looks like: Joe D'Amato""]"
# see how this is handled in database
# may need to modify the search based on the contents of the database

# There's probably a better way of dealing with lists in a df column, this is a rather slow

directors = df["director_list"].copy() # series of directors in a string, index=original
directors_list = directors.apply(eval) # convert series of strings to series of lists of directors, index=original
directors_cols = pd.DataFrame(index=df.index.copy())  # initialize df to hold directors names, index=original

# Create dataframe with director name as index, count of movies, count of successes, win_rate

# Output list of directors into 2 columns
# there must be a better way than this:

# To iterate over items in a pandas Series use series.iterrows() function which returns an iterator, a tuple
# yielding index and value for each item
# Series.iteritems(self)
# index – index of the series
# value – value is the contents of the series item
# it – it is the generator that iterates over the rows of DataFrame.

# index is class int
# value is class list
# value contains up to 2 directors
      
for index, row  in directors_list.iteritems():
    n = len(row)
    if n == 1:
        directors_cols.loc[index, 0] = row[0]
    else:
        directors_cols.loc[index, 0] = row[0]
        directors_cols.loc[index, 1] = row[1]
        
# Update df with directors in two columns. Drop the original string column
print(f'\n df columns {df.columns}')
print(f'\n directors cols {directors_cols.columns}')
print(f'\n length {len(df)}')
directors_cols.rename(columns={0:'Director_1', 1:'Director_2'}, inplace=True)
print(f'\n length: {len(directors_cols)}')

df = pd.concat([df, directors_cols], axis=1, ignore_index=False, join="inner")
df.drop(["director_list"], axis=1, inplace=True) 
print(f'\n length df  {len(df)}')
print(f'\n df columns  {df.columns}')
print(f'\n df:  {df.head()}')


 df columns Index(['director_list', 'duration', 'g_Action_IM', 'g_Adult', 'g_Adventure_IM',
       'g_Animation_IM', 'g_Biography', 'g_Comedy_IM', 'g_Crime_IM',
       'g_Documentary_IM', 'g_Drama_IM', 'g_Family_IM', 'g_Fantasy_IM',
       'g_History_IM', 'g_Horror_IM', 'g_Music_IM', 'g_Musical',
       'g_Mystery_IM', 'g_Romance_IM', 'g_Thriller_IM', 'g_War_IM',
       'g_Western_IM', 'release_year_IM', 'success'],
      dtype='object')

 directors cols Int64Index([0, 1], dtype='int64')

 length 56531

 length: 56531

 length df  56531

 df columns  Index(['duration', 'g_Action_IM', 'g_Adult', 'g_Adventure_IM',
       'g_Animation_IM', 'g_Biography', 'g_Comedy_IM', 'g_Crime_IM',
       'g_Documentary_IM', 'g_Drama_IM', 'g_Family_IM', 'g_Fantasy_IM',
       'g_History_IM', 'g_Horror_IM', 'g_Music_IM', 'g_Musical',
       'g_Mystery_IM', 'g_Romance_IM', 'g_Thriller_IM', 'g_War_IM',
       'g_Western_IM', 'release_year_IM', 'success', 'Director_1',
       'Director_2'],
      dtype='obj

In [6]:
# Calculate movies and success_rate by director
# attach success indicator to directors_cols, by original index
success = df["success"].copy()         # series of success indicators, index=original
director_success = pd.concat([directors_cols, success], axis=1, ignore_index=False, join="inner")
print(director_success.columns)
directors1 = director_success[["Director_1", "success"]].copy()
directors2 = director_success[["Director_2", "success"]].copy()
directors2.dropna(inplace=True)

# get counts by director for both directors 
d1=directors1.groupby("Director_1", as_index=True)["success"].agg(['count', 'sum'])  # index is director name
d1.rename(columns={'count':'count1', 'sum':'sum1'})
d2=directors2.groupby("Director_2", as_index=True)["success"].agg(['count', 'sum'])  # index is director name
d2.rename(columns={'count':'count2', 'sum':'sum2'})
print(f'\n directors1: count unique: {len(d1)}, index: {d1.index}, head: {d1.head()}')
print(f'\n directors2: count_unique: {len(d2)}, index: {d2.index}, head: {d2.head()}')

# then concatenate and combine matching director names
# join counts by index (director name), this will creates nulls from d2
# calculate total movies count and success rate for each director
# output d1 has Director as index, and columns movie_cnt  success_cnt  win_rate
print(d1.index, d2.index)
d1 = d1.join(d2, how="outer", rsuffix="2")
d1 = d1.replace(np.nan,0)
d1["movie_cnt"] = d1["count"] + d1["count2"]
d1["success_cnt"] = d1["sum"] + d1["sum2"] 
d1["win_rate"] = d1["success_cnt"] / d1["movie_cnt"]
d1.drop(["count", "sum", "count2", "sum2"], axis=1, inplace=True)
d1.index.name = "Director"
print(f'\n {d1.index}')
print(f'\n {d1.head()}')
print(f'\n {len(d1)}')

Index(['Director_1', 'Director_2', 'success'], dtype='object')

 directors1: count unique: 25254, index: Index([''Evil' Ted Smith', ''Philthy' Phil Phillips', 'A. Dean Bell',
       'A. Hans Scheirl', 'A. Jagadesh', 'A. Karunakaran', 'A. Mahadev',
       'A. Muthu', 'A. Rajdeep', 'A. Razak Mohaideen',
       ...
       'Özcan Deniz', 'Özer Kiziltan', 'Özgür Bakar', 'Özgür Sevimli',
       'Özgür Yildirim', 'Özhan Eren', 'Øystein Karlsen', 'Øystein Stene',
       'Ümit Cin Güven', 'Ümit Ünal'],
      dtype='object', name='Director_1', length=25254), head:                          count  sum
Director_1                         
'Evil' Ted Smith             1    0
'Philthy' Phil Phillips      1    0
A. Dean Bell                 2    0
A. Hans Scheirl              1    0
A. Jagadesh                  1    0

 directors2: count_unique: 2962, index: Index([' A. Blaine Miller', ' A.L. Vijay', ' A.S. Ravindra Babu', ' Aaron Lim',
       ' Aaron Moorhead', ' Aaron Nee', ' Aaron Osborne', ' Aaron 

In [7]:
df.columns

Index(['duration', 'g_Action_IM', 'g_Adult', 'g_Adventure_IM',
       'g_Animation_IM', 'g_Biography', 'g_Comedy_IM', 'g_Crime_IM',
       'g_Documentary_IM', 'g_Drama_IM', 'g_Family_IM', 'g_Fantasy_IM',
       'g_History_IM', 'g_Horror_IM', 'g_Music_IM', 'g_Musical',
       'g_Mystery_IM', 'g_Romance_IM', 'g_Thriller_IM', 'g_War_IM',
       'g_Western_IM', 'release_year_IM', 'success', 'Director_1',
       'Director_2'],
      dtype='object')

In [8]:
d1.columns

Index(['movie_cnt', 'success_cnt', 'win_rate'], dtype='object')

In [9]:
d1.index.name

'Director'

In [10]:
# Calculate director ratings
# there's got to be a better way to do this
# Get only Director_1 and Director_2 from df, then merge each column with d1 on Director
# the result is Director_1, movie_cnt, win_rate, Director_2, movie_cnt_2, win_rate_2, index is still from the original df

df_subset = df[["Director_1", "Director_2"]].copy()
df_subset.sort_values(by="Director_1")
d1.sort_values(by="Director")
df_subset = df_subset.merge(d1, how="left", left_on = "Director_1", right_on = "Director")
df_subset.sort_values(by="Director_2")
df_subset = df_subset.merge(d1, how="left", left_on = "Director_2", right_on = "Director", suffixes=(None, "_2"))
print(df_subset.columns)

# calculate the max movie_cnt and max win_rate across Director_1, Director_2 for each movie.
# prepare data for calculations
df_subset['movie_cnt_2'].fillna(0, inplace=True)
df_subset['success_cnt_2'].fillna(0, inplace=True)
df_subset['win_rate_2'].fillna(0, inplace=True)
print(df_subset.columns)


Index(['Director_1', 'Director_2', 'movie_cnt', 'success_cnt', 'win_rate',
       'movie_cnt_2', 'success_cnt_2', 'win_rate_2'],
      dtype='object')
Index(['Director_1', 'Director_2', 'movie_cnt', 'success_cnt', 'win_rate',
       'movie_cnt_2', 'success_cnt_2', 'win_rate_2'],
      dtype='object')


In [11]:
# For each movie determine the highest director dating
# Where there are 2 directors, take the max win_rate and the max movies_cnt for the movie score   
# columns are in this order:  0:Director_1  1:Director_2  2:movie_cnt  3:success_cnt 4:win_rate  5:movie_cnt_2  
# 6:success_cnt_2  7:win_rate_2  
# columns in director_ratings: 0:director_score_cnt 1:director_score_rate

directors_ratings = pd.DataFrame(index=df.index.copy())  # initialize df to hold directors ratings, index=original
       
for index, row  in df_subset.iterrows():
    if row[5] is not None and row[5] > row[2]:
        directors_ratings.loc[index, 0] = row[5]
    else:
        directors_ratings.loc[index, 0] = row[2]
    if row[7] is not None and row[7] > row[4]:
        directors_ratings.loc[index, 1] = row[7]
    else:
        directors_ratings.loc[index, 1] = row[4]

print(df_subset.columns)
directors_ratings.columns = ("director_score_cnt", "director_score_rate")
print(directors_ratings.head(20))

# add director ratings to df on original index
df = pd.concat([df, directors_ratings], axis=1, ignore_index=False, join="inner")
print(df.columns)
print(len(df))

Index(['Director_1', 'Director_2', 'movie_cnt', 'success_cnt', 'win_rate',
       'movie_cnt_2', 'success_cnt_2', 'win_rate_2'],
      dtype='object')
    director_score_cnt  director_score_rate
0                  1.0             0.000000
1                  1.0             0.000000
2                  1.0             1.000000
3                  1.0             1.000000
4                  2.0             1.000000
5                  1.0             0.000000
6                  2.0             1.000000
7                  1.0             1.000000
9                  4.0             0.000000
10                11.0             0.545455
11                11.0             0.272727
12                 3.0             0.333333
14                 9.0             0.666667
15                 4.0             1.000000
16                 6.0             0.500000
17                 4.0             0.000000
18                 4.0             1.000000
19                 1.0             0.000000
20           

In [12]:
print(df.columns)
df.drop(["Director_1", "Director_2"], axis=1, inplace=True)

Index(['duration', 'g_Action_IM', 'g_Adult', 'g_Adventure_IM',
       'g_Animation_IM', 'g_Biography', 'g_Comedy_IM', 'g_Crime_IM',
       'g_Documentary_IM', 'g_Drama_IM', 'g_Family_IM', 'g_Fantasy_IM',
       'g_History_IM', 'g_Horror_IM', 'g_Music_IM', 'g_Musical',
       'g_Mystery_IM', 'g_Romance_IM', 'g_Thriller_IM', 'g_War_IM',
       'g_Western_IM', 'release_year_IM', 'success', 'Director_1',
       'Director_2', 'director_score_cnt', 'director_score_rate'],
      dtype='object')


In [13]:
# Drop rows where director data is missing
df.dropna(axis=1, how="any", inplace=True)

In [14]:
df.isnull().sum()

duration            0
g_Action_IM         0
g_Adult             0
g_Adventure_IM      0
g_Animation_IM      0
g_Biography         0
g_Comedy_IM         0
g_Crime_IM          0
g_Documentary_IM    0
g_Drama_IM          0
g_Family_IM         0
g_Fantasy_IM        0
g_History_IM        0
g_Horror_IM         0
g_Music_IM          0
g_Musical           0
g_Mystery_IM        0
g_Romance_IM        0
g_Thriller_IM       0
g_War_IM            0
g_Western_IM        0
release_year_IM     0
success             0
dtype: int64

In [15]:
df.index

Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     9,
               10,
            ...
            70510, 70511, 70512, 70513, 70514, 70516, 70517, 70520, 70523,
            70525],
           dtype='int64', length=56531)

In [16]:
# old code - calculates only number of movies

# pd.set_option("max_rows", None)
# # extract info from director feature
# # there are 52,754 movies with 1 director, and 3,777 movies with 2 directors
# # there are 26,593 unique values of "director" (counting a pair of directors as a single value)

# # hypothesis: more movies associated with a director indicates the director is considered more worthy of investment, 
# # so considered by investors to be more successful?
# # 58% of directors have 1 movie
# # 17% of directors have 2 movies
# #  7% of directors have 3 movies

# # hypothesis: can calculate success ratio for each director and use as a feature in the model

# # Issue = names with a single quote are inside double quotes
# # in merged file looks like: Joe D'Amato""]"
# # see how this is handled in database
# # may need to modify the search based on the contents of the database

# directors = df["director_list"] # directors in a string
# director_list = directors.str.split(',')   # splits string into words outputs to a list
# director_cds = directors.str.split(',', expand=True)   # outputs split words to separate columns
# director_cds.columns=["A", "B"]
# directors1 = director_cds["A"].str.replace('[', '')
# directors1 = directors1.str.replace(']', '')
# directors1 = directors1.str.replace("'", "")
# directors2 = director_cds["B"].str.replace(']', '')
# directors2 = directors2.str.replace("'", "")
# directors1 = directors1.str.strip()
# directors2 = directors2.str.strip().dropna()

# directors1_counts = directors1.value_counts()  # index is now director name
# directors2_counts = directors2.value_counts()

# # print(f'\n directors1: count unique: {len(directors1_counts)}, head: {directors1_counts.head()}')
# # print(f'\n directors2: count_unique: {len(directors2_counts)}, head: {directors2_counts.head()}')

# director_counts = pd.DataFrame()
# director_counts["A"]=directors1_counts  # A is now numpy.int64
# director_counts["B"]=directors2_counts  # B is now numpy.float64
# director_counts["B"] = director_counts["B"].replace(np.nan,0)
# director_counts["total_count"] = director_counts["A"] + director_counts["B"]

# print(director_counts.describe())
# print(director_counts.value_counts())

# # Calculate personal success rate for each director

# Preprocess data

In [17]:
if dataset == 'S':  # create encoded and scaled dataset
    # Encode categorical variables
    # Create a OneHotEncoder instance
    enc = OneHotEncoder(sparse=False)
    # Fit and transform the OneHotEncoder using the categorical variable list
    encode_df =pd.DataFrame(enc.fit_transform(df[cat_vars]))
    # Add the encoded variable names to the DataFrame
    encode_df.columns =enc.get_feature_names(cat_vars)
    # Merge one-hot encoded features and drop the originals
    df =df.merge(encode_df,left_index=True,right_index=True)
    df.drop(cat_vars,axis=1, inplace=True)

## Create Sample Datasets

In [18]:
# Split our preprocessed data into our features and target arrays
y =df["success"].values
X =df.drop(["success"],1).values

# Split the preprocessed data into a training and testing dataset
X_train,X_test,y_train,y_test =train_test_split(X,y,stratify=y,random_state=67)

## Scale data if needed

In [19]:
if dataset == 'S':  # create encoded and scaled dataset
    X_scaler = StandardScaler()
    # Fit the scaler
    X_scaler.fit(X_train)
    # Scale the data
    X_train =X_scaler.transform(X_train)
    X_test =X_scaler.transform(X_test)

## Define the learning model

In [20]:
if model_type == "LR":
    log_classifier =LogisticRegression(solver="lbfgs",max_iter=200)

    # Train the model
    log_classifier.fit(X_train,y_train)

    # Evaluate the model
    y_pred =log_classifier.predict(X_test)
    print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Logistic regression model accuracy: 0.832


In [21]:
if model_type == "SV":
    # Create the SVM model
    svm =SVC(kernel='linear')
    print('created svm')
    # Train the model
    svm.fit(X_train,y_train)
    print('fit svm')
    # Evaluate the model
    y_pred =svm.predict(X_test_scaled)
    print('predicted svm')
    print(f" SVM model accuracy: {accuracy_score(y_test,y_pred):.3f}")

In [22]:
if model_type == "NN":
    # Create the Keras Sequential model
    # use validation_split argument to set the proportion of the dataset to reserve for validation
    number_input_features =len(X_train[0])
    hidden_nodes_layer1 = 5
    hidden_nodes_layer2 = 3
    nn =tf.keras.models.Sequential()
    # layers
    nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1,activation="tanh",input_dim=number_input_features))
    nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2,activation="tanh"))

    # Add the output layer that uses a probability activation function
    nn.add(tf.keras.layers.Dense(units=1,activation="sigmoid"))
    
    # Check the structure of the Sequential model
    nn.summary()

    # Compile the Sequential model together and customize metrics
    nn.compile(loss="binary_crossentropy",optimizer="adam",metrics=["accuracy"])

    # Fit the model to the training data
    fit_model =nn.fit(X_train,y_train,validation_split=.3, epochs=15)

    # Evaluate the model using the test data
    model_loss,model_accuracy =nn.evaluate(X_test,y_test,verbose=2)
    print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")
    
    # Create learning curve for both train and validation datasets to evaluate under- and over-fitting

    history_dict=fit_model.history

    loss_values = history_dict['loss']
    val_loss_values = history_dict['val_loss']
    accuracy = history_dict['accuracy']
    val_accuracy = history_dict['val_accuracy']

    epochs = range(1, len(loss_values) + 1)
    fig, ax = plt.subplots(1, 2, figsize=(14, 4))
    #
    # Plot the model accuracy vs Epochs
    #
    ax[0].plot(epochs, accuracy, 'b', label='Training accuracy')
    ax[0].plot(epochs, val_accuracy, 'g', label='Validation accuracy')
    ax[0].set_title('Training & Validation Accuracy', fontsize=16)
    ax[0].set_xlabel('Epochs', fontsize=16)
    ax[0].set_ylabel('Accuracy', fontsize=16)
    ax[0].legend()
    #
    # Plot the loss vs Epochs
    #
    ax[1].plot(epochs, loss_values, 'b', label='Training loss')
    ax[1].plot(epochs, val_loss_values, 'g', label='Validation loss')
    ax[1].set_title('Training & Validation Loss', fontsize=16)
    ax[1].set_xlabel('Epochs', fontsize=16)
    ax[1].set_ylabel('Loss', fontsize=16)
    ax[1].legend()

    plt.show()