In [277]:
import os
import numpy as np
import pandas as pd
from scipy import optimize
from matplotlib import pyplot as plt

In [278]:
md = pd.read_csv('clean_metadata.csv')
md.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5368 entries, 0 to 5367
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            5368 non-null   int64  
 1   budget                5368 non-null   float64
 2   genres                5368 non-null   object 
 3   id                    5368 non-null   int64  
 4   original_language     5368 non-null   object 
 5   overview              5368 non-null   object 
 6   popularity            5368 non-null   float64
 7   production_companies  5368 non-null   object 
 8   production_countries  5368 non-null   object 
 9   release_date          5368 non-null   object 
 10  revenue               5368 non-null   float64
 11  runtime               5368 non-null   float64
 12  spoken_languages      5368 non-null   object 
 13  title                 5368 non-null   object 
 14  vote_average          5368 non-null   float64
 15  vote_count           

Now, we want to take out the budget, runtime, and language data and plot those data with respect to revenue. 

In [279]:
rg1 = md.copy(deep=True)
column_list = [0, 2, 3, 5, 6, 7, 8, 9, 12, 13, 14, 15]
rg1 = rg1.drop(columns=rg1.columns[column_list])
column_order = ['revenue', 'budget', 'original_language', 'runtime']
rg1 = rg1.reindex(columns=column_order)

In [280]:
rg1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5368 entries, 0 to 5367
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   revenue            5368 non-null   float64
 1   budget             5368 non-null   float64
 2   original_language  5368 non-null   object 
 3   runtime            5368 non-null   float64
dtypes: float64(3), object(1)
memory usage: 167.9+ KB


We now perform some feature engineering, specifically to expand the break apart the original_language column into column specifying the exact language.

In [281]:
rg1['en'] = rg1['original_language'].apply(lambda x: 1 if x=='en' else 0)
rg1['fr'] = rg1['original_language'].apply(lambda x: 1 if x=='fr' else 0)
rg1['cn'] = rg1['original_language'].apply(lambda x: 1 if x=='cn' else 0)
rg1['ru'] = rg1['original_language'].apply(lambda x: 1 if x=='ru' else 0)
rg1['zh'] = rg1['original_language'].apply(lambda x: 1 if x=='zh' else 0)
rg1['es'] = rg1['original_language'].apply(lambda x: 1 if x=='es' else 0)
rg1['sv'] = rg1['original_language'].apply(lambda x: 1 if x=='sv' else 0)
rg1['de'] = rg1['original_language'].apply(lambda x: 1 if x=='de' else 0)
rg1['bn'] = rg1['original_language'].apply(lambda x: 1 if x=='bn' else 0)
rg1['ja'] = rg1['original_language'].apply(lambda x: 1 if x=='ja' else 0)
rg1['ro'] = rg1['original_language'].apply(lambda x: 1 if x=='ro' else 0)
rg1['it'] = rg1['original_language'].apply(lambda x: 1 if x=='it' else 0)
rg1['da'] = rg1['original_language'].apply(lambda x: 1 if x=='da' else 0)
rg1['cs'] = rg1['original_language'].apply(lambda x: 1 if x=='cs' else 0)
rg1['pt'] = rg1['original_language'].apply(lambda x: 1 if x=='pt' else 0)
rg1['fa'] = rg1['original_language'].apply(lambda x: 1 if x=='fa' else 0)
rg1['ko'] = rg1['original_language'].apply(lambda x: 1 if x=='ko' else 0)
rg1['hi'] = rg1['original_language'].apply(lambda x: 1 if x=='hi' else 0)
rg1['el'] = rg1['original_language'].apply(lambda x: 1 if x=='el' else 0)
rg1['pl'] = rg1['original_language'].apply(lambda x: 1 if x=='pl' else 0)
rg1['hu'] = rg1['original_language'].apply(lambda x: 1 if x=='hu' else 0)
rg1['th'] = rg1['original_language'].apply(lambda x: 1 if x=='th' else 0)
rg1['tr'] = rg1['original_language'].apply(lambda x: 1 if x=='tr' else 0)
rg1['bm'] = rg1['original_language'].apply(lambda x: 1 if x=='bm' else 0)
rg1['af'] = rg1['original_language'].apply(lambda x: 1 if x=='af' else 0)
rg1['fi'] = rg1['original_language'].apply(lambda x: 1 if x=='fi' else 0)
rg1['nl'] = rg1['original_language'].apply(lambda x: 1 if x=='nl' else 0)
rg1['vi'] = rg1['original_language'].apply(lambda x: 1 if x=='vi' else 0)
rg1['he'] = rg1['original_language'].apply(lambda x: 1 if x=='he' else 0)
rg1['no'] = rg1['original_language'].apply(lambda x: 1 if x=='no' else 0)
rg1['ta'] = rg1['original_language'].apply(lambda x: 1 if x=='ta' else 0)
rg1['sr'] = rg1['original_language'].apply(lambda x: 1 if x=='sr' else 0)
rg1['nb'] = rg1['original_language'].apply(lambda x: 1 if x=='nb' else 0)
rg1['ca'] = rg1['original_language'].apply(lambda x: 1 if x=='ca' else 0)
rg1['id'] = rg1['original_language'].apply(lambda x: 1 if x=='id' else 0)
rg1['ar'] = rg1['original_language'].apply(lambda x: 1 if x=='ar' else 0)
rg1['ml'] = rg1['original_language'].apply(lambda x: 1 if x=='ml' else 0)
rg1['ka'] = rg1['original_language'].apply(lambda x: 1 if x=='ka' else 0)
rg1['kn'] = rg1['original_language'].apply(lambda x: 1 if x=='kn' else 0)
rg1['is'] = rg1['original_language'].apply(lambda x: 1 if x=='is' else 0)
rg1['te'] = rg1['original_language'].apply(lambda x: 1 if x=='te' else 0)
rg1['mr'] = rg1['original_language'].apply(lambda x: 1 if x=='mr' else 0)
rg1['ur'] = rg1['original_language'].apply(lambda x: 1 if x=='ur' else 0)
rg1 = rg1.drop(['original_language'], axis=1)

In [282]:
rg1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5368 entries, 0 to 5367
Data columns (total 46 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   revenue  5368 non-null   float64
 1   budget   5368 non-null   float64
 2   runtime  5368 non-null   float64
 3   en       5368 non-null   int64  
 4   fr       5368 non-null   int64  
 5   cn       5368 non-null   int64  
 6   ru       5368 non-null   int64  
 7   zh       5368 non-null   int64  
 8   es       5368 non-null   int64  
 9   sv       5368 non-null   int64  
 10  de       5368 non-null   int64  
 11  bn       5368 non-null   int64  
 12  ja       5368 non-null   int64  
 13  ro       5368 non-null   int64  
 14  it       5368 non-null   int64  
 15  da       5368 non-null   int64  
 16  cs       5368 non-null   int64  
 17  pt       5368 non-null   int64  
 18  fa       5368 non-null   int64  
 19  ko       5368 non-null   int64  
 20  hi       5368 non-null   int64  
 21  el       5368 

Now, we convert the pandas dataframe to a numpy array.

In [283]:
data = rg1.to_numpy()

In [284]:
X, y = data[:, 1:], data[:, 0]
m, n = X.shape

In [285]:
print(X)

[[3.00e+07 8.10e+01 1.00e+00 ... 0.00e+00 0.00e+00 0.00e+00]
 [6.50e+07 1.04e+02 1.00e+00 ... 0.00e+00 0.00e+00 0.00e+00]
 [1.60e+07 1.27e+02 1.00e+00 ... 0.00e+00 0.00e+00 0.00e+00]
 ...
 [8.00e+05 1.00e+02 1.00e+00 ... 0.00e+00 0.00e+00 0.00e+00]
 [2.00e+06 1.07e+02 1.00e+00 ... 0.00e+00 0.00e+00 0.00e+00]
 [5.00e+06 9.10e+01 0.00e+00 ... 0.00e+00 0.00e+00 0.00e+00]]


In [286]:
print(y)

[3.73554033e+08 2.62797249e+08 8.14521560e+07 ... 1.32861200e+06
 1.26879300e+06 1.41300000e+06]


Now, we normalize the data in X so that gradient descent converges faster (and so that data with higher magnitude is not weighted more heavily). 

In [287]:
def  featureNormalize(X):
    """
    Normalizes the features in X. returns a normalized version of X where
    the mean value of each feature is 0 and the standard deviation
    is 1. This is often a good preprocessing step to do when working with
    learning algorithms.
    
    Parameters
    ----------
    X : array_like
        The dataset of shape (m x n).
    
    Returns
    -------
    X_norm : array_like
        The normalized dataset of shape (m x n).
    
    """
    X_norm = X.copy()
    mu = np.zeros(X.shape[1])
    sigma = np.zeros(X.shape[1])

    for i in range(X.shape[1]):
        a = X[:, i]
        m = np.mean(a)
        mu[i] = np.mean(X[:, i])
        sigma[i] = np.std(X[:, i])
        for j in range(X.shape[0]):
            X_norm[j, i] = (X[j, i] - mu[i]) / sigma[i]

    return X_norm, mu, sigma

In [288]:
X_norm, mu, sigma = featureNormalize(X)
print('Computed mean:', mu)
print('Computed standard deviation:', sigma)

Computed mean: [3.11495134e+07 1.09989940e+02 8.94001490e-01 1.63934426e-02
 2.79433681e-03 1.21087928e-02 5.40238450e-03 7.07898659e-03
 1.30402385e-03 4.09836066e-03 0.00000000e+00 7.26527571e-03
 7.45156483e-04 5.21609538e-03 2.04918033e-03 0.00000000e+00
 1.30402385e-03 1.86289121e-04 4.65722802e-03 1.82563338e-02
 1.86289121e-04 7.45156483e-04 3.72578241e-04 3.72578241e-04
 7.45156483e-04 1.86289121e-04 1.86289121e-04 5.58867362e-04
 1.30402385e-03 1.86289121e-04 7.45156483e-04 7.45156483e-04
 4.84351714e-03 5.58867362e-04 1.86289121e-04 1.86289121e-04
 5.58867362e-04 0.00000000e+00 2.04918033e-03 0.00000000e+00
 1.86289121e-04 1.86289121e-04 1.49031297e-03 1.86289121e-04
 3.72578241e-04]
Computed standard deviation: [4.01907264e+07 2.16427067e+01 3.07835712e-01 1.26983061e-01
 5.27875790e-02 1.09371706e-01 7.33021060e-02 8.38383834e-02
 3.60877177e-02 6.38871200e-02 0.00000000e+00 8.49263886e-02
 2.72873822e-02 7.20339346e-02 4.52214682e-02 0.00000000e+00
 3.60877177e-02 1.364750



We divide up the features into a training set and a testing set. 

In [289]:
eighty = int(np.floor(0.8 * m))
indices = np.random.permutation(m)
train_idx, test_idx = indices[:eighty], indices[eighty:]
X_train, X_test = X[train_idx, :], X[test_idx, :]
y_train, y_test = y[train_idx], y[test_idx]

We can now add the column of 1s at the beginning, corresponding to the intercept feature $x_0 = 1$.

In [290]:
X_train_aug = np.concatenate([np.ones((y_train.size, 1)), X_train], axis=1)
X_test_aug = np.concatenate([np.ones((y_test.size, 1)), X_test], axis=1)

With our normalized features and our corresponding revenue data, we can run a fairly simple regularized linear regression. First, we create a cost function that returns both the cost J and the gradient grad of our model given some parameters. Then we use scipy.optimize to run the regression.

In [291]:
def linearRegCostFunction(X, y, theta, lambda_=0.0):
    """
    Compute cost and gradient for regularized linear regression 
    with multiple variables. Computes the cost of using theta as
    the parameter for linear regression to fit the data points in X and y. 
    
    Parameters
    ----------
    X : array_like
        The dataset. Matrix with shape (m x n + 1) where m is the 
        total number of examples, and n is the number of features 
        before adding the bias term.
    
    y : array_like
        The functions values at each datapoint. A vector of
        shape (m, ).
    
    theta : array_like
        The parameters for linear regression. A vector of shape (n+1,).
    
    lambda_ : float, optional
        The regularization parameter.
    
    Returns
    -------
    J : float
        The computed cost function. 
    
    grad : array_like
        The value of the cost function gradient w.r.t theta. 
        A vector of shape (n+1, ).
    """
    m = y.size # number of training examples

    J = 0
    grad = np.zeros(theta.shape)

    J_1 = (1. / (2. * m)) * np.sum((np.dot(X, theta) - y)**2)
    J_reg = (lambda_ / (2. * m)) * np.sum(theta[1:]**2)
    J = J_1 + J_reg
    
    grad_1 = (1. / m) * np.dot(X.T, (np.dot(X, theta) - y))
    grad_reg = (lambda_ / m) * theta
    grad_reg[0] = 0
    grad = grad_1 + grad_reg
    
    return J, grad

In [292]:
def trainLinearReg(linearRegCostFunction, X, y, lambda_=0.0, maxiter=200):
    """
    Trains linear regression using scipy's optimize.minimize.

    Parameters
    ----------
    X : array_like
        The dataset with shape (m x n+1). The bias term is assumed to be concatenated.

    y : array_like
        Function values at each datapoint. A vector of shape (m,).

    lambda_ : float, optional
        The regularization parameter.

    maxiter : int, optional
        Maximum number of iteration for the optimization algorithm.

    Returns
    -------
    theta : array_like
        The parameters for linear regression. This is a vector of shape (n+1,).
    """
    # Initialize Theta
    initial_theta = np.zeros(X.shape[1])

    # Create "short hand" for the cost function to be minimized
    costFunction = lambda t: linearRegCostFunction(X, y, t, lambda_)

    # Now, costFunction is a function that takes in only one argument
    options = {'maxiter': maxiter}

    # Minimize using scipy
    res = optimize.minimize(costFunction, initial_theta, jac=True, method='TNC', options=options)
    return res.x, res.fun

In [293]:
theta, cost = trainLinearReg(linearRegCostFunction, X_train_aug, y_train, lambda_=0)
print(theta)
print(cost)

[-4.32719705e+07  2.96137706e+00  3.72556180e+05  1.41817491e+06
 -1.01878675e+07 -2.44609652e+06 -4.84330675e+06  6.40466913e+05
  3.16686512e+06  4.11087717e+06 -1.15851872e+07  0.00000000e+00
  1.37579139e+07 -7.11489389e+05 -4.12426202e+06 -2.31068023e+06
  0.00000000e+00 -6.55467088e+06  0.00000000e+00 -3.40095319e+05
 -5.02675209e+06  0.00000000e+00  3.17354927e+05 -3.40357087e+05
 -1.99990129e+04  1.03212282e+06 -2.82722124e+05  3.11441049e+05
 -2.88215936e+04 -4.36165907e+05 -4.37482054e+05  7.76921743e+05
 -1.67411941e+06 -7.31366088e+06  1.58922424e+05 -1.37261312e+05
 -5.58674870e+05 -8.75044098e+05  0.00000000e+00 -2.34969166e+06
  0.00000000e+00 -3.08417966e+05  3.89638564e+05  4.07117203e+06
 -6.62263971e+05 -5.56034770e+04]
6574514404333482.0


We plot the learning curve of this model to assess its overall accuracy.

In [275]:
def learningCurve(X, y, Xval, yval, lambda_=0):
    """
    Generates the train and cross validation set errors needed to plot a learning curve
    returns the train and cross validation set errors for a learning curve. 
    
    In this function, you will compute the train and test errors for
    dataset sizes from 1 up to m. In practice, when working with larger
    datasets, you might want to do this in larger intervals.
    
    Parameters
    ----------
    X : array_like
        The training dataset. Matrix with shape (m x n + 1) where m is the 
        total number of examples, and n is the number of features 
        before adding the bias term.
    
    y : array_like
        The functions values at each training datapoint. A vector of
        shape (m, ).
    
    Xval : array_like
        The validation dataset. Matrix with shape (m_val x n + 1) where m is the 
        total number of examples, and n is the number of features 
        before adding the bias term.
    
    yval : array_like
        The functions values at each validation datapoint. A vector of
        shape (m_val, ).
    
    lambda_ : float, optional
        The regularization parameter.
    
    Returns
    -------
    error_train : array_like
        A vector of shape m. error_train[i] contains the training error for
        i examples.
    error_val : array_like
        A vecotr of shape m. error_val[i] contains the validation error for
        i training examples. 
    """
    m = y.size

    error_train = np.zeros(m)
    error_val   = np.zeros(m)

    for i in range(1, m+1, 50):
        # compute theta over i training examples
        theta = trainLinearReg(linearRegCostFunction, X[:i, :], y[:i], lambda_)
        # compute train error
        error_train[i-1] = (1. / (2. * i)) * np.sum((np.dot(X[:i, :], theta) - y[:i])**2)
        # compute cross val error
        error_val[i-1] = (1. / (2. * yval.size)) * np.sum((np.dot(Xval, theta) - yval)**2)
    return error_train, error_val

In [276]:
error_train, error_val = learningCurve(X_train_aug, y_train, X_test_aug, y_test, lambda_=0)

print('# Training Examples\tTrain Error\tCross Validation Error')
for i in range(m):
    print('  \t%d\t\t%f\t%f' % (i+1, error_train[i], error_val[i]))

# Training Examples	Train Error	Cross Validation Error
  	1		0.000000	14120666497599026.000000
  	2		0.000000	0.000000
  	3		0.000000	0.000000
  	4		0.000000	0.000000
  	5		0.000000	0.000000
  	6		0.000000	0.000000
  	7		0.000000	0.000000
  	8		0.000000	0.000000
  	9		0.000000	0.000000
  	10		0.000000	0.000000
  	11		0.000000	0.000000
  	12		0.000000	0.000000
  	13		0.000000	0.000000
  	14		0.000000	0.000000
  	15		0.000000	0.000000
  	16		0.000000	0.000000
  	17		0.000000	0.000000
  	18		0.000000	0.000000
  	19		0.000000	0.000000
  	20		0.000000	0.000000
  	21		0.000000	0.000000
  	22		0.000000	0.000000
  	23		0.000000	0.000000
  	24		0.000000	0.000000
  	25		0.000000	0.000000
  	26		0.000000	0.000000
  	27		0.000000	0.000000
  	28		0.000000	0.000000
  	29		0.000000	0.000000
  	30		0.000000	0.000000
  	31		0.000000	0.000000
  	32		0.000000	0.000000
  	33		0.000000	0.000000
  	34		0.000000	0.000000
  	35		0.000000	0.000000
  	36		0.000000	0.000000
  	37		0.000000	0.000000
  	38		0.0000

  	1533		0.000000	0.000000
  	1534		0.000000	0.000000
  	1535		0.000000	0.000000
  	1536		0.000000	0.000000
  	1537		0.000000	0.000000
  	1538		0.000000	0.000000
  	1539		0.000000	0.000000
  	1540		0.000000	0.000000
  	1541		0.000000	0.000000
  	1542		0.000000	0.000000
  	1543		0.000000	0.000000
  	1544		0.000000	0.000000
  	1545		0.000000	0.000000
  	1546		0.000000	0.000000
  	1547		0.000000	0.000000
  	1548		0.000000	0.000000
  	1549		0.000000	0.000000
  	1550		0.000000	0.000000
  	1551		7919577171769867.000000	6175970498800615.000000
  	1552		0.000000	0.000000
  	1553		0.000000	0.000000
  	1554		0.000000	0.000000
  	1555		0.000000	0.000000
  	1556		0.000000	0.000000
  	1557		0.000000	0.000000
  	1558		0.000000	0.000000
  	1559		0.000000	0.000000
  	1560		0.000000	0.000000
  	1561		0.000000	0.000000
  	1562		0.000000	0.000000
  	1563		0.000000	0.000000
  	1564		0.000000	0.000000
  	1565		0.000000	0.000000
  	1566		0.000000	0.000000
  	1567		0.000000	0.000000
  	1568		0.000000	0.00000

  	3166		0.000000	0.000000
  	3167		0.000000	0.000000
  	3168		0.000000	0.000000
  	3169		0.000000	0.000000
  	3170		0.000000	0.000000
  	3171		0.000000	0.000000
  	3172		0.000000	0.000000
  	3173		0.000000	0.000000
  	3174		0.000000	0.000000
  	3175		0.000000	0.000000
  	3176		0.000000	0.000000
  	3177		0.000000	0.000000
  	3178		0.000000	0.000000
  	3179		0.000000	0.000000
  	3180		0.000000	0.000000
  	3181		0.000000	0.000000
  	3182		0.000000	0.000000
  	3183		0.000000	0.000000
  	3184		0.000000	0.000000
  	3185		0.000000	0.000000
  	3186		0.000000	0.000000
  	3187		0.000000	0.000000
  	3188		0.000000	0.000000
  	3189		0.000000	0.000000
  	3190		0.000000	0.000000
  	3191		0.000000	0.000000
  	3192		0.000000	0.000000
  	3193		0.000000	0.000000
  	3194		0.000000	0.000000
  	3195		0.000000	0.000000
  	3196		0.000000	0.000000
  	3197		0.000000	0.000000
  	3198		0.000000	0.000000
  	3199		0.000000	0.000000
  	3200		0.000000	0.000000
  	3201		6722731760192117.000000	6103884042973016.00000

IndexError: index 4294 is out of bounds for axis 0 with size 4294