In [143]:
import os
import numpy as np
import pandas as pd
from scipy import optimize
from matplotlib import pyplot as plt

In [144]:
md = pd.read_csv('clean_metadata.csv')
md.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5368 entries, 0 to 5367
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            5368 non-null   int64  
 1   budget                5368 non-null   float64
 2   genres                5368 non-null   object 
 3   id                    5368 non-null   int64  
 4   original_language     5368 non-null   object 
 5   overview              5368 non-null   object 
 6   popularity            5368 non-null   float64
 7   production_companies  5368 non-null   object 
 8   production_countries  5368 non-null   object 
 9   release_date          5368 non-null   object 
 10  revenue               5368 non-null   float64
 11  runtime               5368 non-null   float64
 12  spoken_languages      5368 non-null   object 
 13  title                 5368 non-null   object 
 14  vote_average          5368 non-null   float64
 15  vote_count           

Now, we want to take out the budget, runtime, and language data and plot those data with respect to revenue. 

In [145]:
rg1 = md.copy(deep=True)
column_list = [0, 2, 3, 5, 6, 7, 8, 9, 12, 13, 14, 15]
rg1 = rg1.drop(columns=rg1.columns[column_list])
column_order = ['revenue', 'budget', 'original_language', 'runtime']
rg1 = rg1.reindex(columns=column_order)

In [146]:
rg1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5368 entries, 0 to 5367
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   revenue            5368 non-null   float64
 1   budget             5368 non-null   float64
 2   original_language  5368 non-null   object 
 3   runtime            5368 non-null   float64
dtypes: float64(3), object(1)
memory usage: 167.9+ KB


We now perform some feature engineering, specifically to expand the break apart the original_language column into column specifying the exact language.

In [147]:
rg1['en'] = rg1['original_language'].apply(lambda x: 1 if x=='en' else 0)
rg1['fr'] = rg1['original_language'].apply(lambda x: 1 if x=='fr' else 0)
rg1['cn'] = rg1['original_language'].apply(lambda x: 1 if x=='cn' else 0)
rg1['ru'] = rg1['original_language'].apply(lambda x: 1 if x=='ru' else 0)
rg1['zh'] = rg1['original_language'].apply(lambda x: 1 if x=='zh' else 0)
rg1['es'] = rg1['original_language'].apply(lambda x: 1 if x=='es' else 0)
rg1['sv'] = rg1['original_language'].apply(lambda x: 1 if x=='sv' else 0)
rg1['de'] = rg1['original_language'].apply(lambda x: 1 if x=='de' else 0)
rg1['bn'] = rg1['original_language'].apply(lambda x: 1 if x=='bn' else 0)
rg1['ja'] = rg1['original_language'].apply(lambda x: 1 if x=='ja' else 0)
rg1['ro'] = rg1['original_language'].apply(lambda x: 1 if x=='ro' else 0)
rg1['it'] = rg1['original_language'].apply(lambda x: 1 if x=='it' else 0)
rg1['da'] = rg1['original_language'].apply(lambda x: 1 if x=='da' else 0)
rg1['cs'] = rg1['original_language'].apply(lambda x: 1 if x=='cs' else 0)
rg1['pt'] = rg1['original_language'].apply(lambda x: 1 if x=='pt' else 0)
rg1['fa'] = rg1['original_language'].apply(lambda x: 1 if x=='fa' else 0)
rg1['ko'] = rg1['original_language'].apply(lambda x: 1 if x=='ko' else 0)
rg1['hi'] = rg1['original_language'].apply(lambda x: 1 if x=='hi' else 0)
rg1['el'] = rg1['original_language'].apply(lambda x: 1 if x=='el' else 0)
rg1['pl'] = rg1['original_language'].apply(lambda x: 1 if x=='pl' else 0)
rg1['hu'] = rg1['original_language'].apply(lambda x: 1 if x=='hu' else 0)
rg1['th'] = rg1['original_language'].apply(lambda x: 1 if x=='th' else 0)
rg1['tr'] = rg1['original_language'].apply(lambda x: 1 if x=='tr' else 0)
rg1['bm'] = rg1['original_language'].apply(lambda x: 1 if x=='bm' else 0)
rg1['af'] = rg1['original_language'].apply(lambda x: 1 if x=='af' else 0)
rg1['fi'] = rg1['original_language'].apply(lambda x: 1 if x=='fi' else 0)
rg1['nl'] = rg1['original_language'].apply(lambda x: 1 if x=='nl' else 0)
rg1['vi'] = rg1['original_language'].apply(lambda x: 1 if x=='vi' else 0)
rg1['he'] = rg1['original_language'].apply(lambda x: 1 if x=='he' else 0)
rg1['no'] = rg1['original_language'].apply(lambda x: 1 if x=='no' else 0)
rg1['ta'] = rg1['original_language'].apply(lambda x: 1 if x=='ta' else 0)
rg1['sr'] = rg1['original_language'].apply(lambda x: 1 if x=='sr' else 0)
rg1['nb'] = rg1['original_language'].apply(lambda x: 1 if x=='nb' else 0)
rg1['ca'] = rg1['original_language'].apply(lambda x: 1 if x=='ca' else 0)
rg1['id'] = rg1['original_language'].apply(lambda x: 1 if x=='id' else 0)
rg1['ar'] = rg1['original_language'].apply(lambda x: 1 if x=='ar' else 0)
rg1['ml'] = rg1['original_language'].apply(lambda x: 1 if x=='ml' else 0)
rg1['ka'] = rg1['original_language'].apply(lambda x: 1 if x=='ka' else 0)
rg1['kn'] = rg1['original_language'].apply(lambda x: 1 if x=='kn' else 0)
rg1['is'] = rg1['original_language'].apply(lambda x: 1 if x=='is' else 0)
rg1['te'] = rg1['original_language'].apply(lambda x: 1 if x=='te' else 0)
rg1['mr'] = rg1['original_language'].apply(lambda x: 1 if x=='mr' else 0)
rg1['ur'] = rg1['original_language'].apply(lambda x: 1 if x=='ur' else 0)
rg1 = rg1.drop(['original_language'], axis=1)

In [148]:
rg1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5368 entries, 0 to 5367
Data columns (total 46 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   revenue  5368 non-null   float64
 1   budget   5368 non-null   float64
 2   runtime  5368 non-null   float64
 3   en       5368 non-null   int64  
 4   fr       5368 non-null   int64  
 5   cn       5368 non-null   int64  
 6   ru       5368 non-null   int64  
 7   zh       5368 non-null   int64  
 8   es       5368 non-null   int64  
 9   sv       5368 non-null   int64  
 10  de       5368 non-null   int64  
 11  bn       5368 non-null   int64  
 12  ja       5368 non-null   int64  
 13  ro       5368 non-null   int64  
 14  it       5368 non-null   int64  
 15  da       5368 non-null   int64  
 16  cs       5368 non-null   int64  
 17  pt       5368 non-null   int64  
 18  fa       5368 non-null   int64  
 19  ko       5368 non-null   int64  
 20  hi       5368 non-null   int64  
 21  el       5368 

Now, we convert the pandas dataframe to a numpy array.

In [149]:
data = rg1.to_numpy()

In [150]:
X, y = data[:, 1:], data[:, 0]
m, n = X.shape

In [151]:
print(X)

[[3.00e+07 8.10e+01 1.00e+00 ... 0.00e+00 0.00e+00 0.00e+00]
 [6.50e+07 1.04e+02 1.00e+00 ... 0.00e+00 0.00e+00 0.00e+00]
 [1.60e+07 1.27e+02 1.00e+00 ... 0.00e+00 0.00e+00 0.00e+00]
 ...
 [8.00e+05 1.00e+02 1.00e+00 ... 0.00e+00 0.00e+00 0.00e+00]
 [2.00e+06 1.07e+02 1.00e+00 ... 0.00e+00 0.00e+00 0.00e+00]
 [5.00e+06 9.10e+01 0.00e+00 ... 0.00e+00 0.00e+00 0.00e+00]]


In [152]:
print(y)

[3.73554033e+08 2.62797249e+08 8.14521560e+07 ... 1.32861200e+06
 1.26879300e+06 1.41300000e+06]


Now, we normalize the data in X so that gradient descent converges faster (and so that data with higher magnitude is not weighted more heavily). 

In [153]:
def  featureNormalize(X):
    """
    Normalizes the features in X. returns a normalized version of X where
    the mean value of each feature is 0 and the standard deviation
    is 1. This is often a good preprocessing step to do when working with
    learning algorithms.
    
    Parameters
    ----------
    X : array_like
        The dataset of shape (m x n).
    
    Returns
    -------
    X_norm : array_like
        The normalized dataset of shape (m x n).
    
    Instructions
    ------------
    First, for each feature dimension, compute the mean of the feature
    and subtract it from the dataset, storing the mean value in mu. 
    Next, compute the  standard deviation of each feature and divide
    each feature by it's standard deviation, storing the standard deviation 
    in sigma. 
    
    Note that X is a matrix where each column is a feature and each row is
    an example. You needto perform the normalization separately for each feature. 
    
    Hint
    ----
    You might find the 'np.mean' and 'np.std' functions useful.
    """
    # You need to set these values correctly
    X_norm = X.copy()
    mu = np.zeros(X.shape[1])
    sigma = np.zeros(X.shape[1])

    # =========================== YOUR CODE HERE =====================
    for i in range(X.shape[1]):
        a = X[:, i]
        m = np.mean(a)
        mu[i] = np.mean(X[:, i])
        sigma[i] = np.std(X[:, i])
        for j in range(X.shape[0]):
            X_norm[j, i] = (X[j, i] - mu[i]) / sigma[i]
    # ================================================================
    return X_norm, mu, sigma

In [154]:
X_norm, mu, sigma = featureNormalize(X)
print('Computed mean:', mu)
print('Computed standard deviation:', sigma)

Computed mean: [3.11495134e+07 1.09989940e+02 8.94001490e-01 1.63934426e-02
 2.79433681e-03 1.21087928e-02 5.40238450e-03 7.07898659e-03
 1.30402385e-03 4.09836066e-03 0.00000000e+00 7.26527571e-03
 7.45156483e-04 5.21609538e-03 2.04918033e-03 0.00000000e+00
 1.30402385e-03 1.86289121e-04 4.65722802e-03 1.82563338e-02
 1.86289121e-04 7.45156483e-04 3.72578241e-04 3.72578241e-04
 7.45156483e-04 1.86289121e-04 1.86289121e-04 5.58867362e-04
 1.30402385e-03 1.86289121e-04 7.45156483e-04 7.45156483e-04
 4.84351714e-03 5.58867362e-04 1.86289121e-04 1.86289121e-04
 5.58867362e-04 0.00000000e+00 2.04918033e-03 0.00000000e+00
 1.86289121e-04 1.86289121e-04 1.49031297e-03 1.86289121e-04
 3.72578241e-04]
Computed standard deviation: [4.01907264e+07 2.16427067e+01 3.07835712e-01 1.26983061e-01
 5.27875790e-02 1.09371706e-01 7.33021060e-02 8.38383834e-02
 3.60877177e-02 6.38871200e-02 0.00000000e+00 8.49263886e-02
 2.72873822e-02 7.20339346e-02 4.52214682e-02 0.00000000e+00
 3.60877177e-02 1.364750



We can now add the column of 1s at the beginning, corresponding to the intercept feature $x_0 = 1$.

In [155]:
X = np.concatenate([np.ones((m, 1)), X], axis=1)

With our normalized features and our corresponding revenue data, we can run a fairly simple regularized linear regression. First, we create a cost function that returns both the cost J and the gradient grad of our model given some parameters. Then we use scipy.optimize to run the regression.

In [156]:
def linearRegCostFunction(X, y, theta, lambda_=0.0):
    """
    Compute cost and gradient for regularized linear regression 
    with multiple variables. Computes the cost of using theta as
    the parameter for linear regression to fit the data points in X and y. 
    
    Parameters
    ----------
    X : array_like
        The dataset. Matrix with shape (m x n + 1) where m is the 
        total number of examples, and n is the number of features 
        before adding the bias term.
    
    y : array_like
        The functions values at each datapoint. A vector of
        shape (m, ).
    
    theta : array_like
        The parameters for linear regression. A vector of shape (n+1,).
    
    lambda_ : float, optional
        The regularization parameter.
    
    Returns
    -------
    J : float
        The computed cost function. 
    
    grad : array_like
        The value of the cost function gradient w.r.t theta. 
        A vector of shape (n+1, ).
    
    Instructions
    ------------
    Compute the cost and gradient of regularized linear regression for
    a particular choice of theta.
    You should set J to the cost and grad to the gradient.
    """
    # Initialize some useful values
    m = y.size # number of training examples

    # You need to return the following variables correctly 
    J = 0
    grad = np.zeros(theta.shape)

    # ====================== YOUR CODE HERE ======================
    J_1 = (1. / (2. * m)) * np.sum((np.dot(X, theta) - y)**2)
    J_reg = (lambda_ / (2. * m)) * np.sum(theta[1:]**2)
    J = J_1 + J_reg
    
    grad_1 = (1. / m) * np.dot(X.T, (np.dot(X, theta) - y))
    grad_reg = (lambda_ / m) * theta
    grad_reg[0] = 0
    grad = grad_1 + grad_reg
    

    # ============================================================
    return J, grad

In [157]:
def trainLinearReg(linearRegCostFunction, X, y, lambda_=0.0, maxiter=200):
    """
    Trains linear regression using scipy's optimize.minimize.

    Parameters
    ----------
    X : array_like
        The dataset with shape (m x n+1). The bias term is assumed to be concatenated.

    y : array_like
        Function values at each datapoint. A vector of shape (m,).

    lambda_ : float, optional
        The regularization parameter.

    maxiter : int, optional
        Maximum number of iteration for the optimization algorithm.

    Returns
    -------
    theta : array_like
        The parameters for linear regression. This is a vector of shape (n+1,).
    """
    # Initialize Theta
    initial_theta = np.zeros(X.shape[1])

    # Create "short hand" for the cost function to be minimized
    costFunction = lambda t: linearRegCostFunction(X, y, t, lambda_)

    # Now, costFunction is a function that takes in only one argument
    options = {'maxiter': maxiter}

    # Minimize using scipy
    res = optimize.minimize(costFunction, initial_theta, jac=True, method='TNC', options=options)
    return res.x

In [158]:
theta = trainLinearReg(linearRegCostFunction, X, y, lambda_=0)
print(theta)

[-5.08928052e+07  2.96979314e+00  4.24513736e+05  2.66343295e+06
 -9.96210711e+06 -2.53112167e+06 -2.68200158e+06  1.42824014e+07
  4.71991657e+06  5.07349510e+06 -8.07601140e+06  0.00000000e+00
  1.36471499e+07 -3.00659033e+05  1.04243052e+06 -2.31757025e+06
  0.00000000e+00 -5.78169681e+06  6.29020611e+05  2.37862987e+06
 -6.63929684e+06  9.44169212e+05  1.08180846e+06 -1.07494657e+05
  3.33028125e+05  1.82847223e+06 -1.77691871e+05  5.62075686e+05
  1.02732837e+06  5.92051942e+04 -4.35299806e+05  2.05254213e+06
 -9.38796568e+05 -1.01254404e+07  1.17202259e+06  3.37688218e+04
 -4.47411568e+05 -6.67192520e+05  0.00000000e+00 -5.32168111e+06
  0.00000000e+00 -2.99134976e+05  6.88255626e+05  3.42368419e+06
 -8.00555897e+05  1.88405555e+05]
