In [1]:
import pandas as pd 
import statsmodels.api as sm 
import numpy as np 
from scipy.optimize import minimize

# 1. Bisection


One of the most common algorithms for numerical root-finding is *bisection*.

To understand the idea, recall the well-known game where:

- Player A thinks of a secret number between 1 and 100  
- Player B asks if it’s less than 50  
  
  - If yes, B asks if it’s less than 25  
  - If no, B asks if it’s less than 75  
  

And so on.

This is bisection, a relative of [binary search](https://en.wikipedia.org/wiki/Binary_search_algorithm). It works for all sufficiently well behaved increasing continuous functions with $ f(a) < 0 < f(b) $. 

Write an implementation of the bisection algorith, `bisect(f, lower, upper, tol)` which, given a function `f`, a lower bound `lower` and an upper bound `upper` finds the point `x` where `f(x) = 0`. The parameter `tol` is a numerical tolerance, you should stop once your step size is smaller than `tol`.


Use it to minimize the function:

$$
f(x) = \sin(4 (x - 1/4)) + x + x^{20} - 1 \tag{2}
$$

in python: `lambda x: np.sin(4 * (x - 1/4)) + x + x**20 - 1`

The value where f(x) = 0 should be around `0.408`

In [2]:
def bisect(f, lower, upper, tol):
    while upper - lower > tol:
        yup = f(upper)
        ylow = f(lower)
        if yup * ylow < 0:
            xmid = (upper + lower) / 2
            ymid = f(xmid)
            if ymid * yup < 0:
                lower = xmid
            else:
                upper = xmid
        else:
            return 'didnt work (type 1)'
    if xmid:
        return xmid
    else:
        return 'didnt work (type 2)'
    
f = lambda x: np.sin(4 * (x - 1/4)) + x + x**20 - 1
bisect(f, -1, +1, 0.0001)

0.40826416015625

# 1.2 (stretch) Recursive Bisect

Write a recursive version of the bisection algorithm

In [3]:
# while difference in bounds > tol:
# check for sign
def r_bisect(f, lower, upper, tol, xmid='not set'):
    yup = f(upper)
    ylow = f(lower)
    while upper - lower > tol:
        if yup * ylow < 0:
            xmid = (upper + lower) / 2
            ymid = f(xmid)

            if ymid * yup < 0:
                return r_bisect(f, xmid, upper, tol, xmid)
            else:
                return r_bisect(f, lower, xmid, tol, xmid)
        else:
            return 'didnt work, adjust bounds'
    return xmid
    
f = lambda x: np.sin(4 * (x - 1/4)) + x + x**20 - 1
r_bisect(f, -1, +1, 0.0001)

0.40826416015625

# 2.1 Movies Regression

Write the best linear regression model you can on the [Movies Dataset](https://www.kaggle.com/rounakbanik/the-movies-dataset?select=ratings.csv) to predict the profitability of a movie (revenue - budget). Maintain the interpretability of the model.

Few notes:

1. Clean your data! Movies where the budget or revenue are invalid should be thrown out

2. Be creative with feature engineering. You can include processing to one-hot encode the type of movie, etc.

3. The model should be useful for someone **who is thinking about making a movie**. So features like the popularity can't be used. You could, however, use the ratings to figure out if making "good" or "oscar bait" movies is a profitable strategy.

In [4]:
def director_finder_3000(series):
    for e in eval(series):
        if e['job'] == 'Director':
            return e['name']
        
data_url = r'../Data/movie_data/credits.csv'
dataframe = pd.read_csv(data_url)
cred = dataframe
cred['director'] = cred['crew'].apply(director_finder_3000)

data_url = r'../Data/movie_data/movies_metadata.csv'
dataframe = pd.read_csv(data_url)
meta = dataframe

meta['id'] = pd.to_numeric(meta['id'], errors='coerce')
meta = meta.fillna(0)
cred['id'] = cred['id'].astype(int)
meta['id'] = meta['id'].astype(int)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [51]:
df = meta.merge(cred, on='id', how='left')
df = df[['director', 'belongs_to_collection', 'budget', 'original_language', 'release_date', 'revenue', 'runtime',
         'vote_average', 'vote_count']]
# quantitative features
money = ['budget','revenue']
quant = ['vote_average', 'vote_count', 'revenue','budget','runtime']# 'release_month']
# features to make into polynomials
to_poly = ['budget']#,'runtime']
# qualitative features
qual = ['director','original_language']
# externally sourced list of 'top 10' directors
top_dir = ['Christopher Nolan','Steven Spielberg', 'Quentin Tarantino', 'Martin Scorsese', 'David Fincher','Stanley Kubrick','Robert Zemeckis','Ridley Scott','Francis Ford Coppola','Clint Eastwood']

to_drop = ['original_language','release_date','director','revenue','profit']#'vote_average']


# df['top10dir'] = df['director'].isin(top_dir).astype(int)

    
    
# df['in_eng'] = df['original_language'] == 'en'
# df['in_eng'] = df.in_eng.astype(int)

df['belongs_to_collection'] = (df['belongs_to_collection'] != 0).astype(int)

# df['release_month'] = (df['release_date'].astype(str)
#                                          .str.slice(start=5,stop=7))
# df['release_month'].value_counts()

# force numeric columns into int, filling NAs with 0
for col in quant:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
#     df[col+'_missing'] = (df[col] == 0).astype(int)


for col in to_poly:
    df[col + '2'] = df[col]**2
    
df['profit'] = (df['revenue'] - df['budget'])

X = df.copy()
X = X.drop(to_drop, 1)

Y = df.profit
for c in X.columns:
    if X[c].std()<.001:
        X = X.drop(c,1)
        print(c)
X = sm.add_constant(X)
mod = sm.OLS(Y, X).fit(cov_type = 'HC2')
mod.summary()
# df.tail()



0,1,2,3
Dep. Variable:,profit,R-squared:,0.636
Model:,OLS,Adj. R-squared:,0.636
Method:,Least Squares,F-statistic:,261.0
Date:,"Sun, 31 Jan 2021",Prob (F-statistic):,5.779999999999999e-168
Time:,13:12:25,Log-Likelihood:,-850810.0
No. Observations:,45542,AIC:,1702000.0
Df Residuals:,45538,BIC:,1702000.0
Df Model:,3,,
Covariance Type:,HC2,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-132.6765,17.677,-7.505,0.000,-167.323,-98.030
belongs_to_collection,-4.9608,1.432,-3.464,0.001,-7.768,-2.154
budget,-0.2423,0.123,-1.973,0.049,-0.483,-0.002
runtime,-1.454e+04,1935.134,-7.515,0.000,-1.83e+04,-1.07e+04
vote_average,-719.3151,99.689,-7.216,0.000,-914.702,-523.928
vote_count,6.966e+04,3561.718,19.559,0.000,6.27e+04,7.66e+04
budget2,5.689e-09,1.04e-09,5.451,0.000,3.64e-09,7.73e-09

0,1,2,3
Omnibus:,62705.528,Durbin-Watson:,1.971
Prob(Omnibus):,0.0,Jarque-Bera (JB):,126100899.61
Skew:,7.223,Prob(JB):,0.0
Kurtosis:,260.381,Cond. No.,1e+16


# 2.2 Movies Manual Regression

Use your `X` and `y` matrix from 2.1 to calculate the linear regression yourself using the normal equation $(X^T X)^{-1}X^Ty$.

Verify that the coefficients are the same.

In [52]:
# (X-transpose matrix times x-regular)inverted, times x-transpose, times y matrix
realx = np.linalg.inv(X.T@X) @ X.T @ Y
realx

0   -4.133007e+05
1    1.020861e+07
2   -2.608059e-01
3   -4.021553e+03
4   -2.737260e+05
5    6.884777e+04
6    5.717372e-09
dtype: float64

# 2.3 Movies gradient descent regression

Use your `X` and `y` matrix from 2.1 to calculate the linear regression yourself using **gradient descent**. 

Hint: use `scipy.optimize` and remember we're finding the $\beta$ that minimizes the squared loss function of linear regression: $f(\beta) = (\beta X - y)^2$. This will look like part 3 of this lecture.

Verify your coefficients are similar to the ones in 2.1 and 2.2. They won't necessarily be exactly the same, but should be roughly similar.

In [53]:
target = lambda betas : np.sum((Y - (X @ betas)) ** 2)

In [54]:
betas = np.random.rand(X.shape[1])
est = minimize(target, betas, method='powell', options= {'maxiter':15000, 'ftol':0.000001} )
print(est)
estx = est.x

   direc: array([[-3.54267958e+13,  1.72424548e+12,  1.02236926e+05,
         2.93130097e+11,  1.31557898e+11,  9.58918197e+09,
        -2.23884400e-03],
       [-1.46925106e+12, -1.82160443e+12,  7.79112518e+03,
        -3.93748106e+09,  3.38942767e+11, -1.99523947e+08,
        -4.97649789e-05],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         1.00000000e+00],
       [-4.27037748e+11,  4.26639628e+11,  3.22542387e+03,
         1.12074534e+09,  5.92247369e+10, -2.77263209e+08,
         1.28499384e-05],
       [ 1.13380152e+13, -1.14593997e+13,  2.12179043e+06,
         3.04903956e+10, -4.29470297e+12, -6.32103006e+10,
        -5.48987933e-03],
       [ 2.17634430e+14,  7.80692666e+00,  5.25615173e+07,
        -2.92343956e+12,  9.42285605e+12,  2.44698443e+11,
        -3.32973878e-01],
       [-1.23879905e+09, -2.10988067e+09,  5.33107691e+00,
         1.88205978e+07,  4.21446381e+07, -2.89014314e+05,
        

In [55]:
realx - estx

0   -3.611317e+03
1   -8.024525e+02
2    9.299447e-06
3    3.889105e+00
4    4.202073e+02
5    7.336304e-01
6   -3.777937e-13
dtype: float64