In [1]:
import scipy.optimize as opt
from scipy.stats import norm
import numpy as np
import pandas as pd
import json
import sympy as sy
import statsmodels.api as sm

In [2]:
# FINDING NANS by Jasleen

# percent_missing=X.isnull().sum() * 100 / len(X)
# missing_value_df = pd.DataFrame({'column_name': X.columns,
#                                  'percent_missing': percent_missing,
#                                   'count_missing':X.isnull().sum()})
# missing_value_df.sort_values('percent_missing', inplace=True)

# 1. Bisection


One of the most common algorithms for numerical root-finding is *bisection*.

To understand the idea, recall the well-known game where:

- Player A thinks of a secret number between 1 and 100  
- Player B asks if it’s less than 50  
  
  - If yes, B asks if it’s less than 25  
  - If no, B asks if it’s less than 75  
  

And so on.

This is bisection, a relative of [binary search](https://en.wikipedia.org/wiki/Binary_search_algorithm). It works for all sufficiently well behaved increasing continuous functions with $ f(a) < 0 < f(b) $. 

Write an implementation of the bisection algorith, `bisect(f, lower, upper, tol)` which, given a function `f`, a lower bound `lower` and an upper bound `upper` finds the point `x` where `f(x) = 0`. The parameter `tol` is a numerical tolerance, you should stop once your step size is smaller than `tol`.


Use it to minimize the function:

$$
f(x) = \sin(4 (x - 1/4)) + x + x^{20} - 1 \tag{2}
$$

in python: `lambda x: np.sin(4 * (x - 1/4)) + x + x**20 - 1`

The value where f(x) = 0 should be around `0.408`

In [3]:
# received help from Javad

In [4]:
x = sy.symbols("x")
sy.diff(sy.sin(4 * (x - 1/4)) + x + x**20 - 1, x)

20*x**19 + 4*cos(4*x - 1.0) + 1

In [5]:
def f(x):
#     return 4 * np.cos((4 * x - 1)) + 1 + 20 * x**19
    return np.sin(4 * (x - 1/4)) + x + x**20 - 1

In [6]:
def bisect(f, lower, upper, tol):
    mid = (upper + lower) / 2
    y_left = f(lower)
    y_right = f(upper)
    y_mid = f(mid)
    
    if abs(upper - lower) < tol:
        return mid
    
    if y_left * y_right > 0:
        print("change your lower/upper")
        return
    
    if y_left * y_mid > 0: 
        return bisect(f, mid, upper, tol)
    else:
        return bisect(f, lower, mid, tol)

#     SOLUTION:
#     while upper - lower > tol:
#         mid = 0.5 * (upper + lower)
#         if f(mid) > 0:   # root is between lower and middle
#             lower, upper = lower, mid
#         else:            # root is between middle and upper
#             lower, upper = mid, upper
            
#     return 0.5 * (upper + lower)

In [7]:
x_min = bisect(f, 0, 1, .001)
x_min

0.40869140625

# 1.2 (stretch) Recursive Bisect

Write a recursive version of the bisection algorithm

# 2.1 Movies Regression

Write the best linear regression model you can on the [Movies Dataset](https://www.kaggle.com/rounakbanik/the-movies-dataset?select=ratings.csv) to predict the profitability of a movie (revenue - budget). Maintain the interpretability of the model.

Few notes:

1. Clean your data! Movies where the budget or revenue are invalid should be thrown out

2. Be creative with feature engineering. You can include processing to one-hot encode the type of movie, etc.

3. The model should be useful for someone **who is thinking about making a movie**. So features like the popularity can't be used. You could, however, use the ratings to figure out if making "good" or "oscar bait" movies is a profitable strategy.

In [8]:
df = pd.read_csv("data/movies_metadata.csv")
df = df.dropna(subset = ["budget", "revenue", "runtime"])
df.budget = df.budget.astype(int)
df.revenue = df.revenue.astype(int)
df = df[df.budget != 0]
df = df[df.revenue != 0]
df = df.drop(columns = ["id", "imdb_id", "overview", "poster_path", "homepage", "popularity", "vote_average", "vote_count",
                       "tagline", "title", "original_title", "video"])

  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
# Received help from Javad

X = df.copy()

X.belongs_to_collection = X.belongs_to_collection.fillna("{'name': 'No collection'}")
X.belongs_to_collection = X.belongs_to_collection.apply(eval)
X.belongs_to_collection = X.belongs_to_collection.apply(lambda x: x["name"])

X.release_date = pd.to_datetime(X.release_date)
X["modern"] = (X.release_date > "2000-01-01").astype(int)

# Keeping only the first one in each, assuming it is the primary (based on very loose testing)
X.genres = X.genres.apply(eval)
X.genres = X.genres.apply(lambda x : x[0]["name"] if len(x) else "None")

X.production_companies = X.production_companies.apply(eval)
X.production_companies = X.production_companies.apply(lambda x : x[0]["name"] if len(x) else "None")

X.production_countries = X.production_countries.apply(eval)
X.production_countries = X.production_countries.apply(lambda x : x[0]["name"] if len(x) else "None")

# creating a binary column: 
X["has_collection"] = (X.belongs_to_collection != "No collection").astype(int) # has collection or not
X["country"] = (X.production_countries == "United States of America").astype(int) # produced in the US or not

top10 = X.production_companies.value_counts()
top10 = top10[:12]
top10 = top10.index
top10 = top10.drop("None")

X["production"] = X.production_companies.apply(lambda x: (x in top10)) # top10 production companies or not
X.production = X.production.astype(int) # cannot do .astype(int) in apply lambda for some reason

X["language"] = X.original_language.apply(lambda x: (x == "en")) # if original language was enlgish or not
X.language = X.language.astype(int) # same reason as above

X["for_adults"] = X.adult.apply(lambda x: 1 if x == 1 else 0)

X = X.join(pd.get_dummies(X.genres, drop_first = True))
X = X.join(pd.get_dummies(X.status, drop_first = True))

# dropping columns that are no longer needed
X = X.drop(columns = ["belongs_to_collection", "production_countries", "production_companies", "genres", "spoken_languages",
                     "adult", "original_language", "status", "revenue", "release_date", "Rumored", "Released", "None"])

In [10]:
X.head()

Unnamed: 0,budget,runtime,modern,has_collection,country,production,language,for_adults,Adventure,Animation,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,30000000,81.0,0,1,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,65000000,104.0,0,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,16000000,127.0,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,60000000,170.0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,35000000,106.0,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
X = sm.add_constant(X, has_constant = 'add')

In [12]:
y = df.revenue

# revisions: 
X = X.drop(columns = ["modern", "production", "language", "for_adults"]) 
X = X.drop(columns = ["Crime","Fantasy", "Horror", "Mystery", "Science Fiction", "Thriller", "War", "Western", "Documentary",
                        "Foreign", "History", "Music", "TV Movie"]) 

est = sm.OLS(y, X).fit()
est.summary()

0,1,2,3
Dep. Variable:,revenue,R-squared:,0.55
Model:,OLS,Adj. R-squared:,0.549
Method:,Least Squares,F-statistic:,655.4
Date:,"Sat, 30 Jan 2021",Prob (F-statistic):,0.0
Time:,12:50:09,Log-Likelihood:,-107280.0
No. Observations:,5380,AIC:,214600.0
Df Residuals:,5369,BIC:,214700.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.017e+08,8.86e+06,-11.478,0.000,-1.19e+08,-8.43e+07
budget,2.6004,0.042,62.214,0.000,2.518,2.682
runtime,6.684e+05,7.46e+04,8.955,0.000,5.22e+05,8.15e+05
has_collection,8.057e+07,3.79e+06,21.268,0.000,7.31e+07,8.8e+07
country,1.224e+07,3.21e+06,3.810,0.000,5.94e+06,1.85e+07
Adventure,1.973e+07,6.04e+06,3.269,0.001,7.9e+06,3.16e+07
Animation,6.987e+07,9.71e+06,7.199,0.000,5.08e+07,8.89e+07
Comedy,1.697e+07,4.19e+06,4.045,0.000,8.74e+06,2.52e+07
Drama,9.65e+06,4.01e+06,2.408,0.016,1.79e+06,1.75e+07

0,1,2,3
Omnibus:,2531.73,Durbin-Watson:,1.937
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2277941.765
Skew:,-0.749,Prob(JB):,0.0
Kurtosis:,103.795,Cond. No.,515000000.0


# 2.2 Movies Manual Regression

Use your `X` and `y` matrix from 2.1 to calculate the linear regression yourself using the normal equation $(X^T X)^{-1}X^Ty$.

Verify that the coefficients are the same.

In [13]:
X = X.drop(columns = ["const"])

In [14]:
np.linalg.inv(X.T @ X) @ X.T @ y

0    2.659555e+00
1   -1.220905e+05
2    7.293247e+07
3   -1.104919e+05
4    1.501562e+07
5    4.710976e+07
6    5.548095e+06
7    7.675802e+06
8    6.153250e+07
9    2.234696e+07
dtype: float64

In [24]:
# The coefficient for runtime is **VERY** off, the rest are close but not an identical match.

# 2.3 Movies gradient descent regression

Use your `X` and `y` matrix from 2.1 to calculate the linear regression yourself using **gradient descent**. 

Hint: use `scipy.optimize` and remember we're finding the $\beta$ that minimizes the squared loss function of linear regression: $f(\beta) = (\beta X - y)^2$. This will look like part 3 of this lecture.

Verify your coefficients are similar to the ones in 2.1 and 2.2. They won't necessarily be exactly the same, but should be roughly similar.

In [15]:
X = X.to_numpy()
y = y.reset_index(drop = True)

In [27]:
def squaredLoss(betas, y, x):
    result = 0
    
    for i in range(0, len(y)):
        xb = np.dot(x[i], betas)
        llf = (xb - y[i])  ** 2
        result += llf
    return result

In [28]:
bhat = np.zeros(len(X[0]))
probit_est = opt.minimize(squaredLoss, bhat, args=(y,X), method='nelder-mead')

In [33]:
probit_est['x']

array([ 2.89307841, -1.55767358,  0.08871133,  0.54729502, -0.71134332,
       -0.00400039, -0.60025477,  1.41786171, -0.65935014, -0.39338817])