In [None]:
Exercise 3.1

Build a linear model that uses only a constant term (a column of ones) to predict a continuous outcome (like domestic total gross). How can you interpret the results of this model? What does it predict? Make a plot of predictions against actual outcome. Make a histogram of residuals. How are the residuals distributed?
Exercise 3.2

Repeat the process of Exercise 3.1, but also add one continuous (numeric) predictor variable. Also add plots of model prediction against your feature variable and residuals against feature variable. How can you interpret what's happening in the model?
Exercise 3.3

Repeat the process of Exercise 3.1, but add a categorical feature (like genre). You'll have to convert a column of text into a number of numerical columns ("dummy variables"). How can you interpret what's happening in the model?
Exercise 3.4

Enhance your model further by adding more features and/or transforming existing features. Think about how you build the model matrix and how to interpret what the model is doing.
Exercise 3.5

Fitting and checking predictions on the exact same data set can be misleading. Divide your data into two sets: a training and a test set (roughly 75% training, 25% test is a fine split). Fit a model on the training set, check the predictions (by plotting versus actual values) in the test set.

## 3.1

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_predict
from sklearn import linear_model
import seaborn as sns
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.cross_validation import train_test_split
from datetime import datetime

%matplotlib inline

In [None]:
# Load the 2013 movie data, discard every row that has a missing value, and
# add a column of ones that serves as the intercept term in the statsmodels fits.
data = (
    pd.read_csv('2013_movies.csv')
    .dropna()
    .assign(const_var=1)
)

In [None]:
# Exercise 3.1: regress domestic gross on nothing but the column of ones.
y = data['DomesticTotalGross']   # response
X = data['const_var']            # design "matrix": the constant column only
olsmod = sm.OLS(endog=y, exog=X)
ols_res = olsmod.fit()
# In-sample predictions, reused by the plotting cell below.
ypred = ols_res.predict(exog=X)
print(ols_res.summary())

How can you interpret the results of this model?

The R^2 value (zero) tells us that the constant variable explains none of the variation in DomesticTotalGross.
DomesticTotalGross varies from movie to movie while the constant remains flat, so the single fitted coefficient is simply the mean of DomesticTotalGross.

What does it predict?

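It predicts the same value, the mean DomesticTotalGross, for every movie: y changes in the data without any change in x, so the prediction is a flat line.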

In [None]:
# Predicted vs. actual gross for the constant-only model. The dashed red
# diagonal spans the observed range of y and marks perfect prediction.
fig, ax = plt.subplots()
ax.scatter(y, ypred, alpha=0.3)
lo, hi = y.min(), y.max()
ax.plot([lo, hi], [lo, hi], 'r--', lw=3)
ax.set(title='Predicted vs. Actual', xlabel='Actual', ylabel='Predicted')
plt.show()

In [None]:
# Residual diagnostics for the constant-only model fitted above (`ols_res`).
# The histogram and the residuals-by-row plot get separate axes because their
# x-axes mean different things (residual value vs. row label).
resid = ols_res.resid
# Roughly one bin per three observations; `bins` must be an integer, at least 1.
n_bins = max(1, int(round(len(resid) * 0.33)))

fig, (ax_hist, ax_seq) = plt.subplots(1, 2, figsize=(12, 4))
ax_hist.hist(resid, bins=n_bins)
ax_hist.set_title('Residuals')
ax_hist.set_xlabel('Residual')
resid.plot(ax=ax_seq)
ax_seq.set_title('Residuals by row')
ax_seq.set_ylabel('Residual')
plt.show()

## 3.2

In [None]:
# Columns relevant to Exercise 3.2, copied into their own frame for inspection.
# reset_index() moves the old row labels into an 'index' column.
cols_3_2 = ['DomesticTotalGross', 'const_var', 'Budget']
data2 = data.loc[:, cols_3_2].reset_index()
data2.info()

In [None]:
# Exercise 3.2: constant plus one numeric predictor (Budget).
y2 = data['DomesticTotalGross']
X2 = data[['Budget', 'const_var']]
olsmod = sm.OLS(endog=y2, exog=X2)
olsres2 = olsmod.fit()
# In-sample predictions, reused by the plotting cell below.
ypred2 = olsres2.predict(exog=X2)
print(olsres2.summary())

In [None]:
# Predicted vs. actual gross for the Budget model. The dashed red diagonal
# spans the observed range of y2 and marks perfect prediction.
fig, ax = plt.subplots()
ax.scatter(y2, ypred2, alpha=0.3)
lo, hi = y2.min(), y2.max()
ax.plot([lo, hi], [lo, hi], 'r--', lw=2)
ax.set(title='Predicted vs. Actuals', xlabel='Actuals', ylabel='Predicted')
plt.show()

In [None]:
# Histogram of residuals for the Budget model.
# Roughly one bin per three observations; `bins` must be an integer, at least 1.
n_bins = max(1, int(round(len(olsres2.resid) * 0.33)))
# Call matplotlib directly: the `sns.plt` alias is not available in current seaborn.
plt.hist(olsres2.resid, bins=n_bins)
plt.xlabel('Residual')
plt.title('Residuals');


In [None]:
# Q-Q plot of the Budget model's residuals, drawn on the current axes.
res = olsres2.resid  # residuals
ax = plt.gca()
fig = sm.qqplot(res, fit=True, line='r', ax=ax)
plt.show()

In [None]:
#With Budget added, the fit improves over the constant-only model: Budget gets its own beta coefficient and a relationship with DomesticTotalGross starts to emerge.

## 3.3

In [None]:
# Exercise 3.3: expand the text column `Rating` into one indicator ("dummy")
# column per rating level and attach them to the modelling frame.
data3 = data[['const_var', 'Rating', 'DomesticTotalGross']].reset_index()
# Cast to int: newer pandas returns boolean dummies, and a bool/int mix turns
# into an object array that statsmodels OLS refuses to fit.
rating_dummies = pd.get_dummies(data3['Rating']).astype(int)
data3 = pd.concat([data3, rating_dummies], axis=1)
data3 = data3.drop(['Rating'], axis=1)
data3.info()

In [None]:
# Exercise 3.3 fit: constant plus the 'PG-13' and 'R' indicator columns.
# Rating levels not listed here are absorbed by the constant (baseline group).
y3 = data3['DomesticTotalGross']
X3 = data3[['PG-13', 'R', 'const_var']]
olsmod = sm.OLS(endog=y3, exog=X3)
olsres3 = olsmod.fit()
# In-sample predictions, reused by the plotting cell below.
ypred3 = olsres3.predict(exog=X3)
print(olsres3.summary())

In [None]:
# Predicted vs. actual gross for the rating-dummy model. The dashed red
# diagonal spans the observed range of y3 and marks perfect prediction.
fig, ax = plt.subplots()
ax.scatter(y3, ypred3, alpha=0.3)
lo, hi = y3.min(), y3.max()
ax.plot([lo, hi], [lo, hi], 'r--', lw=2)
ax.set(title='Predicted vs. Actual', xlabel='Actual', ylabel='Predicted')
plt.show()

In [None]:
# Histogram of residuals for the rating-dummy model.
# Roughly one bin per three observations; `bins` must be an integer, at least 1.
n_bins = max(1, int(round(len(olsres3.resid) * .33)))
# Call matplotlib directly: the `sns.plt` alias is not available in current seaborn.
plt.hist(olsres3.resid, bins=n_bins)
plt.xlabel('Residual')
plt.title('Residuals');

## 3.4

In [None]:
# Exercise 3.4 model matrix: square-root-transformed budget, runtime, and one
# indicator column per rating level. The frame is built with .assign on a new
# object instead of assigning into a column slice of `data`, which pandas
# flags with SettingWithCopyWarning.
# A release-month feature was explored but removed again before fitting, so it
# is not constructed here.
y = data['DomesticTotalGross']
X = data[['Budget', 'Runtime']].assign(Budget=np.sqrt(data['Budget']))
# astype(int) keeps the indicator columns numeric regardless of pandas version.
X = pd.concat([X, pd.get_dummies(data['Rating']).astype(int)], axis=1)

# Initialize linear model
lr = linear_model.LinearRegression()
lr_fit = lr.fit(X, y)
# cross_val_predict returns an array of the same size as `y` where each entry
# is a prediction obtained by cross validation:
predicted = cross_val_predict(lr, X, y, cv=10)

# Cross-validated predictions against measured gross; the dashed red diagonal
# marks perfect prediction.
fig, ax = plt.subplots()
ax.scatter(y, predicted)
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'r--', lw=3)
ax.set_title('Cross-validated Predicted vs. Measured')
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()

## 3.5

In [None]:
# Exercise 3.5: hold out 25% of the rows as a test set. random_state is fixed
# so the split is reproducible.
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
# Fit on the training rows only, then evaluate on the held-out rows.
# `normalize=True` is omitted: the argument has been removed from
# LinearRegression, and for ordinary least squares with an intercept,
# rescaling the features does not change the predictions.
model_lr = linear_model.LinearRegression(fit_intercept=True)
model_lr.fit(X_train, y_train)
pred_vals_lr = model_lr.predict(X_test)
# Root-mean-squared error on the test set, in the units of y.
RMSE_lr = np.sqrt(np.mean((pred_vals_lr - y_test) ** 2))

# .score() returns R^2; print() calls run on both Python 2 and Python 3.
print('Train set: %.2f' % model_lr.score(X_train, y_train))
print('Test set: %.2f' % model_lr.score(X_test, y_test))
print('Test RMSE: %.2f' % RMSE_lr)

# Test-set predictions against actual values, as the exercise asks; the dashed
# red diagonal marks perfect prediction.
fig, ax = plt.subplots()
ax.scatter(y_test, pred_vals_lr, alpha=0.3)
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
ax.set_title('Predicted vs. Actual (test set)')
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
plt.show()