# **FODS Final Review: What makes a movie profitable?**

### **Required Packages**

In [40]:
import numpy as np
import pandas as pd
import scipy.stats as st
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn import metrics

### **Read Dataset**

In [41]:
data=pd.read_csv("/content/profit_movies.csv")
data.drop('Unnamed: 0',axis=1,inplace=True)
data.head(3)

Unnamed: 0,adult,budget,genres,popularity,poster_path,production_countries,release_date,revenue,runtime,spoken_languages,title,video,vote_average,vote_count,profit
0,False,237000000,"['Action', 'Adventure', 'Fantasy', 'Science Fi...",185.070892,/kmcqlZGaSh20zpTbuoF0Cdn07dT.jpg,"['United States of America', 'United Kingdom']",2009-10-12,2787965087,162.0,"['English', 'Español']",Avatar,False,7.2,12114.0,2550965087
1,False,245000000,"['Action', 'Adventure', 'Science Fiction', 'Fa...",31.626013,/weUSwMdQIa3NaXVzwUoIIcAi85d.jpg,['United States of America'],2015-12-15,2068223624,136.0,['English'],Star Wars: The Force Awakens,False,7.5,7993.0,1823223624
2,False,200000000,"['Drama', 'Romance', 'Thriller']",26.88907,/kHXEpyfl6zqn8a6YuozZUujufXf.jpg,['United States of America'],1997-11-18,1845034188,194.0,"['English', 'Français', 'Deutsch', 'svenska', ...",Titanic,False,7.5,7770.0,1645034188


### **Dataset Overview**

In [42]:
data.describe()

Unnamed: 0,budget,popularity,revenue,runtime,vote_average,vote_count,profit
count,1543.0,1543.0,1543.0,1543.0,1543.0,1543.0,1543.0
mean,59985270.0,15.459826,255913700.0,113.968244,6.50674,1770.085548,195928500.0
std,54221630.0,23.525478,236457500.0,21.589158,0.767088,1813.188649,201898600.0
min,113.0,0.452934,51525170.0,69.0,3.2,2.0,50020090.0
25%,20000000.0,8.969442,112949600.0,98.0,6.0,550.5,76450180.0
50%,40000000.0,11.746053,174578800.0,110.0,6.5,1158.0,125362600.0
75%,80000000.0,15.693102,307009500.0,126.0,7.1,2308.0,230363100.0
max,380000000.0,547.488298,2787965000.0,238.0,9.1,14075.0,2550965000.0


In [43]:
rows,cols=data.shape
print("Rows:",rows,"Cols:",cols)

Rows: 1543 Cols: 15


In [44]:
per=(data.isnull().sum().sum()/rows)*100
print("Percentage of missing values:",round(per,3),"%")

Percentage of missing values: 0.0 %


### **Random Sample of 10 datapoints from the population**

In [45]:
sample=data.sample(30)
print("Sample Shape:",sample.shape)

Sample Shape: (30, 15)


### **Hypothesis 1**

* H0: Average Budget of a profitable movie =  60 Million (approx)
* Ha: Average Budget of a profitable movie != 60 Million
* Thus, a two-tailed test is required.

In [46]:
#Extract Budget from Sample
budget=sample.budget.values
print("Budget:",*budget)

Budget: 40000000 40000000 15000000 85000000 175000000 100000000 250000000 180000000 4000000 80000000 113 40000000 14000000 5000000 16000000 2500000 145000000 38000000 4800000 85000000 42000000 60000000 40000000 8000000 105000000 78000000 27000000 50000000 16000000 125000000


In [47]:
#Initialize Variables
n=len(budget)             # Number of DataPoints
mu=np.mean(budget)        # Sample Mean
xbar=np.mean(data.budget) # Population Mean
sigma=np.std(data.budget) # Population Standard Deviation
alpha=0.05                # Level of Significance
print(n,mu,xbar,sigma,alpha)

30 62343337.1 59985274.628645495 54204052.9925719 0.05


* n=10
* mu=62400000.0
* xbar=59985274.628645495
* sigma=54204052.9925719
* alpha=0.05

In [48]:
#Z and Z Critical Value 
Z=(xbar-mu)/(sigma/n**0.5)
print("Z value:",Z)

Z_Crit=abs(st.norm.ppf(alpha/2))
left=-1*Z_Crit
right=Z_Crit
print("Z Critical:",Z_Crit)
print("Left:",left,"Right:",right)

Z value: -0.23827812428421885
Z Critical: 1.9599639845400545
Left: -1.9599639845400545 Right: 1.9599639845400545


In [49]:
#Inferences
rejection=False
if(Z>right or Z<left): 
  rejection=True
else:
  rejection=False
print("Rejection:",rejection)

Rejection: False


It fails to reject null hypothesis. Hence, there is enough evidence to claim that **Average Budget of a profitable movie =  60 Million**.

### **Machine Learning Model: Prediction of Profit**

In [50]:
df=pd.read_csv('/content/clean_data_movies.csv')
df.drop('Unnamed: 0',axis=1,inplace=True)
df.head(3)

Unnamed: 0,adult,budget,genres,popularity,poster_path,production_countries,release_date,revenue,runtime,spoken_languages,title,video,vote_average,vote_count,profit
0,False,30000000,"['Animation', 'Comedy', 'Family']",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,['United States of America'],1995-10-30,373554033,81.0,['English'],Toy Story,False,7.7,5415.0,343554033
1,False,65000000,"['Adventure', 'Fantasy', 'Family']",17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,['United States of America'],1995-12-15,262797249,104.0,"['English', 'Français']",Jumanji,False,6.9,2413.0,197797249
2,False,16000000,"['Comedy', 'Drama', 'Romance']",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,['United States of America'],1995-12-22,81452156,127.0,['English'],Waiting to Exhale,False,6.1,34.0,65452156


* Adult, Video needs encoding
* Genres, Poster Path, Release Date, Spoken Languages, Title needs to be handled.
* Data Normalization is required.

In [51]:
encoder=LabelEncoder()
print("Previous:",df['adult'].unique())
df['adult']=encoder.fit_transform(df['adult'])
print("Now:",df['adult'].unique())

encoder=LabelEncoder()
print("Previous:",df['video'].unique())
df['video']=encoder.fit_transform(df['video'])
print("Now:",df['video'].unique())

Previous: [False]
Now: [0]
Previous: [False]
Now: [0]


In [52]:
cols=['genres','poster_path','production_countries','release_date','spoken_languages','title']
df.drop(cols,inplace=True,axis=1)
df.head(3)

Unnamed: 0,adult,budget,popularity,revenue,runtime,video,vote_average,vote_count,profit
0,0,30000000,21.946943,373554033,81.0,0,7.7,5415.0,343554033
1,0,65000000,17.015539,262797249,104.0,0,6.9,2413.0,197797249
2,0,16000000,3.859495,81452156,127.0,0,6.1,34.0,65452156


In [53]:
#X and Y split
X=df.iloc[:,:-1]
Y=df.iloc[:,-1]

In [54]:
#Train Test Split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.25,random_state=1)

In [55]:
#Data Normalization
ss=StandardScaler()
X_train=ss.fit_transform(X_train)
X_test=ss.transform(X_test)

In [56]:
#Linear Regression
regressor=LinearRegression()
regressor=regressor.fit(X_train,Y_train)
Y_pred=regressor.predict(X_test)

In [57]:
#Metrics Report
print("RMSE:",round(metrics.mean_squared_error(Y_test,Y_pred)**0.5,10))
print("R2:",metrics.r2_score(Y_test,Y_pred))

RMSE: 7.92e-08
R2: 1.0
