In [635]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

from matplotlib import pyplot as plt

In [636]:
# Read the cleaned movie metadata from disk, then print a per-column summary
# (column names, non-null counts, dtypes) to see what is available.
md = pd.read_csv(filepath_or_buffer='clean_metadata.csv')
md.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5368 entries, 0 to 5367
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             5368 non-null   int64  
 1   belongs_to_collection  5368 non-null   int64  
 2   budget                 5368 non-null   float64
 3   genres                 5368 non-null   object 
 4   id                     5368 non-null   int64  
 5   original_language      5368 non-null   object 
 6   overview               5368 non-null   object 
 7   popularity             5368 non-null   float64
 8   production_companies   5368 non-null   object 
 9   production_countries   5368 non-null   object 
 10  release_date           5368 non-null   object 
 11  revenue                5368 non-null   float64
 12  runtime                5368 non-null   float64
 13  spoken_languages       5368 non-null   object 
 14  title                  5368 non-null   object 
 15  vote

Now we extract the revenue, collection-membership, budget, original-language, and runtime columns so that we can plot these features against revenue.

In [637]:
# Build the working frame for the flop analysis from `md`.
#
# The five columns are selected by name, in the order the later cells expect.
# Name-based selection (instead of dropping by integer position) means a change
# in the CSV's column order cannot silently pull in the wrong columns, and a
# missing column raises a KeyError here rather than appearing as an all-NaN
# column further down.
column_order = ['revenue', 'belongs_to_collection', 'budget', 'original_language', 'runtime']

# Deep copy so that columns added to / removed from `rg1` never touch `md`.
rg1 = md.loc[:, column_order].copy(deep=True)

In [638]:
# Print the column names, non-null counts and dtypes of the reduced frame.
rg1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5368 entries, 0 to 5367
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   revenue                5368 non-null   float64
 1   belongs_to_collection  5368 non-null   int64  
 2   budget                 5368 non-null   float64
 3   original_language      5368 non-null   object 
 4   runtime                5368 non-null   float64
dtypes: float64(3), int64(1), object(1)
memory usage: 209.8+ KB


We now perform some feature engineering. First, we replace the original_language column with a binary indicator column, en, that is 1 when the original language is English and 0 otherwise.

In [639]:
# Encode the language as a 0/1 flag: 1 when original_language is 'en', 0 otherwise.
rg1['en'] = rg1['original_language'].eq('en').astype('int64')
# The raw language code is not needed once the flag exists.
rg1 = rg1.drop(columns='original_language')

Most importantly, we add a column 'return' that holds each movie's return ratio (revenue / budget).

In [640]:
# Return ratio: revenue earned per unit of budget, computed row by row.
rg1['return'] = rg1['revenue'].div(rg1['budget'])

We now also create a column 'is_flop' from the new 'return' column, where 'is_flop' is 1 when 'return' < 1, and is 0 otherwise.

In [641]:
# Label a row as a flop (1) when its return ratio is below 1, i.e. revenue did
# not cover the budget; everything else (including a NaN ratio) is labelled 0.
rg1['is_flop'] = rg1['return'].lt(1).astype('int64')
# Preview the first five rows, transposed so each row of rg1 becomes a column.
rg1.head().T

Unnamed: 0,0,1,2,3,4
revenue,373554000.0,262797200.0,81452160.0,187436800.0,64350170.0
belongs_to_collection,1.0,0.0,0.0,0.0,0.0
budget,30000000.0,65000000.0,16000000.0,60000000.0,35000000.0
runtime,81.0,104.0,127.0,170.0,106.0
en,1.0,1.0,1.0,1.0,1.0
return,12.4518,4.043035,5.09076,3.123947,1.838576
is_flop,0.0,0.0,0.0,0.0,0.0


We check how many flops there are compared to non-flops.

In [642]:
# (rows, columns) of the subset that is NOT flagged as a flop.
rg1.loc[rg1['is_flop'].eq(0)].shape

(3770, 7)

In [643]:
# (rows, columns) of the subset that IS flagged as a flop.
rg1.loc[rg1['is_flop'].eq(1)].shape

(1598, 7)

In [644]:
# Keep every predictor in the frame. The modelling cell further down builds
#     X = data[:, 1:-2]    and    y = data[:, -1]
# which only works for the layout [revenue, <predictors...>, return, is_flop].
# Dropping 'en', 'runtime', 'belongs_to_collection' and 'budget' at this point
# would leave just [revenue, return, is_flop], and the 1:-2 slice would then
# select zero feature columns.
# Selecting by name pins the column order that slice relies on and keeps this
# cell safe to re-run.
rg1 = rg1[['revenue', 'belongs_to_collection', 'budget', 'runtime', 'en', 'return', 'is_flop']]

We convert the pandas dataframe to a numpy array.

In [645]:
# Convert all columns currently in rg1 into a single 2-D NumPy array (one row
# per row of rg1). Later cells pick features and the label out of it by column
# position, so the column order of rg1 at this point matters.
data = rg1.to_numpy()

In [646]:
# Features: every column except the first one and the last two.
# Label: the last column (is_flop).
X, y = data[:, 1:-2], data[:, -1]

# Fail early with a clear message if an upstream cell removed the predictor
# columns, instead of handing an empty matrix to the scaler / model.
if X.shape[1] == 0:
    raise ValueError(
        f"No feature columns: data has {data.shape[1]} column(s), so data[:, 1:-2] is empty. "
        "Check that the predictor columns were not dropped from rg1 before calling to_numpy()."
    )

# Optional polynomial feature expansion (currently disabled):
# pol = PolynomialFeatures(degree=4)
# Xp = pol.fit_transform(X)

# create testing and training datasets
# (raw, unscaled splits are kept under X_train / X_test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# normalize X_train, X_test
# The scaler is fitted on the training split only and then applied to the test
# split, so no test-set statistics leak into training.
scaler = StandardScaler()
Xp_train = scaler.fit_transform(X_train)
Xp_test = scaler.transform(X_test)

# run logistic regression
# The label is binary, so no multi_class setting is needed; the explicit
# multi_class='ovr' argument is deprecated in recent scikit-learn releases.
model = LogisticRegression()
model.fit(Xp_train, y_train)

# Accuracy on the held-out 20% test split.
y_pred = model.predict(Xp_test)
print(accuracy_score(y_test, y_pred))

0.7001862197392924
