In [1]:
import numpy as np
from catboost import Pool, CatBoostRegressor
import pandas as pd
from sklearn.model_selection import train_test_split as tts
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [3]:
# Regressor example: fit a tiny CatBoostRegressor on synthetic integer data.

# A private, seeded generator keeps the synthetic data reproducible across
# Restart & Run All without touching NumPy's global random state.
example_rng = np.random.RandomState(0)

# initialize data: 100 training rows and 50 test rows, 10 integer columns each
train_data = example_rng.randint(0, 100, size=(100, 10))
train_label = example_rng.randint(0, 1000, size=(100))
test_data = example_rng.randint(0, 100, size=(50, 10))

# initialize Pool; columns 0, 2 and 5 are declared categorical for both pools
example_cat_columns = [0, 2, 5]
train_pool = Pool(train_data, train_label, cat_features=example_cat_columns)
test_pool = Pool(test_data, cat_features=example_cat_columns)

# specify the training parameters
model = CatBoostRegressor(iterations=2, depth=2, learning_rate=1, loss_function='RMSE')
# train the model
model.fit(train_pool)
# make the prediction using the resulting model
preds = model.predict(test_pool)
# Last expression of the cell: the training-set score is displayed as output.
# NOTE(review): what score() returns (RMSE vs. R2) appears to depend on the
# installed catboost version - confirm before interpreting the number.
model.score(train_data,train_label)

0:	learn: 295.8825996	total: 47ms	remaining: 47ms
1:	learn: 289.8411978	total: 47.5ms	remaining: 0us


289.8411977876975

In [4]:
# Metric / loss-function names kept for reference.
# NOTE(review): presumably the names accepted by CatBoost; nothing else in this
# notebook reads this list - confirm it is still needed.
metrics = [
    "RMSE", "Logloss", "MAE", "CrossEntropy",
    "Quantile", "LogLinQuantile", "SMAPE", "MultiClass",
    "MultiClassOneVsAll", "MAPE", "Poisson", "PairLogit",
    "QueryRMSE", "QuerySoftMax", "Recall", "Precision",
    "F1", "TotalF1", "Accuracy", "AUC",
    "R2", "MCC", "PairAccuracy", "QueryAverage",
    "PFound", "CtrFactor",
]


In [5]:
# Toy example: rows mix string and integer columns; the first three columns
# (indices 0-2) are declared categorical.
cat_features = [0, 1, 2]
train_data = [
    ["a", "b", 1, 4, 5, 6],
    ["a", "b", 4, 5, 6, 7],
    ["c", "d", 30, 40, 50, 60],
]
test_data = [
    ["a", "b", 2, 4, 6, 8],
    ["a", "d", 1, 4, 50, 60],
]
train_labels = [10, 20, 30]

# Two shallow boosting iterations are enough for a smoke test.
model = CatBoostRegressor(iterations=2, learning_rate=1, depth=2)

# Train on the in-memory lists, then predict the two held-out rows.
model.fit(train_data, train_labels, cat_features=cat_features)
preds = model.predict(test_data)
print(preds)

0:	learn: 13.6167789	total: 470us	remaining: 470us
1:	learn: 8.2600948	total: 885us	remaining: 0us
[16.25 16.25]


In [2]:
# Load the engineered movie table and build the feature matrix X and the
# multi-column target frame y.
data = pd.read_csv("../Result_Data/total_engineered.csv")
# Drop the stray index column that is present in the CSV.
del data["Unnamed: 0"]
# Impute missing values with the per-column mean. numeric_only=True is required
# because the frame also holds non-numeric columns: pandas >= 2.0 raises a
# TypeError for DataFrame.mean() on such frames, whereas older versions silently
# skipped them. Columns without a mean are left un-imputed.
data = data.fillna(data.mean(numeric_only=True))

# Candidate feature columns. select_dtypes keeps only float64 / int / bool
# columns, so any requested column with another dtype is dropped here.
# NOTE(review): 'Title', 'Directors', 'Release Date' and the 'Actor_*' columns
# are presumably text and therefore excluded - confirm this is intended.
X = data[['Actor_0', 'Actor_1', 'Actor_2', 'Actor_3', 'Actor_4', 'Actor_5', 'Actor_6', 'Actor_7', 'Actor_8', 'Actor_9', 
           'Budget', 'Directors', 'Release Date', 'Runtime (mins)', 'Title', 'Year', 'Genre: Short', 'Genre:  Comedy', 
           'Genre: Fantasy', 'Genre: Film-Noir', 'Genre: War', 'Genre: Musical', 'Genre:  Sport', 'Genre: Biography', 
           'Genre: Action', 'Genre:  Fantasy', 'Genre:  Animation', 'Genre:  Biography', 'Genre: Mystery', 
           'Genre:  Musical', 'Genre:  Romance', 'Genre: Thriller', 'Genre:  Film-Noir', 'Genre:  History', 
           'Genre: Western', 'Genre: Drama', 'Genre: Sci-Fi', 'Genre:  Horror', 'Genre: Romance', 'Genre: Adventure', 
           'Genre:  Family', 'Genre:  Sci-Fi', 'Genre: Animation', 'Genre:  Music', 'Genre: Music', 'Genre: History', 
           'Genre:  Mystery', 'Genre:  Thriller', 'Genre: Comedy', 'Genre:  Crime', 'Genre: Horror', 'Genre:  Drama', 
           'Genre:  War', 'Genre:  Western', 'Genre:  Adventure', 'Genre: Family', 'Genre:  Action', 'Genre: Crime', 
           'Content Rating: PASSED', 'Content Rating: TV-MA', 'Content Rating: X', 'Content Rating: NC-17', 
           'Content Rating: TV-14', 'Content Rating: M', 'Content Rating: GP', 'Content Rating: TV-PG', 
           'Content Rating: PG', 'Content Rating: PG-13', 'Content Rating: G', 'Content Rating: NR', 
           'Content Rating: APPROVED', 'Content Rating: UNRATED', 'Content Rating: M/PG', 'Content Rating: TV-13', 
           'Content Rating: NOT RATED', 'Content Rating: TV-G', 'Content Rating: R', 'Decade',  'Budget_Adjusted',  
           'Length of Title', 'Directors Prev Number Movies', 'Directors Prev Mean Profit', 'Directors Prev Mean IMDb',
           'Directors Prev Mean Meta', 'Directors Prev Mean Num Votes', 'Directors Prev Mean Nominations', 
           'Directors Prev Mean Wins', 'Actor Weights']].select_dtypes(include=['float64','int','bool']).astype('float')
# Target columns, filtered and cast to float the same way as the features.
# Later cells address these targets by column position.
y = data[['Gross', 'IMDb Rating', 'Meta Score', 'Num Votes', 'Oscar Nominations', 'Oscar Wins', 'Other Nominations', 
           'Other Wins','Profit','Gross_Adjusted',  'Profit_Adjusted', 'Profit_Bool', 'Total Nominations', 'Total Wins']].select_dtypes(include=['float64','int','bool']).astype('float')

In [6]:
# Hold out 30% of the rows for evaluation. A fixed random_state makes the split,
# and every error figure computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.3, random_state=42)


In [7]:
# Build the CatBoostRegressor that the fitting cell reuses for every target.
catboost_params = {
    "iterations": 3,
    "learning_rate": 0.3,
    "depth": 3,
    "l2_leaf_reg": 64,
}
model = CatBoostRegressor(**catboost_params)
# Display the resulting parameter set as the cell output.
model.get_params()

{'depth': 3,
 'iterations': 3,
 'l2_leaf_reg': 64,
 'learning_rate': 0.3,
 'loss_function': 'RMSE'}

In [8]:
# Fit one model per target column and collect the test-set predictions.
# es_y has the same shape as y_test: one row per test sample, one column per target.
es_y = np.zeros(y_test.shape, dtype=float)

# Iterate over however many target columns y actually has rather than a
# hard-coded 14: y is filtered through select_dtypes, so the count is not
# guaranteed by the column list alone.
for i in range(y_train.shape[1]):
    # Re-fit the shared model on the i-th target (silently), then predict.
    model.fit(X_train, y_train.iloc[:, i], verbose=False)
    es_y[:, i] = model.predict(X_test)

In [9]:
# Get predictions
tt_y = y_test

print("\nMovie Gross Average Percent Error:")
print((abs(es_y[0:,0]-np.array(tt_y.astype(float))[0:,0])/abs(np.array(tt_y.astype(float))[0:,0])).mean()*100,"%")
print("\nIMDb Rating Average Percent Error:")
print((abs(es_y[0:,1]-np.array(tt_y.astype(float))[0:,1])/abs(np.array(tt_y.astype(float))[0:,1])).mean()*100,"%")
print("\nMeta Score Average Percent Error:")
print((abs(es_y[0:,2]-np.array(tt_y.astype(float))[0:,2])/abs(np.array(tt_y.astype(float))[0:,2])).mean()*100,"%")
print("\nNumber of Votes Average Percent Error:")
print((abs(es_y[0:,3]-np.array(tt_y.astype(float))[0:,3])/abs(np.array(tt_y.astype(float))[0:,3])).mean()*100,"%")
print("\nOscar Nominations Average Error:")
print(abs(es_y[0:,4]-np.array(tt_y.astype(float))[0:,4]).mean())
print("\nOscar Wins Average Error:")
print(abs(es_y[0:,5]-np.array(tt_y.astype(float))[0:,5]).mean())
print("\nOther Nominations Average Error:")
print(abs(es_y[0:,6]-np.array(tt_y.astype(float))[0:,6]).mean())
print("\nOther Wins Average Error:")
print(abs(es_y[0:,7]-np.array(tt_y.astype(float))[0:,7]).mean())
print("\nProfit Average Percent Error:")
print((abs(es_y[0:,8]-np.array(tt_y.astype(float))[0:,8])/abs(np.array(tt_y.astype(float))[0:,8])).mean()*100,"%")
print("\nGross Adjusted Average Percent Error:")
print((abs(es_y[0:,9]-np.array(tt_y.astype(float))[0:,9])/abs(np.array(tt_y.astype(float))[0:,9])).mean()*100,"%")
print("\nProfit Adjusted Average Percent Error:")
print((abs(es_y[0:,10]-np.array(tt_y.astype(float))[0:,10])/abs(np.array(tt_y.astype(float))[0:,10])).mean()*100,"%")
print("\nProfit Bool Average Error:")
print(abs(es_y[0:,11]-np.array(tt_y.astype(float))[0:,11]).mean())
print("\nTotal Nominations Average Error:")
print(abs(es_y[0:,12]-np.array(tt_y.astype(float))[0:,12]).mean())
print("\nTotal Wins Average Error:")
print(abs(es_y[0:,13]-np.array(tt_y.astype(float))[0:,13]).mean())


Movie Gross Average Percent Error:
70624.98290236352 %

IMDb Rating Average Percent Error:
34.78105812494239 %

Meta Score Average Percent Error:
37.03139814594714 %

Number of Votes Average Percent Error:
121.97682393614116 %

Oscar Nominations Average Error:
0.3245159800342501

Oscar Wins Average Error:
0.1851303821285033

Other Nominations Average Error:
7.631405849030134

Other Wins Average Error:
4.797607943279798

Profit Average Percent Error:
766.7767171280962 %

Gross Adjusted Average Percent Error:
54119.177317979236 %

Profit Adjusted Average Percent Error:
812.4398804399978 %

Profit Bool Average Error:
0.34760453890501325

Total Nominations Average Error:
7.861992118498511

Total Wins Average Error:
4.958356777651253


In [None]:
# Grid-search the number of PCA components placed in front of a CatBoost
# regressor. The CatBoost settings are single-valued and match the model above.
# Keys use the "<step name>__<parameter>" form required by Pipeline.
# NOTE(review): a pca__n_components value larger than the number of feature
# columns (or training rows per fold) makes that candidate fail to fit -
# confirm 100 is valid for X.
parameters = {
    'pca__n_components': [10, 20, 50, 100],
    'catB__iterations': [3],  # was misspelled 'catB__interations'
    'catB__learning_rate': [.3],
    'catB__depth': [3],
    'catB__loss_function': ['RMSE'],
    'catB__l2_leaf_reg': [64],
}
pca = PCA()
catB = CatBoostRegressor()
pipe = Pipeline(steps=[('pca', pca), ('catB', catB)])
estimator = GridSearchCV(pipe, parameters, n_jobs = -1, verbose = 1)

# One grid search per target column; predictions come from the refitted best
# pipeline. Uses the split from the train/test cell (X_train, X_test, y_train,
# y_test): the previous tr_x / tr_y / tt_x names are not defined in this notebook.
es_y = np.zeros(y_test.shape, dtype=float)
for i in range(y_train.shape[1]):
    # Fit parameters must be routed to a pipeline step by prefix, so the
    # CatBoost logging switch is passed as catB__verbose.
    estimator.fit(X_train, y_train.iloc[:, i], catB__verbose=False)
    # best_estimator_ (trailing underscore) is the attribute set by fit().
    es_y[:, i] = estimator.best_estimator_.predict(X_test)
# Report the mean error of the grid-search predictions (es_y) against the
# held-out targets (tt_y), one line pair per target column.

# Convert the targets to a float ndarray once instead of once per expression.
grid_actual = np.array(tt_y.astype(float))

# (label, report_as_percent) in target-column order. Percent rows print the
# mean of |pred - true| / |true| * 100; the others print the mean absolute error.
# NOTE(review): rows whose true value is 0 make the percent ratio inf/nan.
grid_report_spec = [
    ("Movie Gross", True),
    ("IMDb Rating", True),
    ("Meta Score", True),
    ("Number of Votes", True),
    ("Oscar Nominations", False),
    ("Oscar Wins", False),
    ("Other Nominations", False),
    ("Other Wins", False),
    ("Profit", True),
    ("Gross Adjusted", True),
    ("Profit Adjusted", True),
    ("Profit Bool", False),
    ("Total Nominations", False),
    ("Total Wins", False),
]

for col, (label, as_percent) in enumerate(grid_report_spec):
    abs_err = abs(es_y[0:, col] - grid_actual[0:, col])
    if as_percent:
        print("\n" + label + " Average Percent Error:")
        print((abs_err / abs(grid_actual[0:, col])).mean() * 100, "%")
    else:
        print("\n" + label + " Average Error:")
        print(abs_err.mean())

Fitting 3 folds for each of 4 candidates, totalling 12 fits
