In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ast

In [None]:
# Load the training split of the dataset into a DataFrame
df = pd.read_csv(filepath_or_buffer="Resources/train.csv")

In [None]:
# Preview the first five rows of the training frame
df.head(n=5)

In [None]:
# Print the DataFrame summary to inspect the raw columns before any cleaning
df.info()

In [None]:
# Missing runtimes are recorded as 0 instead of being left blank
df.loc[df["runtime"].isna(), "runtime"] = 0

In [None]:
# Binary language flag: 1 when the original language is 'en', 0 for anything else
df['original_language'] = np.where(df['original_language'] == 'en', 1, 0)

In [None]:
# Re-check the summary after the runtime and original_language columns were rewritten
df.info()

In [None]:
# Change homepage and belongs_to_collection columns to yes or no format (1 / 0).
# The flag is derived directly from notnull() so each column becomes a clean
# integer column; writing 1 into the existing column and then filling the gaps
# with 0 mixes integers into a non-numeric column.
df["homepage"] = df["homepage"].notnull().astype("int64")

df["belongs_to_collection"] = df["belongs_to_collection"].notnull().astype("int64")

In [None]:
# Has_tagline is 1 for rows that carry a tagline and 0 for rows where it is missing
df["Has_tagline"] = df["tagline"].notnull().astype("int64")

In [None]:
# Columns whose cells hold a stringified list of dicts and still need unpacking
need_strings = ["genres", "production_companies", "production_countries"]

In [None]:
# Replace each stringified list of dicts with the plain list of its "name" values
for column in need_strings:
    has_value = df[column].notnull()  # only non-missing cells can be parsed
    parsed = df.loc[has_value, column].apply(ast.literal_eval)
    df.loc[has_value, column] = parsed.apply(
        lambda entries: [entry["name"] for entry in entries]
    )

In [None]:
# Parse the cast and crew strings into Python lists, then record how many
# entries each list holds in cast_count / crew_count
for people_col in ("cast", "crew"):
    present = df[people_col].notnull()
    df.loc[present, people_col] = df.loc[present, people_col].apply(ast.literal_eval)

for people_col in ("cast", "crew"):
    # rows without a parsed list receive no count here and stay NaN
    df[people_col + "_count"] = df.loc[df[people_col].notnull(), people_col].apply(len)

In [None]:
# Replace the remaining blanks in the genre and count columns with 0
for blank_col in ("genres", "cast_count", "crew_count"):
    df[blank_col] = df[blank_col].fillna(0)

df.head()


In [None]:
#Use get dummies to change genres into 0s and 1s and merge to original dataframe
# Steps: expand each genre list to one row per (movie, genre), turn the genre
# names into indicator columns, then collapse back to one row per movie.
# groupby(level=0).sum() replaces .sum(level=0), which was deprecated and then
# removed from pandas; dtype=int keeps the indicators as integers.
genre_dummies = (
    pd.get_dummies(df['genres'].apply(pd.Series).stack(), dtype=int)
    .groupby(level=0)
    .sum()
)
df = genre_dummies.merge(df, left_index=True, right_index=True)
df.head()

In [None]:
# Inspect the frame after the genre indicator columns were merged in
df.info()

In [None]:
# Feature matrix for the random forest regressor: the genre indicator columns
# followed by the numeric / flag columns, in a fixed order
genre_columns = [
    "Action", "Adventure", "Animation", "Comedy", "Crime",
    "Documentary", "Drama", "Family", "Fantasy", "Foreign",
    "History", "Horror", "Music", "Mystery", "Romance",
    "Science Fiction", "Thriller", "TV Movie", "War", "Western",
]
numeric_columns = [
    "budget", "popularity", "runtime", "original_language", "homepage",
    "Has_tagline", "belongs_to_collection", "cast_count", "crew_count",
]
X = df[genre_columns + numeric_columns]
print(X.head())

In [None]:
# Regression target: the revenue column
y = df.loc[:, "revenue"]


In [None]:
# Sanity check: feature matrix and target shapes, printed side by side
print(f"{X.shape} {y.shape}")

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

In [None]:
#Train, test, split the train dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
#Import RandomForest Regressor
from sklearn.ensemble import RandomForestRegressor

# random_state fixes the forest's internal randomness so that the score and the
# feature importances are the same on every Restart & Run All
rf = RandomForestRegressor(n_estimators=200, n_jobs=1, random_state=42)
rf = rf.fit(X_train, y_train)


In [None]:
# Check score of random forest
# Evaluated on the held-out X_test / y_test partition, not on the rows used for fitting
rf.score(X_test, y_test)

In [None]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf.feature_importances_
# Bare name on the last line so the notebook displays the values
importances

In [None]:
#Import Linear Regression model
from sklearn import linear_model
# Unfitted estimator; it is trained on the budget-only split in a later cell
model = linear_model.LinearRegression()

In [None]:
# Single-feature design matrix for the linear model: budget only (kept two-dimensional)
x = df.loc[:, ["budget"]]


In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [None]:
print(x.shape, y.shape)

In [None]:
#fit and score the data
model.fit(x_train,y_train)
model.score(x_test,y_test)

In [None]:
# Put the random forest predictions for X_test next to the actual revenue.
# NOTE(review): y_test must be the target partition produced together with X_test.
predict = rf.predict(X_test)
comparison = pd.DataFrame(index=y_test.index)
comparison["Prediction"] = predict.astype(int)
comparison["Actual"] = y_test
comparison

In [None]:
# Load the test split and preview its first five rows
test = pd.read_csv(filepath_or_buffer="Resources/test.csv")
test.head(n=5)

In [None]:
#Fill blank runtime entries with zero value
test["runtime"]=test["runtime"].fillna(0)

# Change original language to if english or not
test['original_language'] = [1 if x == 'en' else 0 for x in test['original_language']]

# Change homepage column to yes or no format (1 / 0).
# The flag is derived from notnull() so the column becomes a clean integer
# column instead of integers written into a non-numeric column.
test["homepage"] = test["homepage"].notnull().astype("int64")

# Add column whether the movie has a tagline or not
test["Has_tagline"] = test["tagline"].notnull().astype("int64")

# Change belongs_to_collection to yes or no format (1 / 0), same approach as homepage
test["belongs_to_collection"] = test["belongs_to_collection"].notnull().astype("int64")



In [None]:
# Columns of the test frame whose cells hold a stringified list of dicts
need_strings = ["genres", "production_companies", "production_countries"]

In [None]:
# Replace each stringified list of dicts in the test frame with the list of its "name" values
for column in need_strings:
    has_value = test[column].notnull()  # only non-missing cells can be parsed
    parsed = test.loc[has_value, column].apply(ast.literal_eval)
    test.loc[has_value, column] = parsed.apply(
        lambda entries: [entry["name"] for entry in entries]
    )

In [None]:
# Parse the cast and crew strings of the test frame into lists, then store
# the number of entries per row in cast_count / crew_count
for people_col in ("cast", "crew"):
    present = test[people_col].notnull()
    test.loc[present, people_col] = test.loc[present, people_col].apply(ast.literal_eval)

for people_col in ("cast", "crew"):
    # rows without a parsed list receive no count here and stay NaN
    test[people_col + "_count"] = test.loc[test[people_col].notnull(), people_col].apply(len)

In [None]:
# Replace the remaining blanks in the genre and count columns of the test frame with 0
for blank_col in ("genres", "cast_count", "crew_count"):
    test[blank_col] = test[blank_col].fillna(0)

test.head()

In [None]:
# Use get dummies to change genres to 0s and 1s and merge to test dataframe
# Steps: expand each genre list to one row per (movie, genre), turn the genre
# names into indicator columns, then collapse back to one row per movie.
# groupby(level=0).sum() replaces .sum(level=0), which was deprecated and then
# removed from pandas; dtype=int keeps the indicators as integers.
test_genre_dummies = (
    pd.get_dummies(test['genres'].apply(pd.Series).stack(), dtype=int)
    .groupby(level=0)
    .sum()
)
test = test_genre_dummies.merge(test, left_index=True, right_index=True)
test.head()

In [None]:
#test CSV did not have any TV Movie genres so added an empty column
# Only create the column when it is actually absent, so that real indicator
# values are never overwritten with zeros if the genre does appear in the data
if "TV Movie" not in test.columns:
    test["TV Movie"] = 0

In [None]:
# Feature matrix for the test frame: the genre indicator columns followed by the
# numeric / flag columns, in the same order used for the training features
genre_features = [
    "Action", "Adventure", "Animation", "Comedy", "Crime",
    "Documentary", "Drama", "Family", "Fantasy", "Foreign",
    "History", "Horror", "Music", "Mystery", "Romance",
    "Science Fiction", "Thriller", "TV Movie", "War", "Western",
]
other_features = [
    "budget", "popularity", "runtime", "original_language", "homepage",
    "Has_tagline", "belongs_to_collection", "cast_count", "crew_count",
]
features = test[genre_features + other_features]

# Name of the column being predicted
target = 'revenue'

In [None]:
# Reuse the scaler that was fitted earlier on the training features. Fitting a
# fresh scaler on the test rows would standardize them with different means and
# variances than the data the random forest was trained on, so the same raw
# value would reach the model as a different input.
features = scaler.transform(features)

In [None]:
# Inspect the prepared test frame, including the columns added in the cells above
test.info()

In [None]:
# create predictions variable for linear regression
# Only the budget column is passed, matching the single feature the model was fit on
predictions = model.predict(test[["budget"]])
predictions

In [None]:
# create predictions variable for random forest
# features is the scaled feature matrix built from the test frame in the cells above
rf_predicts = rf.predict(features)
rf_predicts

In [None]:
# Collect id, title and both models' integer revenue predictions in one frame
predict_df = pd.DataFrame(
    {
        "id": test["id"],
        "title": test["title"],
        "lin_revenue": predictions.astype(int),
        "rf_revenue": rf_predicts.astype(int),
    }
)

In [None]:
# Rank the movies from highest to lowest predicted random forest revenue
predict_df = predict_df.sort_values("rf_revenue", ascending=False)
predict_df