## Importing necessary libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
%matplotlib inline
plt.style.use('ggplot')

In [None]:
pd.set_option('display.max_columns', None)

#### Reading in the Data

In [None]:
train=pd.read_csv(r'train.csv')
test=pd.read_csv(r'test.csv')

train.head()

In [None]:
test.head()

After taking an initial glance at the data, we can see that certain columns are in JSON or dictionary format, so I shall go ahead
and take a look at the datatypes that we have for the columns.

In [None]:
train.info()

In [None]:
type(train['production_countries'][0])

The columns were originally in JSON format but, due to the formatting of the CSV file, they were imported into the dataframe as strings. We need to turn the strings back into lists and dictionaries.

In [None]:
from ast import literal_eval

Storing the variables which contain the Dictionary format Data.

In [None]:
dict_types = ['belongs_to_collection','genres','production_countries','spoken_languages','production_companies','Keywords','cast','crew']

Function to convert the string columns back into list/dictionary format

In [None]:
def to_dict(data, cols):
    '''
    Function to convert string format into dict and list format.

    Each cell is handled as follows:
      * already a list or dict -> kept as-is, so re-running the cell that
        calls this function does not fail on previously parsed columns
      * missing (NaN/None)     -> replaced by an empty dict
      * anything else          -> parsed with ast.literal_eval

    Args: data = dataframe; the listed columns are overwritten in place
          cols = list; header names
    Returns: data = dataframe (the same object that was passed in)
    '''
    def _parse(cell):
        # Check the container types first: pd.isna() on a list returns an
        # array, which cannot be used reliably in a boolean context.
        if isinstance(cell, (list, dict)):
            return cell
        return {} if pd.isna(cell) else literal_eval(cell)

    for col in cols:
        data[col] = data[col].apply(_parse)

    return data

train = to_dict(train, dict_types)
test = to_dict(test, dict_types)

In [None]:
train.dtypes

Function to convert the Dictionary formatted variables to lists of the value contained in the 'name' key since that is of interest in our analysis

In [None]:
def dic_to_vals(data, cols):
    '''
    Function to turn values of dictionary variables into comma-separated
    strings of the 'name' values.

    For every column in `cols` a new column named '<col>_temp' is added.
    A cell equal to the empty dict (the placeholder for missing data)
    yields an empty string.

    Args: data = dataframe; new columns are added in place
          cols = list; column headers whose contents need to be transformed
    Returns: data = dataframe (the same object that was passed in)
    '''
    def _names(entries):
        # `entries` is iterated as a sequence of dicts that each have a 'name' key
        if entries == {}:
            return ''
        return ','.join(item['name'] for item in entries)

    # One pass: derive each '<col>_temp' column directly from its source column
    for col in cols:
        data[col + '_temp'] = data[col].apply(_names)

    return data

train = dic_to_vals(train, dict_types)
test = dic_to_vals(test, dict_types)

train.head()

Checking the shape of the new dataframes with the added columns

In [None]:
train.shape

In [None]:
test.shape

Checking for null values

In [None]:
train.isna().sum()

In [None]:
test.isna().sum()

## Exploratory Data Analysis

In [None]:
sns.jointplot(x="budget", y="revenue", data=train, height=11, ratio=4, color="g")
plt.show()

We can see that budget and revenue have a somewhat positive correlation, although it is not very strong.

In [None]:
train.columns

In [None]:
sns.jointplot(x="popularity", y="revenue", data=train, height=11, ratio=4, color="g")
plt.show()

Quite interestingly, popularity also doesn't show a strong correlation with revenue, which is counter-intuitive.

In [None]:
train['genres_temp'].value_counts().head(5)

Taking a look at the top most popular genres in which movies have been produced

In [None]:
plt.figure(figsize=(10,9))
sns.barplot(x='genres_temp', y='revenue', data=train[train['genres_temp'].isin(['Drama','Comedy','Drama,Romance','Comedy,Romance','Comedy,Drama'])])
plt.show()

The barplot above depicts the revenue generated by the most popular Genres and Comedy and Romance seem to dominate this

In [None]:
sns.jointplot(x="runtime", y="revenue", data=train, height=11, ratio=4, color="g")
plt.show()

A jointplot of runtime and revenue shows no clear positive correlation, and also shows that most movies have a runtime of about 100 to 150 minutes.

In [None]:
# Plot the distribution of the raw revenue column.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases —
# confirm the installed version still provides it before re-running.
plt.figure(figsize=(10,9))
sns.distplot(train.revenue)

The distribution of the revenue column is quite right-skewed, so we shall use a log transformation to make it resemble a normal distribution.

In [None]:
# log1p (= log(1 + x)) is used instead of log so that a revenue of 0 maps to 0
# rather than -inf, which would break the density estimate below.
train['log_revenue'] = np.log1p(train['revenue'])
sns.kdeplot(train.log_revenue)

In [None]:
# Split the release_date string on '/' into three integer columns:
#   1st piece -> release_month, 2nd piece -> release_date, 3rd piece -> release_year.
# Missing pieces are replaced by -1 before the cast to int.
# NOTE(review): release_date is overwritten here with the 2nd piece (presumably
# the day of month), so this cell is not re-runnable: the column no longer holds
# strings afterwards. Later cells rely on this reused name.
train[['release_month','release_date','release_year']]=train['release_date'].str.split('/', expand=True).replace(np.nan, -1).astype('int')

train.head()

Splitting the release date column into day, month and year, and storing them in separate columns

In [None]:
train.dtypes

In [None]:
# Expand two-digit years to four digits: 20-99 -> 19xx, 00-19 -> 20xx.
# Only values in [0, 100) are touched, so that
#   * re-running this cell does not shift already expanded years again, and
#   * the -1 sentinel used for a missing date is not turned into 1999.
train['release_year'] = train['release_year'].apply(
    lambda yr: (yr + 1900 if yr > 19 else yr + 2000) if 0 <= yr < 100 else yr
).astype(int)

train['release_year'].head()

The release year column is ambiguous because we don't know whether a two-digit value belongs to the 20th or the 21st century, so I have converted the column to show the full year instead of only the last 2 digits.

In [None]:
train.head()

In [None]:
# Number of movies released per year.
plt.figure(figsize=(20,12))
# Pass the column by keyword: a positional Series is interpreted as `data`
# (not `x`) by newer seaborn versions.
sns.countplot(x='release_year', data=train)
plt.title("Movie Release count by Year",fontsize=20)
plt.xticks(fontsize=12,rotation=90)
plt.show()

Taking a look at the number of movies produced every year, 2013 seems to have the highest count. It's also interesting to note that there is a strong upward trend in movie production since 1976.

In [None]:
# Number of movies released per calendar month.
plt.figure(figsize=(20,12))
# Pass the column by keyword: a positional Series is interpreted as `data`
# (not `x`) by newer seaborn versions.
sns.countplot(x='release_month', data=train)
plt.title("Movie Release count by Month",fontsize=20)
plt.xticks(fontsize=12)
plt.show()

In [None]:
# Number of movies released per day of the month
# (at this point the release_date column holds the day component only).
plt.figure(figsize=(20,12))
# Pass the column by keyword: a positional Series is interpreted as `data`
# (not `x`) by newer seaborn versions.
sns.countplot(x='release_date', data=train)
plt.title("Movie Release count by Day",fontsize=20)
plt.xticks(fontsize=12)
plt.show()

In [None]:
train.columns

In [None]:
# Re-assemble a full date string in day/month/year order from the three
# integer columns produced by the earlier split.
train['release_fulldate'] = (
    train['release_date'].astype(str)
    + '/' + train['release_month'].astype(str)
    + '/' + train['release_year'].astype(str)
)

train.head()

Recreating the release full-date column which I had replaced earlier to do certain time based analysis down the line

In [None]:
# release_fulldate was assembled as day/month/year, so the format must be given
# explicitly. Without it pandas tries month-first and silently swaps day and
# month for ambiguous dates such as 5/2/2015.
train['release_fulldate'] = pd.to_datetime(train['release_fulldate'], format='%d/%m/%Y')

In [None]:
import datetime as dt

In [None]:
train['day_of_week'] = train['release_fulldate'].dt.dayofweek
train['release_quarter'] = train['release_fulldate'].dt.quarter

Extracting the day of the week and the quarter of the year to further analyse how movies are released and what time is optimal

In [None]:
# Number of movies released per day of the week (value from .dt.dayofweek).
plt.figure(figsize=(20,12))
# Pass the column by keyword: a positional Series is interpreted as `data`
# (not `x`) by newer seaborn versions.
sns.countplot(x='day_of_week', data=train)
plt.title("Movie Release count by DayOFWeek",fontsize=20)
plt.xticks(fontsize=12)
plt.show()

In [None]:
# Number of movies released per quarter of the year.
plt.figure(figsize=(10,8))
# Pass the column by keyword: a positional Series is interpreted as `data`
# (not `x`) by newer seaborn versions.
sns.countplot(x='release_quarter', data=train)
plt.title("Movie Release count by Quarter",fontsize=20)
plt.xticks(fontsize=12)
plt.show()

In [None]:
train.columns

In [None]:
d1 = train.groupby('release_year')['revenue'].agg('mean').reset_index()
d1

In [None]:
plt.figure(figsize=(10,8))
sns.lineplot(x='release_year', y='revenue', data=d1)
plt.title("Avg Revenue by Year",fontsize=20)
labels = np.arange(1920,2019,4)
plt.xticks(labels,fontsize=12,rotation=90)
plt.show()

In [None]:
d1 = train.groupby('day_of_week')['revenue'].agg('mean').reset_index()
plt.figure(figsize=(15,5))
sns.lineplot(x='day_of_week', y='revenue', data=d1)
plt.title("Avg Revenue by Day of Week",fontsize=20)
plt.xticks(fontsize=12)
plt.show()

In [None]:
# Mean revenue per release quarter.
d1 = train.groupby('release_quarter')['revenue'].agg('mean').reset_index()
plt.figure(figsize=(15,5))
sns.lineplot(x='release_quarter', y='revenue', data=d1)
# Title corrected: the x-axis is the quarter, not a "day of quarter"
plt.title("Avg Revenue by Quarter",fontsize=20)
plt.xticks(fontsize=12)
plt.show()

In [None]:
d1 = train.groupby('release_year')['runtime'].agg('mean').reset_index()
plt.figure(figsize=(15,5))
sns.lineplot(x='release_year', y='runtime', data=d1)
plt.title("Avg runtime by Year",fontsize=20)
plt.xticks(fontsize=12)
plt.show()

In [None]:
# Mean popularity per release year.
d1 = train.groupby('release_year')['popularity'].agg('mean').reset_index()
plt.figure(figsize=(15,5))
sns.lineplot(x='release_year', y='popularity', data=d1)
# Title typo fixed ("poularity" -> "popularity")
plt.title("Avg popularity by Year",fontsize=20)
plt.xticks(fontsize=12)
plt.show()

In [None]:
# Mean budget and revenue per release year.
# Several columns must be selected with a list ([[...]]); the tuple form
# ['budget','revenue'] is rejected by current pandas versions.
d1 = train.groupby('release_year')[['budget','revenue']].agg('mean').reset_index()
d1

In [None]:
# Yearly mean budget vs. yearly mean revenue.
# Each series gets its own colour and a legend entry; previously both lines
# were drawn in the same green and could not be told apart.
plt.figure(figsize=(15,10))
plt.plot(d1['release_year'], d1['budget'], color="b", label="budget")
plt.plot(d1['release_year'], d1['revenue'], color="g", label="revenue")
plt.xticks(np.arange(1920,2018,4), rotation=90)
plt.xlabel("Years")
plt.ylabel("revenue & budget")
plt.legend()
plt.show()

In [None]:
train.head()

In [None]:
test.head()

In [None]:
genres = train.genres_temp.str.get_dummies(sep=',')
genres

In [None]:
train = pd.concat([train, genres], axis=1, sort=False)

train.head()

In [None]:
train.columns

In [None]:
# One violin plot of revenue per genre indicator column (0/1 dummy).
genres=['Action', 'Adventure', 'Animation',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy',
       'Foreign', 'History', 'Horror', 'Music', 'Mystery', 'Romance',
       'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western']

# 7 x 3 grid = 21 axes for 20 genres, so the last axis stays unused.
fig, ax= plt.subplots(7, 3, figsize=[20,15])
axes = ax.flatten()

# The previous guard `j==6 & i==2` was parsed as `j == (6 & i) == 2` because
# `&` binds tighter than `==`; it therefore skipped cell (2,2) instead of the
# last one. Pairing axes with genres row by row avoids the index bookkeeping.
for axis, genre in zip(axes, genres):
    sns.violinplot(x=genre, y='revenue', data=train, ax=axis)
    axis.set_xlabel(genre)
    axis.set_ylabel('Revenue')

# Remove every grid cell that did not receive a plot
for axis in axes[len(genres):]:
    fig.delaxes(axis)

In [None]:
train['belongs'] = train['belongs_to_collection'].apply(lambda x: 1 if x != {} else 0)

sns.boxplot(x = 'belongs', y='revenue', data=train)

In [None]:
train.columns

In [None]:
prod_comp = train.production_companies_temp.str.get_dummies(sep=',')
prod_comp

In [None]:
cast_comp = train.cast_temp.str.get_dummies(sep=',')
cast_comp

In [None]:
x = prod_comp.sum(axis=0).sort_values(ascending=False).head(15).reset_index()

In [None]:
x

In [None]:
y = cast_comp.sum(axis=0).sort_values(ascending=False).head(30).reset_index()
y

In [None]:
l1 = list(x.loc[:,'index'])
l1

In [None]:
l2 = list(y.loc[:,'index'])

In [None]:
prod_comp = prod_comp.drop(prod_comp.columns.difference(l1), axis=1)
prod_comp

In [None]:
cast_comp = cast_comp.drop(cast_comp.columns.difference(l2), axis=1)
cast_comp

In [None]:
train = pd.concat([train, cast_comp], axis=1, sort=False)

In [None]:
train = pd.concat([train, prod_comp], axis=1, sort=False)

In [None]:
train.shape

In [None]:
# One box plot of revenue per production-company indicator column (0/1 dummy).
prod_house=['Canal+', 'Columbia Pictures', 'Columbia Pictures Corporation',
       'Metro-Goldwyn-Mayer (MGM)', 'Miramax Films', 'New Line Cinema',
       'Paramount Pictures', 'Relativity Media', 'Touchstone Pictures',
       'TriStar Pictures', 'Twentieth Century Fox Film Corporation',
       'United Artists', 'Universal Pictures', 'Walt Disney Pictures',
       'Warner Bros.']

# 5 x 3 grid = 15 axes, one per production house.
fig, ax= plt.subplots(5, 3, figsize=[15,29])
axes = ax.flatten()

# The previous guard `j==4 & i==2` was parsed as `j == (4 & i) == 2`, which is
# never true, so it was dead code (and would have dropped the 15th plot had it
# worked as written). Axes and names are simply paired row by row instead.
for axis, house in zip(axes, prod_house):
    sns.boxplot(x=house, y='revenue', data=train, ax=axis)
    axis.set_xlabel(house)
    axis.set_ylabel('Revenue')

# Remove any grid cell left without a plot (none for 15 names on 15 axes)
for axis in axes[len(prod_house):]:
    fig.delaxes(axis)

In [None]:
train.columns

In [None]:
train = train.drop(['log_revenue','belongs_to_collection','Keywords','cast','crew','release_month','day_of_week','release_quarter','production_companies_temp','genres_temp','poster_path','homepage'], axis=1)

In [None]:
train.columns

In [None]:
test.columns

In [None]:
train['original_language'] = train['original_language'].apply(lambda x: 1 if x=='en' else 0)

In [None]:
train['production_countries_temp'].unique()

In [None]:
pip install xgboost

In [None]:
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor as GBC
from sklearn.model_selection import GridSearchCV,cross_val_score

In [None]:
xgm=XGBRegressor()
GBM=GBC()

In [None]:
train.head()

In [None]:
# Drop the columns currently sitting at positions 46..60 (selected by position,
# removed by label).
# NOTE(review): the positions are hard-coded and depend on the exact column
# order produced by the cells above — confirm which columns are meant.
positional_cols = train.columns[46:61]
train = train.drop(columns=positional_cols)

In [None]:
# Both regressors are tuned over the same hyper-parameter grid with 5-fold CV.
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [1, 2, 3],
    'n_estimators': [100, 200, 500],
}

best_gbm = GridSearchCV(GBM, param_grid=param_grid, cv=5, n_jobs=-1)
best_xgm = GridSearchCV(xgm, param_grid=param_grid, cv=5, n_jobs=-1)

In [None]:
# Columns excluded from the feature matrix: the target itself, identifiers,
# free-text fields, the raw list/dict columns and their '*_temp' string forms.
non_feature_cols = [
    'revenue', 'id', 'genres', 'imdb_id', 'overview',
    'production_companies', 'production_countries', 'release_date',
    'spoken_languages', 'status', 'tagline', 'title',
    'belongs_to_collection_temp', 'spoken_languages_temp', 'Keywords_temp',
    'crew_temp', 'original_title', 'release_fulldate',
    'production_countries_temp', 'cast_temp',
]

X = train.drop(columns=non_feature_cols)
Y = train['revenue']

In [None]:
X.isna().sum()

In [None]:
# Fill missing runtimes with the mean of the observed runtimes.
runtime_mean = X['runtime'].mean()
X['runtime'] = X['runtime'].fillna(runtime_mean)

In [None]:
X.head()

In [None]:
best_gbm.fit(X, Y)
best_xgm.fit(X, Y)

In [None]:
# 5-fold cross-validation of the best gradient-boosting estimator found by the
# grid search; scores are negated mean squared errors (closer to 0 is better).
# NOTE(review): X and Y are the same data the grid search was fitted on, so
# these scores may be optimistic — confirm whether a held-out set is intended.
cross_val_score(best_gbm.best_estimator_, X=X, y=Y, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

In [None]:
# 5-fold cross-validation of the best XGBoost estimator found by the grid
# search; scores are negated mean squared errors (closer to 0 is better).
# NOTE(review): X and Y are the same data the grid search was fitted on, so
# these scores may be optimistic — confirm whether a held-out set is intended.
cross_val_score(best_xgm.best_estimator_, X=X, y=Y, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)