## Load the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Load the data

In [2]:
# Read the movie dataset from the CSV in the working directory and preview its first rows.
data = pd.read_csv(filepath_or_buffer='Movie_regression.csv')
data.head(n=5)

Unnamed: 0,Marketing expense,Production expense,Multiplex coverage,Budget,Movie_length,Lead_ Actor_Rating,Lead_Actress_rating,Director_rating,Producer_rating,Critic_rating,Trailer_views,3D_available,Time_taken,Twitter_hastags,Genre,Avg_age_actors,Num_multiplex,Collection
0,20.1264,59.62,0.462,36524.125,138.7,7.825,8.095,7.91,7.995,7.94,527367,YES,109.6,223.84,Thriller,23,494,48000
1,20.5462,69.14,0.531,35668.655,152.4,7.505,7.65,7.44,7.47,7.44,494055,NO,146.64,243.456,Drama,42,462,43200
2,20.5458,69.14,0.531,39912.675,134.6,7.485,7.57,7.495,7.515,7.44,547051,NO,147.88,2022.4,Comedy,38,458,69400
3,20.6474,59.36,0.542,38873.89,119.3,6.895,7.035,6.92,7.02,8.26,516279,YES,185.36,225.344,Drama,45,472,66800
4,21.381,59.36,0.542,39701.585,127.7,6.92,7.07,6.815,7.07,8.26,531448,NO,176.48,225.792,Drama,55,395,72400


## Observation:

1. Collection is the target (dependent) variable in the data.

## Data Cleaning, Data Preprocessing

In [3]:
# Per-column count of missing entries (isna is an alias of isnull).
data.isna().sum()

Marketing expense       0
Production expense      0
Multiplex coverage      0
Budget                  0
Movie_length            0
Lead_ Actor_Rating      0
Lead_Actress_rating     0
Director_rating         0
Producer_rating         0
Critic_rating           0
Trailer_views           0
3D_available            0
Time_taken             12
Twitter_hastags         0
Genre                   0
Avg_age_actors          0
Num_multiplex           0
Collection              0
dtype: int64

In [4]:
# Look at the raw values of the Time_taken column.
data.loc[:, 'Time_taken']

0      109.60
1      146.64
2      147.88
3      185.36
4      176.48
        ...  
501    186.96
502    132.24
503    109.56
504    158.80
505    205.60
Name: Time_taken, Length: 506, dtype: float64

In [6]:
# Fill the missing Time_taken entries with the column's most frequent value,
# then re-check the per-column missing counts.
# NOTE(review): Time_taken is a float column; the mode of a continuous variable can be
# arbitrary — confirm it is preferable to the median or mean here.
time_taken_mode = data['Time_taken'].mode()[0]
data['Time_taken'] = data['Time_taken'].fillna(time_taken_mode)
data.isna().sum()

Marketing expense      0
Production expense     0
Multiplex coverage     0
Budget                 0
Movie_length           0
Lead_ Actor_Rating     0
Lead_Actress_rating    0
Director_rating        0
Producer_rating        0
Critic_rating          0
Trailer_views          0
3D_available           0
Time_taken             0
Twitter_hastags        0
Genre                  0
Avg_age_actors         0
Num_multiplex          0
Collection             0
dtype: int64

## Feature Encoding

In [8]:
# Encode the YES/NO flag in 3D_available as 1/0.
dic = {'NO': 0, 'YES': 1}

# replace() on a string column only yields an integer column when pandas silently
# downcasts the result, and that implicit downcast is deprecated. Cast explicitly
# so the column is integer on every pandas version. A leftover non-numeric label
# makes the cast raise instead of slipping through as a string.
data['3D_available'] = data['3D_available'].replace(dic).astype(int)
data.head()

Unnamed: 0,Marketing expense,Production expense,Multiplex coverage,Budget,Movie_length,Lead_ Actor_Rating,Lead_Actress_rating,Director_rating,Producer_rating,Critic_rating,Trailer_views,3D_available,Time_taken,Twitter_hastags,Genre,Avg_age_actors,Num_multiplex,Collection
0,20.1264,59.62,0.462,36524.125,138.7,7.825,8.095,7.91,7.995,7.94,527367,1,109.6,223.84,Thriller,23,494,48000
1,20.5462,69.14,0.531,35668.655,152.4,7.505,7.65,7.44,7.47,7.44,494055,0,146.64,243.456,Drama,42,462,43200
2,20.5458,69.14,0.531,39912.675,134.6,7.485,7.57,7.495,7.515,7.44,547051,0,147.88,2022.4,Comedy,38,458,69400
3,20.6474,59.36,0.542,38873.89,119.3,6.895,7.035,6.92,7.02,8.26,516279,1,185.36,225.344,Drama,45,472,66800
4,21.381,59.36,0.542,39701.585,127.7,6.92,7.07,6.815,7.07,8.26,531448,0,176.48,225.792,Drama,55,395,72400


In [9]:
# One-hot encode Genre into one indicator column per category.
# dtype=int pins the indicators to 0/1 integers; without it pandas >= 2.0
# returns True/False columns instead.
data_ohe = pd.get_dummies(data['Genre'], dtype=int)
data_ohe

Unnamed: 0,Action,Comedy,Drama,Thriller
0,0,0,0,1
1,0,0,1,0
2,0,1,0,0
3,0,0,1,0
4,0,0,1,0
...,...,...,...,...
501,1,0,0,0
502,1,0,0,0
503,0,1,0,0
504,0,1,0,0


In [10]:
# Append the genre indicator columns to the main frame.
# Indicator columns left over from a previous run of this cell are dropped first,
# so re-running the cell does not create duplicate column names.
data = pd.concat(
    [data.drop(columns=data_ohe.columns, errors='ignore'), data_ohe],
    axis=1,
)
data.head()

Unnamed: 0,Marketing expense,Production expense,Multiplex coverage,Budget,Movie_length,Lead_ Actor_Rating,Lead_Actress_rating,Director_rating,Producer_rating,Critic_rating,...,Time_taken,Twitter_hastags,Genre,Avg_age_actors,Num_multiplex,Collection,Action,Comedy,Drama,Thriller
0,20.1264,59.62,0.462,36524.125,138.7,7.825,8.095,7.91,7.995,7.94,...,109.6,223.84,Thriller,23,494,48000,0,0,0,1
1,20.5462,69.14,0.531,35668.655,152.4,7.505,7.65,7.44,7.47,7.44,...,146.64,243.456,Drama,42,462,43200,0,0,1,0
2,20.5458,69.14,0.531,39912.675,134.6,7.485,7.57,7.495,7.515,7.44,...,147.88,2022.4,Comedy,38,458,69400,0,1,0,0
3,20.6474,59.36,0.542,38873.89,119.3,6.895,7.035,6.92,7.02,8.26,...,185.36,225.344,Drama,45,472,66800,0,0,1,0
4,21.381,59.36,0.542,39701.585,127.7,6.92,7.07,6.815,7.07,8.26,...,176.48,225.792,Drama,55,395,72400,0,0,1,0


In [11]:
# Remove the original text Genre column and preview the resulting frame.
data = data.drop(columns=['Genre'])
data.head()

Unnamed: 0,Marketing expense,Production expense,Multiplex coverage,Budget,Movie_length,Lead_ Actor_Rating,Lead_Actress_rating,Director_rating,Producer_rating,Critic_rating,...,3D_available,Time_taken,Twitter_hastags,Avg_age_actors,Num_multiplex,Collection,Action,Comedy,Drama,Thriller
0,20.1264,59.62,0.462,36524.125,138.7,7.825,8.095,7.91,7.995,7.94,...,1,109.6,223.84,23,494,48000,0,0,0,1
1,20.5462,69.14,0.531,35668.655,152.4,7.505,7.65,7.44,7.47,7.44,...,0,146.64,243.456,42,462,43200,0,0,1,0
2,20.5458,69.14,0.531,39912.675,134.6,7.485,7.57,7.495,7.515,7.44,...,0,147.88,2022.4,38,458,69400,0,1,0,0
3,20.6474,59.36,0.542,38873.89,119.3,6.895,7.035,6.92,7.02,8.26,...,1,185.36,225.344,45,472,66800,0,0,1,0
4,21.381,59.36,0.542,39701.585,127.7,6.92,7.07,6.815,7.07,8.26,...,0,176.48,225.792,55,395,72400,0,0,1,0
