In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import os

### Loading the Dataset

In [2]:
# Read the cinema ticket dataset from the local data/ folder and preview the first rows.
csv_path = os.path.join('data', 'cinemaTicket_Ref.csv')
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,film_code,cinema_code,total_sales,tickets_sold,tickets_out,show_time,occu_perc,ticket_price,ticket_use,capacity,date,month,quarter,day
0,1492,304,3900000,26,0,4,4.26,150000.0,26,610.328638,2018-05-05,5,2,5
1,1492,352,3360000,42,0,5,8.08,80000.0,42,519.80198,2018-05-05,5,2,5
2,1492,489,2560000,32,0,4,20.0,80000.0,32,160.0,2018-05-05,5,2,5
3,1492,429,1200000,12,0,1,11.01,100000.0,12,108.991826,2018-05-05,5,2,5
4,1492,524,1200000,15,0,3,16.67,80000.0,15,89.982004,2018-05-05,5,2,5


In [3]:
# Print each column's dtype and non-null count to spot missing values and type issues.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142524 entries, 0 to 142523
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   film_code     142524 non-null  int64  
 1   cinema_code   142524 non-null  int64  
 2   total_sales   142524 non-null  int64  
 3   tickets_sold  142524 non-null  int64  
 4   tickets_out   142524 non-null  int64  
 5   show_time     142524 non-null  int64  
 6   occu_perc     142399 non-null  float64
 7   ticket_price  142524 non-null  float64
 8   ticket_use    142524 non-null  int64  
 9   capacity      142399 non-null  float64
 10  date          142524 non-null  object 
 11  month         142524 non-null  int64  
 12  quarter       142524 non-null  int64  
 13  day           142524 non-null  int64  
dtypes: float64(3), int64(10), object(1)
memory usage: 15.2+ MB


### Data Transformation

In [4]:
# Count missing values per column.
df.isna().sum()

film_code         0
cinema_code       0
total_sales       0
tickets_sold      0
tickets_out       0
show_time         0
occu_perc       125
ticket_price      0
ticket_use        0
capacity        125
date              0
month             0
quarter           0
day               0
dtype: int64

In [5]:
# Tally rows that exactly repeat an earlier row (True) versus rows that do not (False).
df.duplicated().value_counts()

False    142418
True        106
Name: count, dtype: int64

In [6]:
# Clean the raw frame in a single chain:
#   1. parse `date` into datetimes,
#   2. drop the columns that are not used downstream (the original month/day columns
#      are re-derived from the parsed date below),
#   3. drop rows with missing values,
#   4. cast the count columns to int,
#   5. derive day / month / year from `date`, then drop `date` itself,
#   6. drop rows that became exact duplicates.
df = (
    df.assign(date=pd.to_datetime(df['date']))
    .drop(columns=['total_sales', 'tickets_out', 'month', 'quarter', 'day', 'ticket_use'])
    .dropna()
    .astype({'tickets_sold': int, 'show_time': int})
    .assign(
        day=lambda frame: frame['date'].dt.day.astype(int),
        month=lambda frame: frame['date'].dt.month.astype(int),
        year=lambda frame: frame['date'].dt.year.astype(int),
    )
    .drop(columns=['date'])
    .drop_duplicates()
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 142295 entries, 0 to 142523
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   film_code     142295 non-null  int64  
 1   cinema_code   142295 non-null  int64  
 2   tickets_sold  142295 non-null  int64  
 3   show_time     142295 non-null  int64  
 4   occu_perc     142295 non-null  float64
 5   ticket_price  142295 non-null  float64
 6   capacity      142295 non-null  float64
 7   day           142295 non-null  int64  
 8   month         142295 non-null  int64  
 9   year          142295 non-null  int64  
dtypes: float64(3), int64(7)
memory usage: 11.9 MB


### Train Test Split

In [7]:
from sklearn.model_selection import train_test_split

# NOTE(review): in the previewed rows occu_perc equals 100 * tickets_sold / capacity
# (e.g. 32 / 160.0 -> 20.0), so keeping both occu_perc and capacity as features appears
# to leak the target -- confirm, and consider dropping occu_perc before trusting scores.
target_column = 'tickets_sold'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits
print(*(part.shape for part in splits))

(113836, 9) (28459, 9) (113836,) (28459,)


### Model Imports, Model Training and Model Performance Evaluation

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

In [13]:
# Candidate regressors keyed by display name (the same keys index `param_grid`).
# The two ensemble models draw random numbers while fitting, so they are seeded
# to make the reported scores reproducible across kernel restarts.
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'KNeighborsRegressor': KNeighborsRegressor(),
}

In [14]:
# Hyperparameter search space per model name (keys match the `models` dict).
# An empty dict means a single candidate using the estimator's own defaults.
param_grid = {
    'LinearRegression': dict(),
    'Ridge': dict(),
    'Lasso': dict(),
    'GradientBoostingRegressor': dict(
        n_estimators=[50, 100, 200],
        learning_rate=[0.1, 0.01, 0.001],
    ),
    'RandomForestRegressor': dict(
        n_estimators=[50, 100, 200],
        max_depth=[None, 5, 10],
    ),
    'KNeighborsRegressor': dict(n_neighbors=[3, 5, 7]),
}

In [15]:
# Tune every candidate with a 5-fold cross-validated grid search on the training split,
# then score the refitted best estimator on the held-out test split.
report = {}      # model name -> best hyperparameters found by the grid search
r2_scores = []   # (model name, test-set R^2) tuples, in the order of `models`
for name, model in models.items():
    para = param_grid[name]
    # n_jobs=-1 runs the cross-validation fits on all available cores.
    gs = GridSearchCV(model, para, cv=5, n_jobs=-1)
    gs.fit(X_train, y_train)
    report[name] = gs.best_params_
    score = (name, r2_score(y_test, gs.best_estimator_.predict(X_test)))
    print(score)
    r2_scores.append(score)

('LinearRegression', 0.5471754409999273)
('Ridge', 0.5471754522776349)
('Lasso', 0.5472020605600052)
('GradientBoostingRegressor', 0.9973145130215344)
('RandomForestRegressor', 0.9994600603614587)
('KNeighborsRegressor', 0.6900407752200658)


In [16]:
# Show the best hyperparameters the grid search selected for each model.
report

{'LinearRegression': {},
 'Ridge': {},
 'Lasso': {},
 'GradientBoostingRegressor': {'learning_rate': 0.1, 'n_estimators': 200},
 'RandomForestRegressor': {'max_depth': None, 'n_estimators': 200},
 'KNeighborsRegressor': {'n_neighbors': 3}}