# Combined Datasets

In [15]:
# Importing dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Data

Data explanation placeholder

(Talk about original two notebooks?)

In [3]:
# Load the movie and economics tables from the local Resources folder
df_movies, df_economics = (
    pd.read_csv(csv_path)
    for csv_path in ("./Resources/movies_data.csv", "./Resources/economics_data.csv")
)

### Movie Data

### Economics Data

# Combining Data

In [4]:
# Assemble a release date from the separate year / month / day columns
release_dates = pd.to_datetime(
    df_movies[['released_year', 'released_month', 'released_day']].rename(
        columns={
            'released_year': 'year',
            'released_month': 'month',
            'released_day': 'day',
        }
    )
)

# Use that date as the index and order the rows chronologically
df_movies = (
    df_movies
    .assign(Date=release_dates)
    .set_index('Date')
    .sort_index()
)

In [5]:
# Derive integer `Year` / `Month` merge keys from the economics `Date` strings.
# Characters 0-3 are read as the year and 5-6 as the month, so this assumes
# the strings start with a "YYYY-MM" style prefix.
df_economics['Year'] = df_economics['Date'].str[:4].astype(int)
df_economics['Month'] = df_economics['Date'].str[5:7].astype(int)

# Give the movie columns the same key names so both tables can be merged on them
df_movies = df_movies.rename(
    columns={'released_year': 'Year', 'released_month': 'Month'}
)

In [6]:
# Row counts of each table before they are merged
print(f'Total ecomonic records: {len(df_economics)}')
print(f'Total movie records: {len(df_movies)}')

Total ecomonic records: 507
Total movie records: 15363


In [7]:
# Left-merge the movies onto the economics rows using the shared `Year` / `Month` keys.
# Every economics row is kept; movies without a matching economics month are left out.
# Because the merge is on columns, the `Date` index of `df_movies` is not carried over;
# the `Date` column in the result is the one from `df_economics`.
df_combined = df_economics.merge(df_movies, how='left', on=['Year', 'Month'])

# Row count after the merge
print(f'Total records: {len(df_combined)}')

Total records: 12188


# EDA

In [8]:
# Build the modelling `Target` by joining the critical, financial and
# economic-climate labels into one space-separated string per row
df_combined['Target'] = (
    df_combined['critical_success']
    + ' '
    + df_combined['financial_success']
    + ' '
    + df_combined['Economic Climate']
)

In [21]:
# Frequency of each combined `Target` label, most common first
df_combined.loc[:, 'Target'].value_counts()

Target
panned failure Lean to Bad                                    2200
well liked failure Lean to Bad                                1897
well liked failure Comfortable to Good                        1316
panned failure Comfortable to Good                            1195
alright failure Lean to Bad                                   1050
alright failure Comfortable to Good                            753
well liked excellent returns Lean to Bad                       709
well liked excellent returns Comfortable to Good               527
critical success failure Lean to Bad                           477
well liked extraordinary returns Lean to Bad                   292
critical success failure Comfortable to Good                   251
well liked modest returns Lean to Bad                          221
well liked extraordinary returns Comfortable to Good           201
well liked moderate returns Lean to Bad                        193
well liked modest returns Comfortable to Good          

In [9]:
# Columns no longer needed once `Target` exists: the three labels it was built
# from, the merge keys, the movie id and the release day
cols_to_drop = [
    'Economic Climate', 'Year', 'Month', 'id',
    'critical_success', 'financial_success', 'released_day',
]

# Rebind to a frame without those columns
df_combined = df_combined.drop(cols_to_drop, axis=1)

In [11]:
# Inspect the dtype of every column in the combined frame
df_combined.dtypes

Date                                              object
CCI Value                                        float64
CCI Rolling Mean                                 float64
CCI Rolling Percent Change                       float64
CPI Value                                        float64
CPI Rolling Mean                                 float64
CPI Rolling Percent Change                       float64
Unemployment Rate (%)                            float64
Unemployment Rate (%) Rolling Mean               float64
Unemployment Rate Rolling Percent Change         float64
CCI Rolling Percent Change Flag                   object
CPI Rolling Percent Change Flag                   object
Unemployment Rate Rolling Percent Change Flag     object
title                                             object
vote_average                                     float64
vote_count                                       float64
status                                            object
release_date                   

# Train Test Splitting

In [18]:
# Numeric features that are standardised in the scaling step
col_to_scale = [
    'CCI Value',
    'CCI Rolling Mean',
    'CCI Rolling Percent Change',
    'CPI Value',
    'CPI Rolling Mean',
    'CPI Rolling Percent Change',
    'Unemployment Rate (%)',
    'Unemployment Rate (%) Rolling Mean',
    'Unemployment Rate Rolling Percent Change',
    'vote_average',
    'vote_count',
    'revenue',
    'runtime',
    'budget',
    'popularity',
    'roi',
]

# Text / categorical features that are one-hot encoded in the encoding step
col_to_encode = [
    'Date',
    'CCI Rolling Percent Change Flag',
    'CPI Rolling Percent Change Flag',
    'Unemployment Rate Rolling Percent Change Flag',
    'title',
    'status',
    'release_date',
    'original_language',
    'original_title',
    'genres',
    'production_companies',
    'production_countries',
    'spoken_languages',
    'cast',
    'director',
    'writers',
    'producers',
]

# NOTE(review): 'revenue', 'roi' and 'vote_average' stay in X while `Target` is built
# from `financial_success` / `critical_success`; if those labels were derived from
# these columns this leaks the target into the features -- confirm.
# NOTE(review): rows whose `Target` is NaN (if any) are not filtered out here -- confirm.

# Separate the label from the features
y = df_combined['Target']
X = df_combined.drop(columns=['Target'])

# Hold out a test set; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=13,
)

# Scaling and Encoding

In [19]:
# Fit the scaler on the training split only, so the test split does not
# influence the learned means / standard deviations.
# `fit` returns the scaler itself, not scaled data, so the result is bound to
# `scaler` rather than to a name that suggests a transformed matrix.
scaler = StandardScaler().fit(X_train[col_to_scale])

In [20]:
# Apply the already-fitted scaler to the numeric columns of each split
X_train_scaled, X_test_scaled = (
    scaler.transform(split[col_to_scale]) for split in (X_train, X_test)
)

In [None]:
# One-hot encode the categorical features. The encoder is fitted on the
# training split only so the test split does not influence the learned categories.
# `sparse_output` replaced the `sparse` argument in scikit-learn 1.2; on an
# older release pass `sparse=False` instead.
# With drop='first' and handle_unknown='ignore', a category that only appears
# in the test split is encoded as all zeros, the same as the dropped category.
# NOTE(review): `col_to_encode` includes columns such as 'title' and 'Date'
# that presumably hold near-unique values; densely one-hot encoding them could
# produce a very wide matrix -- confirm this is intended.
encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
encoder.fit(X_train[col_to_encode])

# Column labels for the one-hot output (replaces the removed `get_feature_names`)
encoded_feature_names = encoder.get_feature_names_out()

# `transform` already returns a 2-D array (rows x one-hot columns), so it is
# wrapped as-is. Reusing each split's index keeps the rows aligned with the
# original (shuffled) rows when the frames are concatenated.
X_train_encoded = pd.DataFrame(
    encoder.transform(X_train[col_to_encode]),
    columns=encoded_feature_names,
    index=X_train.index,
)
X_test_encoded = pd.DataFrame(
    encoder.transform(X_test[col_to_encode]),
    columns=encoded_feature_names,
    index=X_test.index,
)

# The target needs its own encoder: the feature encoder above was fitted on
# `col_to_encode` and cannot transform `y`. Labels seen only in the test split
# are mapped to -1 instead of raising. Each result has shape (n_rows, 1).
# NOTE(review): missing `Target` values, if present, are not handled here -- confirm.
target_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
y_train_encoded = target_encoder.fit_transform(y_train.to_frame())
y_test_encoded = target_encoder.transform(y_test.to_frame())

# Combine the scaled numeric columns with the one-hot columns. The scaled
# values are wrapped in a DataFrame first because `pd.concat` does not accept
# raw numpy arrays.
# NOTE: this rebinds `X_train` / `X_test`, so the split cell must be re-run
# before this cell (or the scaling cells) can be run again.
X_train = pd.concat(
    [
        pd.DataFrame(X_train_scaled, columns=col_to_scale, index=X_train.index),
        X_train_encoded,
    ],
    axis=1,
)
X_test = pd.concat(
    [
        pd.DataFrame(X_test_scaled, columns=col_to_scale, index=X_test.index),
        X_test_encoded,
    ],
    axis=1,
)

# Modeling

Playtime!!

# Eric's Space

# Funda's Space

# Kalvin's Space

# Odele's Space

# Peta's Space

# Vadim's Space

# Findings

# Additional