Loading of the Dataset

In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

#Load the dataset
tv_show_data = pd.read_csv("TV_show_data.csv")

#Print the dataframe
print(tv_show_data.head())
print(tv_show_data.shape)


                         Name  \
0                Breaking Bad   
1                     Firefly   
2  Avatar: The Last Airbender   
3                    Sherlock   
4             Attack on Titan   

                                             Summary  Average Runtime  \
0  Breaking Bad follows protagonist Walter White,...             60.0   
1  Five hundred years in the future, a renegade c...             64.0   
2  Water. Earth. Fire. Air. Only the Avatar was t...             30.0   
3  Sherlock Holmes and Dr. John Watson's adventur...             90.0   
4  Known in Japan as Shingeki no Kyojin, many yea...             26.0   

     End Date Premiere Date                                       Genres  \
0  2008-01-20    2019-10-11               ['Drama', 'Crime', 'Thriller']   
1  2002-09-20    2002-12-20    ['Drama', 'Adventure', 'Science-Fiction']   
2  2005-02-21    2008-07-19           ['Action', 'Adventure', 'Fantasy']   
3  2010-07-25    2017-01-15                ['Drama', 'Crim

Preprocessing of the Dataset

In [30]:
# Convert date columns to datetime
tv_show_data['Premiere Date'] = pd.to_datetime(tv_show_data['Premiere Date'])
tv_show_data['End Date'] = pd.to_datetime(tv_show_data['End Date'])

# Extract useful features from dates
tv_show_data['Premiere Year'] = tv_show_data['Premiere Date'].dt.year
tv_show_data['End Year'] = tv_show_data['End Date'].dt.year

# Fill missing values in End Year with the latest year in the dataset
tv_show_data['End Year'].fillna(tv_show_data['End Year'].max(), inplace=True)

# Calculate the show's duration in years
tv_show_data['Duration Years'] = tv_show_data['End Year'] - tv_show_data['Premiere Year']

# Encode categorical variables
label_encoders = {}
for column in ['Genres', 'Type', 'Language', 'Network']:
    label_encoders[column] = LabelEncoder()
    tv_show_data[column] = label_encoders[column].fit_transform(tv_show_data[column].astype(str))

# Handle missing values for average runtime
imputer = SimpleImputer(strategy='mean')
tv_show_data['Average Runtime'] = imputer.fit_transform(tv_show_data[['Average Runtime']])

# Select features and target
features = ['Average Runtime', 'Genres', 'Type', 'Language', 'Network', 'Premiere Year', 'Duration Years']
target = 'Rating'

# Drop rows where target is NaN
tv_show_data.dropna(subset=[target], inplace=True)

# Fill any remaining NaN values in the features
imputer = SimpleImputer(strategy='mean')
tv_show_data[features] = imputer.fit_transform(tv_show_data[features])

# Define X and y
X = tv_show_data[features]
y = tv_show_data[target]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Linear Regression Model

In [31]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Linear Regression Model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
print(f"Linear Regression MSE: {mse_lr}")


Linear Regression MSE: 0.6499449996719135


Neural Network Model

In [32]:
from sklearn.neural_network import MLPRegressor
nn_model = MLPRegressor(hidden_layer_sizes=(50, 50), max_iter=500, random_state=42)
nn_model.fit(X_train, y_train)
y_pred_nn = nn_model.predict(X_test)
mse_nn = mean_squared_error(y_test, y_pred_nn)
print(f"Neural Network MSE: {mse_nn}")


Neural Network MSE: 0.7258296376881096


Time Series Forecasting model

In [33]:
# Aggregate numeric data by year to analyze trends over time
numeric_columns = tv_show_data.select_dtypes(include=['number']).columns
numeric_columns = numeric_columns.drop('Premiere Year')  # Exclude 'Premiere Year' from numeric columns
tv_show_data_by_year = tv_show_data.groupby('Premiere Year')[numeric_columns].mean().reset_index()

# Splitting data into training and test sets
X_train_ts = tv_show_data_by_year['Premiere Year'].values[:-5].reshape(-1, 1)
y_train_ts = tv_show_data_by_year['Rating'].values[:-5]
X_test_ts = tv_show_data_by_year['Premiere Year'].values[-5:].reshape(-1, 1)
y_test_ts = tv_show_data_by_year['Rating'].values[-5:]

# Linear Regression for Time Series Forecasting
lr_model_ts = LinearRegression()
lr_model_ts.fit(X_train_ts, y_train_ts)
y_pred_ts = lr_model_ts.predict(X_test_ts)
mse_ts = mean_squared_error(y_test_ts, y_pred_ts)
print(f"Time Series Forecasting MSE: {mse_ts}")

Time Series Forecasting MSE: 0.036541795551928644


Model Comparison

In [34]:
model_performance = {
        'Linear Regression': mse_lr,
        'Neural Network': mse_nn,
        'Time Series Forecasting': mse_ts
    }

best_model = min(model_performance, key=model_performance.get)
model_performance, best_model
#print("The model that performs the best is ",best_model)

({'Linear Regression': 0.6499449996719135,
  'Neural Network': 0.7258296376881096,
  'Time Series Forecasting': 0.036541795551928644},
 'Time Series Forecasting')