Libraries & Reading the Data

In [150]:
import pandas as pd
import numpy as np
import statistics
import regex as re

In [151]:
data = pd.read_csv("data/All_Streaming_Shows.csv")

In [152]:
data.head()

Unnamed: 0,Series Title,Year Released,Content Rating,IMDB Rating,R Rating,Genre,Description,No of Seasons,Streaming Platform
0,Breaking Bad,2008,18+,9.5,100,"Crime,Drama","When Walter White, a New Mexico chemistry teac...",5Seasons,Netflix
1,Game of Thrones,2011,18+,9.3,99,"Action & Adventure,Drama",Seven noble families fight for control of the ...,8Seasons,"HBO MAX,HBO"
2,Rick and Morty,2013,18+,9.2,97,"Animation,Comedy",Rick is a mentally-unbalanced but scientifical...,4Seasons,"Free Services,HBO MAX,Hulu"
3,Stranger Things,2016,16+,8.8,96,"Drama,Fantasy","When a young boy vanishes, a small town uncove...",3Seasons,Netflix
4,The Boys,2019,18+,8.7,95,"Action & Adventure,Comedy",A group of vigilantes known informally as “The...,2Seasons,Prime Video


Data Preparation

In [153]:
data.describe(include=object)

Unnamed: 0,Series Title,Content Rating,Genre,Description,No of Seasons,Streaming Platform
count,12353,7232,12353,12353,12353,10370
unique,12109,5,858,11875,94,530
top,Kingdom,16+,-1,-1,1Season,Netflix
freq,4,2581,479,479,5204,1427


In [154]:
#IMDB Rating & R Rating are already numeric variables, so there is no need to change their type
data.dtypes

Series Title           object
Year Released           int64
Content Rating         object
IMDB Rating           float64
R Rating                int64
Genre                  object
Description            object
No of Seasons          object
Streaming Platform     object
dtype: object

In [155]:
#There are multiple missing values in Content Rating and IMDB Rating
#There are some weird values in R Rating and Genre columns ("-1")
empty_data=data.isnull().any(axis=1)
null_rows=data[empty_data]
print(null_rows)

                            Series Title  Year Released Content Rating  \
186                               Castle           2009            16+   
250                             Gomorrah           2014            18+   
337                       Masters of Sex           2013            18+   
345                                Louie           2010            18+   
350                            Continuum           2012            16+   
...                                  ...            ...            ...   
12348  A Fishing Story with Ronnie Green           2017            NaN   
12349                  CMT Most Shocking           2003            NaN   
12350   NHL Road to the Outdoor Classics           2016            NaN   
12351                         Addy Media           2018            NaN   
12352             My Dream Derelict Home           2014            NaN   

       IMDB Rating  R Rating                     Genre  \
186            8.1        83              Comedy,Crim

In [156]:
data['R Rating'].unique()

array([100,  99,  97,  96,  95,  94,  93,  92,  91,  90,  89,  88,  87,
        86,  85,  84,  83,  82,  81,  80,  79,  78,  77,  76,  75,  74,
        73,  72,  71,  70,  69,  68,  67,  66,  65,  64,  63,  62,  61,
        60,  59,  58,  57,  56,  55,  54,  53,  52,  51,  50,  49,  48,
        47,  46,  45,  44,  43,  42,  41,  40,  39,  38,  37,  36,  35,
        34,  33,  32,  31,  30,  29,  28,  27,  26,  25,  24,  23,  22,
        21,  20,  19,  18,  17,  16,  15,  13,  10,  -1])

In [157]:
#We have 9371 rows with normal values
len(data[(data["Genre"]!="-1") & (data["R Rating"]!=-1) & (data["Description"]!="-1") & (data["Streaming Platform"])])


9371

In [158]:
#We filter out "-1" values from the dataset
data=data[(data["Genre"]!="-1") & (data["R Rating"]!=-1) & (data["Description"]!="-1") & (data["Streaming Platform"])]

In [159]:
#We replace empty values in "IMDB Rating" with their mode values based on the R Rating score
data["IMDB Rating"]=data.groupby("R Rating")["IMDB Rating"].transform(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else np.nan))

In [160]:
#We drop NaN values from "IMDB Rating"
data.dropna(subset=["IMDB Rating"], inplace=True)

In [161]:
data["Genre"].unique()

array(['Crime,Drama', 'Action & Adventure,Drama', 'Animation,Comedy',
       'Drama,Fantasy', 'Action & Adventure,Comedy', 'Drama,History',
       'Action & Adventure,Crime', 'Action & Adventure,Animation',
       'Comedy,Romance', 'Comedy,2005', 'Comedy,Crime',
       'Action & Adventure,Anime', 'Drama,Science-Fiction',
       'Drama,Mystery', 'Fantasy,Crime', 'Thriller,Action & Adventure',
       'Drama,Thriller', 'Action & Adventure,Science-Fiction',
       'Drama,Horror', 'Comedy,LGBTQ', 'Comedy,2009', 'Drama,2007',
       'Comedy,Drama', 'Drama,Comedy', 'Documentary,Biography',
       'Comedy,Stand-up & Talk', 'Horror,Drama',
       'Drama,Action & Adventure', 'Drama,Romance', 'Drama,Sport',
       'Thriller,Biography', 'Science-Fiction,Animation', 'Comedy,2003',
       'Action & Adventure,Thriller', 'Thriller,LGBTQ', 'Comedy,2014',
       'Drama,2018', 'Drama,LGBTQ', 'Drama,Crime',
       'Action & Adventure,Fantasy', 'Comedy,2000', 'Biography,Drama',
       'Action & Adventure,H

In [162]:
#we make a set of unique streaming platforms
all_platforms=[]
for i in data["Streaming Platform"]:
    a=i.split(",")
    all_platforms.extend(a)

all_platforms=set(all_platforms)
print(all_platforms)


{'HBO', 'BBC America', 'Adult Swim', 'AcornTV', 'AMC Premiere', 'Netflix', 'TruTV', 'FYI', 'Hulu', 'TVLand', 'History', 'TLC', 'FX', 'Epix', 'Syfy', 'TBS', 'NatGeo', 'FOX', 'DIY', 'Food Network', 'Travel Channel', 'Hallmark', 'CNBC', 'BET+', 'ABC', 'Peacock Premium', 'CBS All Access', 'IndieFlix', 'Apple TV+', 'DC Universe', 'Free Services', 'Cartoon Network', 'Viceland', 'USA', 'Showtime', 'AMC', 'IFC', 'Hoopla', 'YouTube Premium', 'HGTV', 'NBC', 'TNT', 'Starz', 'Funimation', 'Disney', 'Disney+', 'VH1', 'Comedy Central', 'HBO MAX', 'Cinemax', 'MTV', 'Nick', 'Crunchyroll', 'BritBox', 'Science', 'BET', 'Bravo', 'Lifetime', 'A&E', 'Hallmark Movies Now', 'fuboTV', 'Shudder', 'Prime Video', 'Sundance'}


In [163]:
#Cleaning "Genre" column & creating dummy variables to replace different genres
for i in all_platforms:
    data["Genre"]=data["Genre"].str.replace(i,"")

data["Genre"]=data["Genre"].str.replace(r'\b\d{4}\b', '', regex=True)
data["Genre"]=data["Genre"].str.replace(r'\+', '', regex=True)
data["Genre"]=data["Genre"].apply(lambda x:"Unknown" if x=="-1" else x)
data["Genre"]=data["Genre"].str.replace(r'^,|(?<=,),$|,$', '', regex=True)
data["Genre"]=data["Genre"].str.strip()
data["Genre"]=data["Genre"].apply(lambda x:"Unknown" if x.strip()=="" else x)

data = data.join(
    data["Genre"].str.get_dummies(sep=',')
)

data=data.drop(columns=["Genre"])
data.head()

Unnamed: 0,Series Title,Year Released,Content Rating,IMDB Rating,R Rating,Description,No of Seasons,Streaming Platform,-Fiction,Action & Adventure,...,MAX,Musical,Mystery,Reality,Romance,Sport,Stand-up & Talk,Thriller,Travel,Unknown
0,Breaking Bad,2008,18+,9.5,100,"When Walter White, a New Mexico chemistry teac...",5Seasons,Netflix,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Game of Thrones,2011,18+,9.3,99,Seven noble families fight for control of the ...,8Seasons,"HBO MAX,HBO",0,1,...,0,0,0,0,0,0,0,0,0,0
2,Rick and Morty,2013,18+,9.2,97,Rick is a mentally-unbalanced but scientifical...,4Seasons,"Free Services,HBO MAX,Hulu",0,0,...,0,0,0,0,0,0,0,0,0,0
3,Stranger Things,2016,16+,8.8,96,"When a young boy vanishes, a small town uncove...",3Seasons,Netflix,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The Boys,2019,18+,8.7,95,A group of vigilantes known informally as “The...,2Seasons,Prime Video,0,1,...,0,0,0,0,0,0,0,0,0,0


In [164]:
#Content Rating variable
data["Content Rating"].unique()

array(['18+', '16+', '7+', 'all', nan, '13+'], dtype=object)

In [165]:
data["Content Rating"]=data["Content Rating"].str.replace("+", "")
data["Content Rating"]=data["Content Rating"]=data["Content Rating"].transform(lambda x: x.fillna("0"))
data["Content Rating"]=data["Content Rating"].str.replace("all", "0")
data["Content Rating"]=data["Content Rating"].apply(lambda x:"R Rated" if x=="18" else "Not R Rated")

In [166]:
data=data+pd.get_dummies(data["Content Rating"])
list(data.columns.values)

['-Fiction',
 'Action & Adventure',
 'Animation',
 'Anime',
 'Biography',
 'Children',
 'Comedy',
 'Content Rating',
 'Crime',
 'Cult',
 'Description',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Food',
 'Game Show',
 'Home & Garden',
 'Horror',
 'IMDB Rating',
 'LGBTQ',
 'MAX',
 'Musical',
 'Mystery',
 'No of Seasons',
 'Not R Rated',
 'R Rated',
 'R Rating',
 'Reality',
 'Romance',
 'Series Title',
 'Sport',
 'Stand-up & Talk',
 'Streaming Platform',
 'Thriller',
 'Travel',
 'Unknown',
 'Year Released']