In [2]:
import pandas as pd

In [3]:
# IMDb ratings table: supplies averageRating and numVotes, keyed by tconst.
ratings = pd.read_csv(
    'data/imdb/title.ratings.tsv.gz',
    sep='\t',
    compression='gzip',
    low_memory=False,
)

In [4]:
# IMDb title metadata (titleType, titles, years, runtime, genres), keyed by tconst.
# NOTE(review): IMDb titles may contain literal double quotes; with pandas' default
# quoting this can mis-parse rows. Consider quoting=csv.QUOTE_NONE and re-check the
# row counts below. TODO confirm against the raw file.
basics = pd.read_csv(
    'data/imdb/title.basics.tsv.gz',
    sep='\t',
    compression='gzip',
    low_memory=False,
)

In [5]:
# Attach ratings to title metadata (only titles present in both), then keep TV series.
merged_base = (
    ratings
    .merge(basics, on='tconst', how='inner')
    .loc[lambda frame: frame['titleType'] == 'tvSeries']
)

In [6]:
# Work on a copy so merged_base stays available for the ad-hoc lookups further down.
merged_df = merged_base.copy()

# A '\N' end year is replaced by the start year; anything that still is not a
# number becomes NaN and is dropped by the comparison below.
end_year_raw = merged_df['endYear'].mask(merged_df['endYear'] == '\\N', merged_df['startYear'])
merged_df['endYear'] = pd.to_numeric(end_year_raw, errors='coerce')

# Keep series whose (possibly substituted) end year is after 2015.
ended_after_2015 = merged_df['endYear'] > 2015
merged_df = merged_df.loc[ended_after_2015]

In [7]:
# Keep only series with more than 1000 votes.
has_enough_votes = merged_df['numVotes'] > 1000
merged_df1 = merged_df.loc[has_enough_votes]
# Prints the distinct per-column non-null counts (a single value when no column has gaps).
non_null_counts = merged_df1.count().unique()
print(non_null_counts)

[5499]


In [8]:
# Further restrict to series rated strictly above 6.0.
is_well_rated = merged_df1['averageRating'] > 6.0
merged_df1 = merged_df1.loc[is_well_rated]
# Prints the distinct per-column non-null counts (a single value when no column has gaps).
non_null_counts = merged_df1.count().unique()
print(non_null_counts)

[4859]


In [9]:
# Genres that disqualify a series from the shortlist.
EXCLUDED_GENRES = frozenset({
    'Sci-Fi', 'Fantasy', 'Horror', 'Action', 'Animation', 'Adventure',
    'Reality-TV', 'Game-Show', 'Talk-Show', 'News', 'Documentary',
})


def filter_genres(genres, excluded=EXCLUDED_GENRES):
    """Return 'OUT' if any listed genre is excluded, otherwise return the input unchanged.

    Parameters
    ----------
    genres : str
        Comma-separated genre names, e.g. 'Comedy,Drama'. Matching is exact and
        case-sensitive on each comma-separated token (no whitespace stripping).
    excluded : iterable of str, optional
        Genre names that mark a title for removal. Defaults to EXCLUDED_GENRES.

    Returns
    -------
    The sentinel string 'OUT' when at least one token is in `excluded`; otherwise
    the original `genres` value.

    Notes
    -----
    Non-string values (None, NaN) are returned unchanged instead of raising
    AttributeError, so rows with unknown genres are kept, the same way the
    '\\N' placeholder string is kept.
    """
    if not isinstance(genres, str):
        # Missing value: nothing to split, leave the decision to the caller.
        return genres
    if set(excluded).isdisjoint(genres.split(',')):
        return genres
    return 'OUT'

In [10]:
# Label every row once: 'OUT' for excluded genres, the original genre string otherwise.
genre_labels = merged_df1['genres'].apply(filter_genres)

merged_df2 = merged_df1.copy()
merged_df2['genres'] = genre_labels
# Drop the rows flagged with the 'OUT' sentinel.
merged_df2 = merged_df2[genre_labels != 'OUT']

In [11]:
# Summary statistics of the numeric columns after the vote, rating and genre filters.
merged_df2.describe()

Unnamed: 0,averageRating,numVotes,endYear
count,2590.0,2590.0,2590.0
mean,7.43166,16462.19,2020.893436
std,0.673135,55469.55,2.588159
min,6.1,1001.0,2016.0
25%,6.9,1931.75,2019.0
50%,7.4,4034.5,2021.0
75%,7.9,10320.25,2023.0
max,9.5,1056479.0,2026.0


In [12]:
# Keep series whose primary title equals the original title, highest rated first.
same_title = merged_df2['primaryTitle'] == merged_df2['originalTitle']
sorted_df = merged_df2.loc[same_title].sort_values(by='averageRating', ascending=False)
# head(10000) is larger than the filtered frame, so this displays all of it
# (the notebook truncates the rendering).
sorted_df.head(10000) # 8.2 Exit

Unnamed: 0,tconst,averageRating,numVotes,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
547309,tt11207734,9.5,2090,tvSeries,Londra Merkez,Londra Merkez,0,2018,2018.0,\N,"Comedy,Sport"
478232,tt10116578,9.5,1945,tvSeries,Call Me Kevin,Call Me Kevin,0,2016,2016.0,\N,Comedy
1241710,tt36628618,9.4,2469,tvSeries,Sarpanch Sahab,Sarpanch Sahab,0,2025,2025.0,\N,Drama
1122053,tt30263074,9.3,74305,tvSeries,Sapne Vs Everyone,Sapne Vs Everyone,0,2023,2023.0,\N,Drama
1356918,tt5282604,9.2,1332,tvSeries,Sumit Sambhal Lega,Sumit Sambhal Lega,0,2015,2016.0,\N,Comedy
...,...,...,...,...,...,...,...,...,...,...,...
1577614,tt9355244,6.1,1185,tvSeries,Çok Güzel Hareketler 2,Çok Güzel Hareketler 2,0,2019,2019.0,120,Comedy
1562715,tt9039142,6.1,4030,tvSeries,No Good Nick,No Good Nick,0,2019,2019.0,30,"Comedy,Drama,Family"
868082,tt1830491,6.1,14347,tvSeries,Austin & Ally,Austin & Ally,0,2011,2016.0,23,"Comedy,Drama,Family"
1151539,tt31849826,6.1,7905,tvSeries,Doctor Odyssey,Doctor Odyssey,0,2024,2025.0,\N,Drama


In [13]:
# Ad-hoc lookup in the unfiltered TV-series frame: titles containing 'Neighbors'.
mentions_neighbors = merged_base['primaryTitle'].str.contains('Neighbors', na=False)
merged_base[mentions_neighbors]

Unnamed: 0,tconst,averageRating,numVotes,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
93252,tt0121947,7.2,20,tvSeries,Mr. Jones and His Neighbors,Mr. Jones and His Neighbors,0,1944,\N,\N,\N
234605,tt0406439,6.2,629,tvSeries,My Lovely Neighbors,Mis adorables vecinos,0,2004,2006,60,Comedy
536140,tt1104037,4.0,672,tvSeries,Neighbors Forever,Zauvijek susjedi,0,2007,2008,30,Comedy
649605,tt13042560,8.6,10,tvSeries,Neighbors,Neighbors,0,2020,\N,106,"Comedy,Drama"
675507,tt13482620,5.3,18,tvSeries,Neighbors & Friends,Neighbors & Friends,0,2017,\N,\N,Reality-TV
801160,tt1592270,5.8,822,tvSeries,Neighbors from Hell,Neighbors from Hell,0,2010,2025,30,"Animation,Comedy"
955470,tt2182229,6.9,9093,tvSeries,The Neighbors,The Neighbors,0,2012,2014,30,"Comedy,Family,Romance"
1103761,tt29335687,3.9,23,tvSeries,Neighbors,Hamsayeha,0,2016,2016,\N,Comedy
1125713,tt30459041,7.7,31241,tvSeries,Your Friends & Neighbors,Your Friends & Neighbors,0,2025,\N,\N,Drama
1202086,tt3432810,2.6,577,tvSeries,The Neighbors,The Neighbors,0,2014,2016,24,Comedy


In [14]:
# TMDB TV-show export (comma-separated).
tsv = pd.read_csv(
    'data/tmdb/tvs.csv',
    sep=',',
    low_memory=False,
)

In [17]:
# Summary statistics of the numeric TMDB columns.
tsv.describe()

Unnamed: 0,id,number_of_episodes,number_of_seasons,genres[0].id,genres[1].id,genres[2].id,genres[3].id,genres[4].id,genres[5].id,genres[6].id,...,production_companies[9].id,production_companies[10].id,production_companies[11].id,production_companies[12].id,production_companies[13].id,production_companies[14].id,production_companies[15].id,production_companies[16].id,production_companies[17].id,__v
count,152970.0,152336.0,152970.0,89608.0,32357.0,9282.0,2027.0,370.0,70.0,12.0,...,61.0,36.0,23.0,11.0,7.0,6.0,3.0,3.0,1.0,2169.0
mean,97322.237916,24.493724,1.546029,2773.81784,4629.259604,6565.700065,7550.228416,7507.908108,8031.4,4510.833333,...,95481.491803,78937.583333,73179.869565,61701.0,64891.428571,23327.833333,37427.0,19389.0,1023.0,0.0
std,67273.493146,135.570884,3.016216,4647.295612,5246.055694,5118.603134,4815.107891,4815.136201,4571.057916,5520.824079,...,77662.844724,74402.587761,68198.621721,59264.607941,74751.48644,52211.170297,55332.046528,15927.228667,,0.0
min,1.0,0.0,0.0,16.0,16.0,16.0,16.0,16.0,18.0,18.0,...,201.0,25.0,297.0,124.0,310.0,104.0,46.0,1678.0,1023.0,0.0
25%,40990.25,1.0,1.0,18.0,18.0,35.0,80.0,80.0,9648.0,30.75,...,9148.0,8984.0,10473.5,9310.0,7957.0,1632.25,5645.0,12815.5,1023.0,0.0
50%,87499.5,6.0,1.0,35.0,80.0,10751.0,10759.0,10759.0,10762.0,89.5,...,95861.0,59347.0,65646.0,58209.0,20498.0,2507.0,11244.0,23953.0,1023.0,0.0
75%,134069.5,20.0,1.0,9648.0,10759.0,10762.0,10765.0,10765.0,10765.0,10765.0,...,166410.0,152159.25,105428.0,92490.0,112472.5,3351.75,56117.5,28244.5,1023.0,0.0
max,228550.0,19925.0,345.0,10768.0,10768.0,10768.0,10768.0,10768.0,10768.0,10765.0,...,201059.0,196955.0,198852.0,162015.0,192573.0,129875.0,100991.0,32536.0,1023.0,0.0


In [44]:
# Inspect the TMDB record(s) for one title; transposed so the fields read top to bottom.
is_target_title = tsv['name'] == 'Neighbors from Hell'
tsv[is_target_title].T

Unnamed: 0,30795
_id,6457e58c7b901e08b142afb6
id,32858
name,Neighbors from Hell
original_name,Neighbors from Hell
overview,"The Hellmans are a typical, all-American subur..."
...,...
production_companies[14]._id,
production_companies[15]._id,
production_companies[16]._id,
production_companies[17]._id,


In [27]:
# List the TMDB column names to pick the fields worth keeping.
tsv.columns
Index(['_id', 'id', 'name', 'original_name', 'overview', 'tagline',
       'in_production', 'status', 'original_language', 'origin_country[0]',
       ...
       'production_companies[9]._id', 'production_companies[10]._id',
       'production_companies[11]._id', 'production_companies[12]._id',
       'production_companies[13]._id', 'production_companies[14]._id',
       'production_companies[15]._id', 'production_companies[16]._id',
       'production_companies[17]._id', '__v'],
      dtype='object', length=188)

Index(['_id', 'id', 'name', 'original_name', 'overview', 'tagline',
       'in_production', 'status', 'original_language', 'origin_country[0]',
       ...
       'production_companies[9]._id', 'production_companies[10]._id',
       'production_companies[11]._id', 'production_companies[12]._id',
       'production_companies[13]._id', 'production_companies[14]._id',
       'production_companies[15]._id', 'production_companies[16]._id',
       'production_companies[17]._id', '__v'],
      dtype='object', length=188)

In [39]:
# NOTE(review): this re-reads the same file as the earlier TMDB load cell.
tsv = pd.read_csv('data/tmdb/tvs.csv', sep=',', low_memory=False)

# TMDB fields carried into the join with the IMDb shortlist.
tmdb_columns = [
    'name', 'overview', 'tagline', 'original_language',
    'origin_country[0]', 'origin_country[1]', 'origin_country[2]',
    'vote_average', 'vote_count',
    'first_air_date', 'last_air_date',
    'number_of_seasons', 'number_of_episodes',
    'created_by[0].name', 'created_by[1].name', 'created_by[2].name',
    'production_companies[0].name', 'production_companies[0].origin_country',
]
tsv1 = tsv[tmdb_columns]

In [29]:
# Preview the first ten rows of the selected TMDB columns.
tsv1.head(10)

Unnamed: 0,name,overview,tagline,original_language,origin_country[0],vote_average,vote_count,first_air_date,last_air_date,number_of_seasons,created_by[0].name,production_companies[0].name,production_companies[0].origin_country
0,How do you like Wednesday?,How do you like Wednesday? was a Japanese tele...,,ja,JP,9.2,2,1996-10-09,2022-02-16,4,,,
1,Clerks,The continuing adventures of store clerks Dant...,,en,US,6.897,78,2000-05-31,2002-12-22,1,Kevin Smith,Touchstone Television,US
2,Shuriken School,Shuriken School is a British animated series t...,,en,FR,5.5,6,2006-08-20,2006-11-11,1,,Zinkia Entertainment,
3,Mister Rogers' Neighborhood,Mister Rogers' Neighborhood is an American chi...,Won't you be my neighbor?,en,US,4.977,88,1968-02-19,2001-09-07,31,Fred Rogers,WQED,US
4,Sidewalks: Video Nite,Sidewalks: Video Nite is an American televisio...,,en,US,0.0,0,,2005-09-09,11,Richard R. Lee,,
5,W*A*L*T*E*R,W*A*L*T*E*R is a pilot for a spin-off of M*A*S...,,en,US,5.643,7,1984-07-17,,0,Bill Bixby,20th Century Fox Television,US
6,Star Wars: Droids,An animated television series that features th...,,en,CA,5.2,66,1985-09-07,1985-11-30,1,George Lucas,Lucasfilm Animation,US
7,Bratz,Bratz is a computer-animated television series...,,en,US,8.429,7,,,2,,,
8,"Mary Hartman, Mary Hartman","In the fictional town of Fernwood, Ohio, subur...",,en,US,6.2,8,1976-01-05,1977-03-10,2,Gail Parent,TAT Communications Company,
9,French Fields,French Fields is a British situation comedy. I...,,en,GB,7.3,3,1989-09-05,1991-10-08,3,,,


In [40]:
# Join the IMDb shortlist to TMDB on the title string; unmatched rows on either side are dropped.
# NOTE(review): titles are not unique keys (the unfiltered IMDb frame holds several
# series with the same primaryTitle), so this join can duplicate or mismatch rows.
# Consider also matching on year. TODO confirm.
full_df = sorted_df.merge(
    tsv1,
    how='inner',
    left_on='primaryTitle',
    right_on='name',
)

In [34]:
# Preview the first ten rows of the IMDb/TMDB join.
full_df.head(10)

Unnamed: 0,tconst,averageRating,numVotes,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,...,original_language,origin_country[0],vote_average,vote_count,first_air_date,last_air_date,number_of_seasons,created_by[0].name,production_companies[0].name,production_companies[0].origin_country
0,tt11207734,9.5,2090,tvSeries,Londra Merkez,Londra Merkez,0,2018,2018.0,\N,...,tr,,0.0,0,2018-02-09,2018-02-09,1,,,
1,tt5282604,9.2,1332,tvSeries,Sumit Sambhal Lega,Sumit Sambhal Lega,0,2015,2016.0,\N,...,hi,IN,0.0,0,2015-08-31,2016-01-04,1,,,
2,tt12187604,9.2,1573,tvSeries,Thanamalvila Kollek,Thanamalvila Kollek,0,2020,2020.0,\N,...,si,LK,9.0,1,2020-01-19,2020-04-25,1,Roshan Ravindra,,
3,tt8788222,9.2,1723,tvSeries,Scott the Woz,Scott the Woz,0,2017,2017.0,\N,...,en,US,10.0,5,2021-12-07,2022-02-08,1,Scott Wozniak,,
4,tt7651452,9.1,1261,tvSeries,Bahu Hamari Rajni Kant,Bahu Hamari Rajni Kant,0,2016,2017.0,20,...,en,,0.0,0,2016-01-01,2016-01-01,1,,,
5,tt4742876,9.1,76224,tvSeries,TVF Pitchers,TVF Pitchers,0,2015,2022.0,40,...,hi,IN,7.544,68,2015-06-03,2022-12-23,2,Arunabh Kumar,The Viral Fever,IN
6,tt11357084,9.1,2271,tvSeries,Alif,Alif,0,2019,2020.0,39,...,ur,PK,9.0,2,2019-10-05,2020-03-14,1,Haseeb Hasan,Epic Entertainment,
7,tt15218050,9.1,4394,tvSeries,Parizaad,Parizaad,0,2021,2021.0,40,...,ur,PK,8.7,3,2021-07-20,2022-02-01,1,Hashim Nadeem,Momina Duraid Productions,PK
8,tt10530900,9.1,27122,tvSeries,Gullak,Gullak,0,2019,2019.0,24,...,hi,IN,8.308,13,2019-06-27,2022-04-07,3,Nikhil Vijay,Contagious Online Media Network,IN
9,tt14392248,9.1,316352,tvSeries,Aspirants,Aspirants,0,2021,2021.0,45,...,hi,IN,7.452,21,2021-04-07,2021-05-08,1,Arunabh Kumar,The Viral Fever,IN


In [33]:
# List the columns available after the join.
full_df.columns

Index(['tconst', 'averageRating', 'numVotes', 'titleType', 'primaryTitle',
       'originalTitle', 'isAdult', 'startYear', 'endYear', 'runtimeMinutes',
       'genres', 'name', 'overview', 'tagline', 'original_language',
       'origin_country[0]', 'vote_average', 'vote_count', 'first_air_date',
       'last_air_date', 'number_of_seasons', 'created_by[0].name',
       'production_companies[0].name',
       'production_companies[0].origin_country'],
      dtype='object')

In [41]:
# Final column order: IMDb rating info, TMDB rating info, origin, size, people, text.
output_columns = [
    'averageRating', 'numVotes', 'primaryTitle', 'startYear', 'endYear', 'genres',
    'vote_average', 'vote_count',
    'original_language', 'origin_country[0]',
    'number_of_seasons', 'number_of_episodes', 'runtimeMinutes',
    'created_by[0].name',
    'production_companies[0].name', 'production_companies[0].origin_country',
    'overview', 'tagline',
]
full_df = full_df[output_columns]

In [43]:
# Distinct values of the first origin-country field in the joined frame (NaN included).
full_df['origin_country[0]'].unique()

array([nan, 'IN', 'LK', 'US', 'PK', 'GB', 'IO', 'TR', 'CA', 'TN', 'TH',
       'PT', 'DE', 'JP', 'BD', 'AU', 'IT', 'EG', 'FR', 'BE', 'ES', 'IL',
       'KR', 'DK', 'CN', 'SE', 'NO', 'HU', 'NL', 'NZ', 'SG', 'PH', 'PL',
       'LB', 'CH', 'BR', 'RS', 'CO', 'IR', 'MX', 'RU', 'IE', 'AT', 'UA',
       'AR', 'TW', 'FI', 'RO', 'SA', 'BA', 'BG', 'ID', 'GR', 'PR', 'HK',
       'EE', 'CL', 'IS', 'SY', 'MY', 'QA', 'ZA', 'VE', 'KW', 'LU', 'XC',
       'IQ', 'SI'], dtype=object)