# Analysis and cleaning of IMDB Extensive Dataset

source: https://www.kaggle.com/stefanoleone992/imdb-extensive-dataset

file: https://www.kaggle.com/stefanoleone992/imdb-extensive-dataset/download

In [1]:
# Import our dependencies
import pandas as pd
import numpy as np
import pandas as pd
import datetime
import re
import csv
import json
import os
import ast
import matplotlib.pyplot as plt
%matplotlib 

Using matplotlib backend: Qt5Agg


In [2]:
pd.set_option("max_columns", None)
# Read in "The IMbd Movies dataset"
# if this is read in as csv, each row is a string and dictionaries are not recognized.
df = pd.read_csv("Resources/IMDb movies.csv", low_memory=False)
df.head()

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,Alexander Black,Alexander Black Photoplays,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,Charles Tait,J. and N. Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589,$ 2250,,,,7.0,7.0
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,"Urban Gad, Gebhard Schätzler-Perasini",Fotorama,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.8,188,,,,,5.0,2.0
3,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,Victorien Sardou,Helen Gardner Picture Players,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,446,$ 45000,,,,25.0,3.0
4,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",Dante Alighieri,Milano Film,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2237,,,,,31.0,14.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85855 entries, 0 to 85854
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_title_id          85855 non-null  object 
 1   title                  85855 non-null  object 
 2   original_title         85855 non-null  object 
 3   year                   85855 non-null  object 
 4   date_published         85855 non-null  object 
 5   genre                  85855 non-null  object 
 6   duration               85855 non-null  int64  
 7   country                85791 non-null  object 
 8   language               85022 non-null  object 
 9   director               85768 non-null  object 
 10  writer                 84283 non-null  object 
 11  production_company     81400 non-null  object 
 12  actors                 85786 non-null  object 
 13  description            83740 non-null  object 
 14  avg_vote               85855 non-null  float64
 15  vo

In [4]:
print(df.describe())
print(df.isnull().sum())

           duration      avg_vote         votes     metascore  \
count  85855.000000  85855.000000  8.585500e+04  13305.000000   
mean     100.351418      5.898656  9.493490e+03     55.896881   
std       22.553848      1.234987  5.357436e+04     17.784874   
min       41.000000      1.000000  9.900000e+01      1.000000   
25%       88.000000      5.200000  2.050000e+02     43.000000   
50%       96.000000      6.100000  4.840000e+02     57.000000   
75%      108.000000      6.800000  1.766500e+03     69.000000   
max      808.000000      9.900000  2.278845e+06    100.000000   

       reviews_from_users  reviews_from_critics  
count        78258.000000          74058.000000  
mean            46.040826             27.479989  
std            178.511411             58.339158  
min              1.000000              1.000000  
25%              4.000000              3.000000  
50%              9.000000              8.000000  
75%             27.000000             23.000000  
max          1

In [5]:
# Note: dataset only contains GroupLens ratings for movies with 100+ votes, updated 1/1/20
# 'imdb_title_id', 'title',  
#        'genre', 'duration', 'country', 'director', 'writer',
#        'production_company', 'actors', 'description', 'avg_vote', 'votes',
#        'budget', 'usa_gross_income', 'worlwide_gross_income', 'metascore',
#        'reviews_from_users', 'reviews_from_critics', 'release_year'],
# Difference on variable names:
#    TMDB                         IMDB                          comments
#    adult                        N/A
#    release_date                 date_published                IMDB year does not equal year from published_date, but it's close
#    created: rel_year            year                          compare to TMDB
#    original_language                                          which TMDB variable aligns with IMDB
#    spoken_languages (array)     language (single country)     different encoding for languages - needs resolution
#    production_companies (array) country                       same?
#    production countries (array) production company            same?
#    belongs to collection        N/A
#    homepage                     N/A
#    vote_average                 avg_vote                      ? same ? source Group Lens
#    vote_count                   votes                         ? same ? source Group Lens
#    runtime                      duration                      same?
#    budget                       budget                        IMDB there are strings indicating currencies
#    status                       N/A
#    genre (list)                 genre (list)                  may be the same?
#    N/A                          director
#    N/A                          writer
#    N/A                          actors
#    revenue                      usa_gross_income              IMDB there are strings indicating currencies
#    revenue                      worldwide_gross_income        IMDB there are strings indicating currencies
#    N/A                          metascore 
#    N/A                          reviews_from_users            appears to be a rating
#    N/A                          reviews_from_critics          appears to be a rating
# Variable Disposition:
# Drop: original_title, description
#     candidates for NLP: 
#     description variables:
# drop and save for later: 
#   director
#   writer
#   actors


# imdb_title_id             object convert to string
# title                     object
# original_title            object drop
# year                      object convert to numeric
# date_published            object convert to year
# genre                     object seems to include more than movies, e.g. tv, news, sports
# duration                   int64 none missing
# country                   object 
# language                  object keep
# director                  object keep
# writer                    object drop
# production_company        object drop
# actors                    object drop
# description               object drop
# avg_vote                 float64
# votes                      int64 none missing
# budget                    object should be numeric, mix of floats and strings
# usa_gross_income          object should be numeric, mix of floats and strings
# worlwide_gross_income     object should be numeric, mix of floats and strings
# metascore                float64 keep
# reviews_from_users       float64 keep
# reviews_from_critics     float64 keep

# Transform: 
# Transform: 
# Transform to binary: 
# Transform to categories: 
# Transform to numeric:
# handling null values:
#   rel_year has one missing value, all values look valid
#   year has no missing values, all values look valid
# handling zero values
#
# Keep: 
#   float: 
#   categorical: 
#   identifiers:
#
# filter: release_year>1969 and status = Released

# Issues:
# two dates in IMDB file: year and dae_published. for many movies they do not match. Is either the same as release date in TMDB?

# Problems: need to be parsed
# production_countries      object  drop - too many to be useful in time
# production_companies      object  drop - too many to be useful in time
# actors                    object  drop - too many to be useful in time
# writers                   object  drop - too many to be useful in time
# genres                    parse



In [6]:
df.head(10)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,Alexander Black,Alexander Black Photoplays,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,Charles Tait,J. and N. Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589,$ 2250,,,,7.0,7.0
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,"Urban Gad, Gebhard Schätzler-Perasini",Fotorama,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.8,188,,,,,5.0,2.0
3,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,Victorien Sardou,Helen Gardner Picture Players,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,446,$ 45000,,,,25.0,3.0
4,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",Dante Alighieri,Milano Film,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2237,,,,,31.0,14.0
5,tt0002199,"From the Manger to the Cross; or, Jesus of Naz...","From the Manger to the Cross; or, Jesus of Naz...",1912,1913,"Biography, Drama",60,USA,English,Sidney Olcott,Gene Gauntier,Kalem Company,"R. Henderson Bland, Percy Dyer, Gene Gauntier,...","An account of the life of Jesus Christ, based ...",5.7,484,,,,,13.0,5.0
6,tt0002423,Madame DuBarry,Madame DuBarry,1919,1919-11-26,"Biography, Drama, Romance",85,Germany,German,Ernst Lubitsch,"Norbert Falk, Hanns Kräly",Projektions-AG Union (PAGU),"Pola Negri, Emil Jannings, Harry Liedtke, Edua...","The story of Madame DuBarry, the mistress of L...",6.8,753,,,,,12.0,9.0
7,tt0002445,Quo Vadis?,Quo Vadis?,1913,1913-03-01,"Drama, History",120,Italy,Italian,Enrico Guazzoni,"Henryk Sienkiewicz, Enrico Guazzoni",Società Italiana Cines,"Amleto Novelli, Gustavo Serena, Carlo Cattaneo...","An epic Italian film ""Quo Vadis"" influenced ma...",6.2,273,ITL 45000,,,,7.0,5.0
8,tt0002452,Independenta Romaniei,Independenta Romaniei,1912,1912-09-01,"History, War",120,Romania,,"Aristide Demetriade, Grigore Brezeanu","Aristide Demetriade, Petre Liciu",Societatea Filmului de Arta Leon Popescu,"Aristide Demetriade, Constanta Demetriade, Con...",The movie depicts the Romanian War of Independ...,6.7,198,ROL 400000,,,,4.0,1.0
9,tt0002461,Richard III,Richard III,1912,1912-10-15,Drama,55,"France, USA",English,"André Calmettes, James Keane","James Keane, William Shakespeare",Le Film d'Art,"Robert Gemp, Frederick Warde, Albert Gardner, ...",Richard of Gloucester uses manipulation and mu...,5.5,225,$ 30000,,,,8.0,1.0


In [7]:
# Drop some columns
df.drop(["original_title", "description", 'writer', 'production_company', 'actors', 'production_company'], axis=1, inplace=True)

## compare year in database to date published year

In [8]:
# is variable year (object) in the original dataset the same as year calculated from date_published?
# year is an object because one row has non-numeric values in it, so strip out non-numeric characters before running 

df["year"]=df["year"].str.strip()  #remove spaces
form1=r'([\d]{4})'
df["year"]=df["year"].str.extract(f'({form1})')  #extract just year
try:
    pd.to_numeric(df["year"])
except:
    print(f'Cannot convert to numeric imdb_title_id: {df["imdb_title_id"]}')
df["year"] = df["year"].astype('float')
df.dtypes

imdb_title_id             object
title                     object
year                     float64
date_published            object
genre                     object
duration                   int64
country                   object
language                  object
director                  object
avg_vote                 float64
votes                      int64
budget                    object
usa_gross_income          object
worlwide_gross_income     object
metascore                float64
reviews_from_users       float64
reviews_from_critics     float64
dtype: object

In [9]:
df.columns

Index(['imdb_title_id', 'title', 'year', 'date_published', 'genre', 'duration',
       'country', 'language', 'director', 'avg_vote', 'votes', 'budget',
       'usa_gross_income', 'worlwide_gross_income', 'metascore',
       'reviews_from_users', 'reviews_from_critics'],
      dtype='object')

In [10]:
# clean up and change release_date to datetime
# save at each change to make reverting easy
r_date=df["date_published"]
r_date.str.strip()  #remove spaces
form1=r'(^[0-9]{4})'
rel_year = r_date.str.extract(f'({form1})')  #extract just year
rel_year["release_year"]=pd.to_numeric(rel_year[0])
rel_year.drop([0,1],axis=1,inplace=True)
print(rel_year.dtypes)

release_year    float64
dtype: object


In [11]:
print(len(df))
print(len(rel_year))
print(f'\n columns in df  {df.columns}')
print(f'\n columns in rel_year:  {rel_year.columns}')

85855
85855

 columns in df  Index(['imdb_title_id', 'title', 'year', 'date_published', 'genre', 'duration',
       'country', 'language', 'director', 'avg_vote', 'votes', 'budget',
       'usa_gross_income', 'worlwide_gross_income', 'metascore',
       'reviews_from_users', 'reviews_from_critics'],
      dtype='object')

 columns in rel_year:  Index(['release_year'], dtype='object')


In [12]:
df=pd.concat([df, rel_year], axis=1, ignore_index=False, join="inner")
print(df.columns)
print(len(df))

Index(['imdb_title_id', 'title', 'year', 'date_published', 'genre', 'duration',
       'country', 'language', 'director', 'avg_vote', 'votes', 'budget',
       'usa_gross_income', 'worlwide_gross_income', 'metascore',
       'reviews_from_users', 'reviews_from_critics', 'release_year'],
      dtype='object')
85855


In [13]:
df.drop(["date_published"], axis=1, inplace=True)

In [14]:
print(df.head())

  imdb_title_id                        title    year  \
0     tt0000009                   Miss Jerry  1894.0   
1     tt0000574  The Story of the Kelly Gang  1906.0   
2     tt0001892               Den sorte drøm  1911.0   
3     tt0002101                    Cleopatra  1912.0   
4     tt0002130                    L'Inferno  1911.0   

                       genre  duration           country language  \
0                    Romance        45               USA     None   
1    Biography, Crime, Drama        70         Australia     None   
2                      Drama        53  Germany, Denmark      NaN   
3             Drama, History       100               USA  English   
4  Adventure, Drama, Fantasy        68             Italy  Italian   

                              director  avg_vote  votes   budget  \
0                      Alexander Black       5.9    154      NaN   
1                         Charles Tait       6.1    589   $ 2250   
2                            Urban Gad      

In [15]:
# compare year and release_year
df["year"].equals(df["release_year"])

False

In [16]:
# Find the differences, many appear to be off by one year
df["not_equal"] = df["year"].ne(df["release_year"])
df["not_equal"].sum()

17819

In [17]:
print(df[df["not_equal"] == True])

      imdb_title_id                                              title  \
5         tt0002199  From the Manger to the Cross; or, Jesus of Naz...   
12        tt0003014                           Il calvario di una madre   
21        tt0003637                                      Assunta Spina   
28        tt0003973                              A Florida Enchantment   
65        tt0006517                                       Civilization   
...             ...                                                ...   
85802     tt9845110                                               Deux   
85803     tt9845398                                       Fin de siglo   
85806     tt9850264          Bruno Manser - Die Stimme des Regenwaldes   
85837     tt9894470                                                VFW   
85854     tt9914942                         La vida sense la Sara Amat   

         year                        genre  duration  \
5      1912.0             Biography, Drama        60   

In [18]:
print(f'null rows in release year: {df["release_year"].isnull().sum()}')
print(f'null rows in year: {df["year"].isnull().sum()}')

null rows in release year: 1
null rows in year: 0


In [19]:
df["year"].value_counts().sort_index(ascending=True)

1894.0       1
1906.0       1
1911.0       5
1912.0       5
1913.0      13
          ... 
2016.0    3138
2017.0    3329
2018.0    3257
2019.0    2842
2020.0     789
Name: year, Length: 112, dtype: int64

In [20]:
df["release_year"].value_counts().sort_index(ascending=True)

1894.0       1
1906.0       1
1911.0       5
1912.0       4
1913.0      13
          ... 
2017.0    3355
2018.0    3364
2019.0    3120
2020.0    1339
2021.0       7
Name: release_year, Length: 113, dtype: int64

## drop movies released before 1970

In [21]:
rel_year=rel_year[rel_year["release_year"] > 1969]
year_counts=rel_year["release_year"].value_counts()
print('year counts:')
print(year_counts)

year counts:
2018.0    3364
2017.0    3355
2016.0    3153
2019.0    3120
2015.0    2971
2014.0    2909
2013.0    2771
2012.0    2492
2011.0    2419
2009.0    2252
2010.0    2248
2008.0    2207
2007.0    2066
2006.0    2028
2005.0    1830
2004.0    1636
2003.0    1535
2002.0    1365
2000.0    1360
2001.0    1352
1999.0    1346
2020.0    1339
1998.0    1030
1997.0    1011
1996.0     949
1989.0     917
1995.0     908
1988.0     904
1993.0     865
1987.0     821
1994.0     813
1991.0     791
1992.0     791
1990.0     788
1986.0     754
1982.0     679
1973.0     667
1984.0     661
1972.0     661
1985.0     656
1981.0     643
1983.0     641
1976.0     636
1971.0     623
1979.0     620
1977.0     615
1974.0     609
1975.0     595
1980.0     588
1970.0     586
1978.0     582
2021.0       7
Name: release_year, dtype: int64


In [22]:
print(len(rel_year))
df.drop(["release_year", "not_equal"], axis=1, inplace=True)
df = pd.concat([df, rel_year], axis=1, ignore_index=False, join="inner")
len(df)

70529


70529

## drop some columns

In [23]:
df.columns

Index(['imdb_title_id', 'title', 'year', 'genre', 'duration', 'country',
       'language', 'director', 'avg_vote', 'votes', 'budget',
       'usa_gross_income', 'worlwide_gross_income', 'metascore',
       'reviews_from_users', 'reviews_from_critics', 'release_year'],
      dtype='object')

## Languages spoken

In [24]:
# Language coded differently than in TMDB 2 letter codes vs. spelled out
# TMDB has original language and IMDB has all languages spoken
pd.set_option("max_rows", None)
df["language"].value_counts()
# Look at how to combine with TMDB file

English                                                                                                                                                                27205
French                                                                                                                                                                  3229
Spanish                                                                                                                                                                 2535
Japanese                                                                                                                                                                2355
Italian                                                                                                                                                                 2055
Hindi                                                                                                                                  

### perhaps adapt to handle different encoding

In [25]:
# This cell applies to TMDB file only??
# # Keep the top languages and set all others to "other"
# orig_lang_cd=movies_year_df["original_language"]

# def recode_lang(y):
#     if (y in ["en", "fr", "it", "ja", "de", "es", "ru"]):
#         return y
#     else:
#         return "other"
# orig_lang_cd=orig_lang_cd.map(recode_lang)
# orig_lang_cd.name="orig_lang_cd"
# print(f'Original language counts: {orig_lang_cd.value_counts()}')
# print(f'movie count original language: {len(orig_lang_cd)}')
# print(f'movie count movies_year_df: {len(movies_year_df)}')

# # Update original language - default join is outer
# movies_year_df= pd.concat([movies_year_df, orig_lang_cd], axis=1, ignore_index=False, join="inner")
# print(len(movies_year_df)) 

# # drop original column
# movies_year_df.drop(["original_language"], axis=1, inplace=True)

## Look at object columns for mixed data types

In [26]:
# Look into
# genre                     object -> list
# duration                   int64
# country                   object 
# language                  object
# director                  object
# writer                    object
# production_company        object
# actors                    object
# avg_vote                 float64
# votes                      int64
# budget                    object
# usa_gross_income          object
# worlwide_gross_income     object
# metascore                float64
# reviews_from_users       float64
# reviews_from_critics     float64


In [27]:
pd.reset_option("max_rows")
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 70529 entries, 175 to 85854
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_title_id          70529 non-null  object 
 1   title                  70529 non-null  object 
 2   year                   70529 non-null  float64
 3   genre                  70529 non-null  object 
 4   duration               70529 non-null  int64  
 5   country                70465 non-null  object 
 6   language               69913 non-null  object 
 7   director               70445 non-null  object 
 8   avg_vote               70529 non-null  float64
 9   votes                  70529 non-null  int64  
 10  budget                 21107 non-null  object 
 11  usa_gross_income       14632 non-null  object 
 12  worlwide_gross_income  30375 non-null  object 
 13  metascore              12858 non-null  float64
 14  reviews_from_users     63624 non-null  float64
 15  

In [28]:
df.describe()
# min movie rating counts=99, which lines up with the description on the use of GroupsLens ratings
# Duration appears to be in minutes, but there are very large numbers
# spot-checked a few of the longest durations and they were all correct
# checked a few reviews counts compared to IMDB and they were reasonably close

Unnamed: 0,year,duration,avg_vote,votes,metascore,reviews_from_users,reviews_from_critics,release_year
count,70529.0,70529.0,70529.0,70529.0,12858.0,63624.0,60360.0,70529.0
mean,2002.838223,102.113414,5.790186,10933.3,55.158812,50.771376,30.539231,2003.24217
std,13.745638,21.708421,1.275937,58369.95,17.429233,195.785529,63.338663,13.620838
min,1919.0,41.0,1.0,99.0,1.0,1.0,1.0,1970.0
25%,1994.0,90.0,5.0,212.0,43.0,3.0,3.0,1995.0
50%,2007.0,97.0,6.0,517.0,56.0,9.0,9.0,2007.0
75%,2014.0,110.0,6.7,2071.0,68.0,27.0,26.0,2014.0
max,2020.0,808.0,9.9,2278845.0,100.0,10472.0,999.0,2021.0


## Genres

In [29]:
# example: Drama, Fantasy, Horror, max genres per movie
# each row is a string, convert to a list

pd.set_option('max_colwidth', None)
pd.set_option('max_columns', None)
print(f'\n sample genre records: {df["genre"].head()}')
genre_string=df["genre"]    # genres in a string
genre_list = genre_string.str.split(',')   # outputs split words to a string
print(f'\n print genre_list: {genre_list}')
print(f'\n genre_combinations: {genre_list.value_counts()}')
print(f'\n object type: {type(genre_list)}')
genre_cds = genre_string.str.split(',', expand=True)   # outputs split words to separate columns
genre_cds.columns=["A", "B", "C"]
genre_cds["A"] = genre_cds["A"].str.strip()
genre_cds["B"] = genre_cds["B"].str.strip()
genre_cds["C"] = genre_cds["C"].str.strip()
print(f'\n print genre_cds: {genre_cds}')

# get a list of unique genres
new_cols=pd.Series(['Action',
'Adult',
'Adventure',
'Animation',
'Biography',
'Comedy',
'Crime',
'Documentary',
'Drama',
'Family',
'Fantasy', 
'History',
'Horror',
'Music',
'Musical',
'Mystery',
'News',
'Reality-TV',
'Romance',
'Sci-Fi',
'Sport',
'Thriller',
'War',
'Western'
])
new_cols.str.strip()
print(f'\n list of genres in new_cols: {new_cols}')
print(f'\n type(new_cols): {type(new_cols)}')
print(f'\n columns: {new_cols}')
print(f'\n columns in df: {df.columns}')


 sample genre records: 175               Drama, Romance
305       Drama, Fantasy, Horror
560               Drama, Romance
569    Biography, Drama, History
624        Comedy, Drama, Family
Name: genre, dtype: object

 print genre_list: 175                  [Drama,  Romance]
305         [Drama,  Fantasy,  Horror]
560                  [Drama,  Romance]
569      [Biography,  Drama,  History]
624          [Comedy,  Drama,  Family]
                     ...              
85850                         [Comedy]
85851                 [Comedy,  Drama]
85852                          [Drama]
85853                 [Drama,  Family]
85854                          [Drama]
Name: genre, Length: 70529, dtype: object

 genre_combinations: [Drama]                          10747
[Comedy]                          6341
[Comedy,  Drama]                  3642
[Drama,  Romance]                 2718
[Horror]                          2115
                                 ...  
[Action,  Horror,  Adventure]        

In [30]:
print(f'\n columns in df:  {df.columns}')
print(f'\n columns in {genre_list}')


 columns in df:  Index(['imdb_title_id', 'title', 'year', 'genre', 'duration', 'country',
       'language', 'director', 'avg_vote', 'votes', 'budget',
       'usa_gross_income', 'worlwide_gross_income', 'metascore',
       'reviews_from_users', 'reviews_from_critics', 'release_year'],
      dtype='object')

 columns in 175                  [Drama,  Romance]
305         [Drama,  Fantasy,  Horror]
560                  [Drama,  Romance]
569      [Biography,  Drama,  History]
624          [Comedy,  Drama,  Family]
                     ...              
85850                         [Comedy]
85851                 [Comedy,  Drama]
85852                          [Drama]
85853                 [Drama,  Family]
85854                          [Drama]
Name: genre, Length: 70529, dtype: object


In [31]:
# add genre_list to df
df.drop(["genre"], axis=1, inplace=True)
df = pd.concat([df, genre_list], axis=1, ignore_index=False, join="outer")
print(f'\n columns in df: {df.columns}')
df.rename({"genre" : "genre_list"}, axis=1, inplace=True)


 columns in df: Index(['imdb_title_id', 'title', 'year', 'duration', 'country', 'language',
       'director', 'avg_vote', 'votes', 'budget', 'usa_gross_income',
       'worlwide_gross_income', 'metascore', 'reviews_from_users',
       'reviews_from_critics', 'release_year', 'genre'],
      dtype='object')


In [32]:
print(df.head())

    imdb_title_id                   title    year  duration    country  \
175     tt0010680   The Sentimental Bloke  1919.0       106  Australia   
305     tt0013579       Ombre ammonitrici  1923.0        90    Germany   
560     tt0017938  La glace à trois faces  1983.0        45     France   
569     tt0018054            Il re dei re  1927.0       160        USA   
624     tt0018742            Il cameraman  1928.0        76        USA   

         language                        director  avg_vote  votes     budget  \
175       English                Raymond Longford       6.3    237        NaN   
305        German                  Arthur Robison       6.8    842        NaN   
560  None, French                    Jean Epstein       7.0    759        NaN   
569       English                Cecil B. DeMille       7.2   1890  $ 2500000   
624       English  Edward Sedgwick, Buster Keaton       8.1  10101        NaN   

    usa_gross_income worlwide_gross_income  metascore  reviews_from_

In [33]:
print(f'length of df: {len(df)}')

length of df: 70529


In [34]:
print(f'columns in df: {df.columns}')

columns in df: Index(['imdb_title_id', 'title', 'year', 'duration', 'country', 'language',
       'director', 'avg_vote', 'votes', 'budget', 'usa_gross_income',
       'worlwide_gross_income', 'metascore', 'reviews_from_users',
       'reviews_from_critics', 'release_year', 'genre_list'],
      dtype='object')


In [35]:
# Create df of genre indicators for each movie
# create empty data frame with index from df and columns = genre_cds
# pre-fill with zeroes

genre_result_df = pd.DataFrame(0, index=df.index.copy(), columns=new_cols)
print(f'\n genre_result_df {genre_result_df.head()}')

# To iterate over rows of a Pandas DataFrame, use DataFrame.iterrows() function which returns an iterator 
# yielding index and row data for each row.
# DataFrame.iterrows(self)
# index – index of the row in DataFrame. This could be a label for single index, or tuple of label for multi-index.
# data – data is the row data as Pandas Series.
# it – it is the generator that iterates over the rows of DataFrame.

# genre_cds: columns: "A", "B", "C"
# index is class int
# row is class series
# row contains up to 3 genres

# getting None in list, which is getting converted to NaN. check for class=nonetype

for index, row  in genre_cds.iterrows():
#     print(f"\n index: {row['A']}, and {row['B']} and {row['C']}")
#     print(type(index), type(row))
#    use column names to access the value in each row
#     print(f'\n original series index: {index}')
#     print(f'\n genres: {row["A"]},  {row["B"]},  {row["C"]}')
#     print(f'\n row A: {type(row["A"])}')
#     print(f'\n check A = nonetype?:  {row["A"] is None}')
#     print(f' row B: {type(row["B"])}')
#     print(f'\n check B is nonetype?: {row["B"] is None}')
#     print(f' row C: {type(row["C"])}')
#     print(f'\n check C is nonetype?: {row["C"] is None}')
#    write to the initialized dataframe
    if row["A"] is not None:      
        genre_result_df.loc[index, row["A"]] = 1
    if row["B"] is not None:
        genre_result_df.loc[index, row["B"]] = 1
    if row["C"] is not None:
        genre_result_df.loc[index, row["C"]] = 1
#     print(f'\n result df: {genre_result_df.loc[index,:]}')

# rename genre columns so all start with g_
cols=genre_result_df.columns
new_cols=[]
for col in cols:
    new = 'g_' + col
    new_cols.append(new)
genre_result_df.columns = new_cols

print(f'\n result df: {genre_result_df.head()}')

# count number genres by movie:
pd.set_option("max_columns", None)
number_genres=genre_result_df.sum(axis=1, min_count=1)

print(f'\n genre columns: {genre_result_df.columns}')
print(f'\n number of genres: {number_genres.head(10)}')
print(f'\n frequencies of number of genres per movie {number_genres.value_counts()}')
print(f'\n frequency of genre combos: {genre_result_df.value_counts()}')

print(f'\n result df: {genre_result_df.head()}')


 genre_result_df      Action  Adult  Adventure  Animation  Biography  Comedy  Crime  \
175       0      0          0          0          0       0      0   
305       0      0          0          0          0       0      0   
560       0      0          0          0          0       0      0   
569       0      0          0          0          0       0      0   
624       0      0          0          0          0       0      0   

     Documentary  Drama  Family  Fantasy  History  Horror  Music  Musical  \
175            0      0       0        0        0       0      0        0   
305            0      0       0        0        0       0      0        0   
560            0      0       0        0        0       0      0        0   
569            0      0       0        0        0       0      0        0   
624            0      0       0        0        0       0      0        0   

     Mystery  News  Reality-TV  Romance  Sci-Fi  Sport  Thriller  War  Western  
175        0     

In [36]:
# count number of movies by genre
genre_result_df.agg(sum, axis=0)

g_Action         11424
g_Adult              2
g_Adventure       5528
g_Animation       2045
g_Biography       1978
g_Comedy         24372
g_Crime           8587
g_Documentary        2
g_Drama          38762
g_Family          3444
g_Fantasy         3269
g_History         1706
g_Horror          8709
g_Music           1248
g_Musical          986
g_Mystery         4166
g_News               1
g_Reality-TV         3
g_Romance        10641
g_Sci-Fi          3106
g_Sport            916
g_Thriller       10508
g_War             1289
g_Western          565
dtype: int64

In [37]:
# append genre indicators to clean_df
print(len(df))
print(len(genre_result_df))
df = pd.concat([df, genre_result_df], axis=1, ignore_index=False, join="outer")


70529
70529


In [38]:
print(len(df))

70529


In [39]:
print(df.columns)

Index(['imdb_title_id', 'title', 'year', 'duration', 'country', 'language',
       'director', 'avg_vote', 'votes', 'budget', 'usa_gross_income',
       'worlwide_gross_income', 'metascore', 'reviews_from_users',
       'reviews_from_critics', 'release_year', 'genre_list', 'g_Action',
       'g_Adult', 'g_Adventure', 'g_Animation', 'g_Biography', 'g_Comedy',
       'g_Crime', 'g_Documentary', 'g_Drama', 'g_Family', 'g_Fantasy',
       'g_History', 'g_Horror', 'g_Music', 'g_Musical', 'g_Mystery', 'g_News',
       'g_Reality-TV', 'g_Romance', 'g_Sci-Fi', 'g_Sport', 'g_Thriller',
       'g_War', 'g_Western'],
      dtype='object')


In [40]:
# Parse directors into a list
# pd.set_option('max_colwidth', None)
# pd.set_option('max_columns', None)
df["director_list"] = df["director"].str.split(',')
df["director_list"]
df.drop(["director"], axis=1, inplace=True)
df.head()

Unnamed: 0,imdb_title_id,title,year,duration,country,language,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics,release_year,genre_list,g_Action,g_Adult,g_Adventure,g_Animation,g_Biography,g_Comedy,g_Crime,g_Documentary,g_Drama,g_Family,g_Fantasy,g_History,g_Horror,g_Music,g_Musical,g_Mystery,g_News,g_Reality-TV,g_Romance,g_Sci-Fi,g_Sport,g_Thriller,g_War,g_Western,director_list
175,tt0010680,The Sentimental Bloke,1919.0,106,Australia,English,6.3,237,,,,,11.0,10.0,2005.0,"[Drama, Romance]",0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,[Raymond Longford]
305,tt0013579,Ombre ammonitrici,1923.0,90,Germany,German,6.8,842,,,,,15.0,16.0,2016.0,"[Drama, Fantasy, Horror]",0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,[Arthur Robison]
560,tt0017938,La glace à trois faces,1983.0,45,France,"None, French",7.0,759,,,,,7.0,4.0,1983.0,"[Drama, Romance]",0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,[Jean Epstein]
569,tt0018054,Il re dei re,1927.0,160,USA,English,7.2,1890,$ 2500000,,,,48.0,24.0,2004.0,"[Biography, Drama, History]",0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,[Cecil B. DeMille]
624,tt0018742,Il cameraman,1928.0,76,USA,English,8.1,10101,,,$ 1737460,,61.0,49.0,1970.0,"[Comedy, Drama, Family]",0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"[Edward Sedgwick, Buster Keaton]"


In [41]:
# Clean up columns
print(f'\n columns in df: {df.columns}')
df.rename({"imdb_title_id" : "imdb_id"}, axis=1, inplace=True)


 columns in df: Index(['imdb_title_id', 'title', 'year', 'duration', 'country', 'language',
       'avg_vote', 'votes', 'budget', 'usa_gross_income',
       'worlwide_gross_income', 'metascore', 'reviews_from_users',
       'reviews_from_critics', 'release_year', 'genre_list', 'g_Action',
       'g_Adult', 'g_Adventure', 'g_Animation', 'g_Biography', 'g_Comedy',
       'g_Crime', 'g_Documentary', 'g_Drama', 'g_Family', 'g_Fantasy',
       'g_History', 'g_Horror', 'g_Music', 'g_Musical', 'g_Mystery', 'g_News',
       'g_Reality-TV', 'g_Romance', 'g_Sci-Fi', 'g_Sport', 'g_Thriller',
       'g_War', 'g_Western', 'director_list'],
      dtype='object')


In [42]:
# Save to csv
print(df.info)
file_path ="Resources/imdb_main.csv"
df.to_csv(file_path,index=False)

<bound method DataFrame.info of          imdb_id                           title    year  duration  \
175    tt0010680           The Sentimental Bloke  1919.0       106   
305    tt0013579               Ombre ammonitrici  1923.0        90   
560    tt0017938          La glace à trois faces  1983.0        45   
569    tt0018054                    Il re dei re  1927.0       160   
624    tt0018742                    Il cameraman  1928.0        76   
...          ...                             ...     ...       ...   
85850  tt9908390                         Le lion  2020.0        95   
85851  tt9911196  De Beentjes van Sint-Hildegard  2020.0       103   
85852  tt9911774       Padmavyuhathile Abhimanyu  2019.0       130   
85853  tt9914286               Sokagin Çocuklari  2019.0        98   
85854  tt9914942      La vida sense la Sara Amat  2019.0        74   

               country       language  avg_vote  votes     budget  \
175          Australia        English       6.3    237    

In [43]:
# print(genre_result_df.info)
# file_path ="Resources/imdb_genres.csv"
# genre_result_df.to_csv(file_path,index=False)

<bound method DataFrame.info of        g_Action  g_Adult  g_Adventure  g_Animation  g_Biography  g_Comedy  \
175           0        0            0            0            0         0   
305           0        0            0            0            0         0   
560           0        0            0            0            0         0   
569           0        0            0            0            1         0   
624           0        0            0            0            0         1   
...         ...      ...          ...          ...          ...       ...   
85850         0        0            0            0            0         1   
85851         0        0            0            0            0         1   
85852         0        0            0            0            0         0   
85853         0        0            0            0            0         0   
85854         0        0            0            0            0         0   

       g_Crime  g_Documentary  g_Drama  g_F

## come back to
## budget, usa_gross_income, worldwide_gross_income
### many contain characters or symbols indicating currency. Is that impacting the null count?

In [None]:
# some of the money variables contain text, likely different currencies
# some of the values are boolean
# print(f'datatypes in budget column: {df["budget"].apply(type).value_counts()}')
# print(f'datatypes in usa_gross_income column: {df["usa_gross_income"].apply(type).value_counts()}')
# print(f'datatypes in worldwide_gross_income column: {df["worldwide_gross_income"].apply(type).value_counts()}')    

In [None]:
# check contents of string values
# found $ and alpha characters in front of the number. Need to split into different fields

##### use code from extracting countries in other notebook- but do genres first
