In [1]:
import pandas as pd
import numpy as np
import re
import string
from datetime import datetime
from datetime import date

import matplotlib.lines as mlines
import matplotlib.patches as mpatches
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import seaborn as sns

## Search hack on dataframes
### How to use: find(data,"movie_id","equal","tt6723088")

<h3>Search functions available</h3>

<ol>
  <li>Equal</li>
  <li>Not Equal</li>
  <li>Lesser than</li>
  <li>Greater than</li>
  <li>Null</li>    
  <li>Not null</li>
  <li>Contains</li>
  <li>Not contains</li>
  <li>Duplicate</li>
  <li>Index</li>
  <li>Unique Value</li>
  <li>Unique Count</li>
</ol> 

In [2]:
# Useful Search hack for any value in a column


def find(dataFrame,columnName,operation,searchElement=None):
    """Run one of a fixed set of search operations on a single dataframe column.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame to search.
    columnName : str
        Column the operation is applied to.
    operation : str
        Name of the operation. Case and whitespace are ignored, so
        "Not Null", "notnull" and "NOT NULL" are all the same operation.
        Supported: equal, not equal, lesser than, greater than, null,
        not null, contains, not contains, duplicate, index, unique value,
        unique count.
    searchElement : object, optional
        Value to compare against. Not used by null, not null, duplicate,
        unique value and unique count, so it may be omitted for those.

    Returns
    -------
    The matching rows as a DataFrame for the filtering operations, a list of
    index labels for "index", an array of distinct values for "unique value",
    an integer for "unique count", and None (after printing a hint) when the
    operation name is not recognised.
    """
    # Normalise the operation name: lower-case it and strip every whitespace character
    operation = operation.lower().translate(str.maketrans('', '', string.whitespace))
    if operation == "equal":
        return dataFrame.loc[dataFrame[columnName] == searchElement]
    elif operation == "notequal":
        return dataFrame.loc[dataFrame[columnName] != searchElement]
    elif operation == "lesserthan":
        return dataFrame.loc[dataFrame[columnName] < searchElement]
    elif operation == "greaterthan":
        return dataFrame.loc[dataFrame[columnName] > searchElement]
    elif operation == "null":
        return dataFrame.loc[dataFrame[columnName].isna()]
    elif operation == "notnull":
        # The result has to be returned; without the return the caller always got None
        return dataFrame.loc[dataFrame[columnName].notnull()]
    elif operation == "contains":
        # na=False: a missing value cannot contain the text, and it keeps the mask purely boolean
        return dataFrame[dataFrame[columnName].str.contains(searchElement, na=False)]
    elif operation == "notcontains":
        # With na=False the mask is boolean, so ~ is a plain negation (missing values count as "not containing")
        return dataFrame[~dataFrame[columnName].str.contains(searchElement, na=False)]
    elif operation == "duplicate":
        # Rows whose value already appeared earlier in the column, ordered by that value
        return dataFrame[dataFrame.duplicated([columnName])].sort_values(by=[columnName])
    elif operation == "index":
        return dataFrame.index[dataFrame[columnName] == searchElement].tolist()
    elif operation == "uniquevalue":
        return dataFrame[columnName].unique()
    elif operation == "uniquecount":
        return dataFrame[columnName].nunique()
    else:
        print("The operations you can use are listed above! \n")

###  Read data from file path

In [5]:
# Input and output locations. These are absolute Windows paths, so they need to be
# adjusted before the notebook is run on another machine.
# fileLocation = 'C:\\Users\\VictorY\\Desktop\\TestData\\stage3FinalProcessedDF_Nov-17-2019.csv'
oldMovieDirectorFileLocation = 'C:\\Yuva\\ITU\\4th Sem\\Thesis\\Data\\Final_Data_Movies_Directors_old.csv'
# stage3 = 'C:\\Yuva\\ITU\\4th Sem\\Thesis\\Data\\stage3FinalProcessedDF_Mar-27-2020.csv'
# Folder handed to save_to_file() later in the notebook
saveFileToPath = "C:\\Yuva\\ITU\\4th Sem\\Thesis\\Data"

# Load the movie/director table that the rest of the notebook works on
data = pd.read_csv(oldMovieDirectorFileLocation)
# data = pd.read_csv(stage3)


## Data processing - Fill 0 for missing year and N

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325375 entries, 0 to 325374
Data columns (total 15 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   director_ids         325375 non-null  object 
 1   director_names       325375 non-null  object 
 2   movie_id             325375 non-null  object 
 3   title                325375 non-null  object 
 4   review_count_user    325375 non-null  int64  
 5   review_count_critic  325375 non-null  int64  
 6   rating_value         325375 non-null  float64
 7   rating_count         325375 non-null  int64  
 8   date_published       325375 non-null  object 
 9   release_date         325375 non-null  object 
 10  movie_year           325375 non-null  int64  
 11  birthYear            325375 non-null  int64  
 12  primaryProfession    325375 non-null  object 
 13  knownForTitles       325375 non-null  object 
 14  Gender               325375 non-null  object 
dtypes: float64(1), in

In [7]:
# # Reset the missing years in birth and death year of director to 0

# data.loc[data['deathYear'].isna(), 'deathYear'] = 0
# data.loc[data['birthYear'].isna(), 'birthYear'] = 0

In [8]:
print("Unique directors in this dataframe : ",data.director_ids.nunique())

Unique directors in this dataframe :  41629


In [9]:
find(data,"movie_id","uniquecount",0)

311939

In [10]:
data.Gender.value_counts()

Male      271714
Female     53661
Name: Gender, dtype: int64

In [11]:
data.head(3)

Unnamed: 0,director_ids,director_names,movie_id,title,review_count_user,review_count_critic,rating_value,rating_count,date_published,release_date,movie_year,birthYear,primaryProfession,knownForTitles,Gender
0,nm0005690,William K.L. Dickson,tt0000001,Carmencita - spanyol tánc (1894) - IMDb,12,2,5.8,1255,1894-03-10,10 March 1894 (USA) See more »,1894,1860,"cinematographer,director,producer","tt0219560,tt6687694,tt1428455,tt1496763",Male
1,nm0721526,Émile Reynaud,tt0000002,A bohóc és kutyái (1892) - IMDb,0,0,6.5,148,1892-10-28,28 October 1892 (France) See more »,1892,1844,director,"tt2184231,tt0000003,tt2184201,tt0413219",Male
2,nm0721526,Émile Reynaud,tt0000003,Szegény Pierrot (1892) - IMDb,12,5,6.6,819,1892-10-28,28 October 1892 (France) See more »,1892,1844,director,"tt2184231,tt0000003,tt2184201,tt0413219",Male


## Functions to save data to file with dataframe name.

In [12]:
# Helper method for save_to_file

def get_df_name(df):
    """Return the name of the first global variable bound to the object ``df``.

    The lookup compares by identity (``is``), not by equality, so only a
    global that refers to this very object matches. Raises IndexError when
    no global variable refers to ``df``.
    """
    matchingNames = []
    for candidate in globals():
        if globals()[candidate] is df:
            matchingNames.append(candidate)
    return matchingNames[0]

In [13]:
# Function: Save Dataframe to CSV

def save_to_file(dataFrame, folderPath):
    """Write ``dataFrame`` to ``<folderPath>/<variable name>_<Mon-DD-YYYY>.csv``.

    The file name is built from the global variable name of the dataframe
    (looked up with get_df_name) and today's date. The index is not written.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame to save; it must be bound to a global variable so that
        get_df_name can find its name.
    folderPath : str
        Existing folder the CSV file is written into.
    """
    todayDate = date.today().strftime("%b-%d-%Y")
    fileName = get_df_name(dataFrame) + '_' + todayDate + '.csv'
    # os.path.join uses the separator of the running platform; the previous
    # hard-coded '\\' only produced a valid path on Windows
    filePath = os.path.join(folderPath, fileName)
    # utf-8-sig writes a byte-order mark at the start of the file
    dataFrame.to_csv(filePath, index=False, encoding="utf-8-sig")

## Fetch the first movie year and career year number of each movie

In [23]:
# For every director, pick the row holding their smallest movie_year (taken as the
# career start), keep only the year and the director id, and expose the year as CareerStartYear
directorsCareerStartYearDF = (
    data.loc[data.groupby('director_ids')['movie_year'].idxmin(), ['movie_year', 'director_ids']]
        .rename(columns={'movie_year': 'CareerStartYear'})
)
directorsCareerStartYearDF

Unnamed: 0,CareerStartYear,director_ids
18844,1946,nm0000005
31916,1961,nm0000008
38019,1967,nm0000009
28499,1957,nm0000010
43836,1973,nm0000018
...,...,...
325306,2015,nm8794368
325318,2016,nm8818507
325330,2016,nm8855422
325362,2016,nm8863860


In [24]:
# Attach each director's CareerStartYear to every one of their movie rows (left join keeps all movies)
withFirstMovieYearDF = pd.merge(data, directorsCareerStartYearDF, how='left', on='director_ids')

# Career year number counts from 1 in the career start year: movie_year - CareerStartYear + 1
withFirstMovieYearDF["Career_Year_Number"] = (
    withFirstMovieYearDF["movie_year"] - withFirstMovieYearDF["CareerStartYear"] + 1
)

In [25]:
# withFirstMovieYearDF.loc[withFirstMovieYearDF.Career_Year_Number == 0,'Career_Year_Number'] = 1

In [26]:
withFirstMovieYearDF

Unnamed: 0,director_ids,director_names,movie_id,title,review_count_user,review_count_critic,rating_value,rating_count,date_published,release_date,movie_year,birthYear,primaryProfession,knownForTitles,Gender,CareerStartYear,Career_Year_Number
0,nm0005690,William K.L. Dickson,tt0000001,Carmencita - spanyol tánc (1894) - IMDb,12,2,5.8,1255,1894-03-10,10 March 1894 (USA) See more »,1894,1860,"cinematographer,director,producer","tt0219560,tt6687694,tt1428455,tt1496763",Male,1890,5
1,nm0721526,Émile Reynaud,tt0000002,A bohóc és kutyái (1892) - IMDb,0,0,6.5,148,1892-10-28,28 October 1892 (France) See more »,1892,1844,director,"tt2184231,tt0000003,tt2184201,tt0413219",Male,1892,1
2,nm0721526,Émile Reynaud,tt0000003,Szegény Pierrot (1892) - IMDb,12,5,6.6,819,1892-10-28,28 October 1892 (France) See more »,1892,1844,director,"tt2184231,tt0000003,tt2184201,tt0413219",Male,1892,1
3,nm0721526,Émile Reynaud,tt0000004,Egy jó pohár sör (1892) - IMDb,1,0,6.6,90,1892-10-28,28 October 1892 (France) See more »,1892,1844,director,"tt2184231,tt0000003,tt2184201,tt0413219",Male,1892,1
4,nm0005690,William K.L. Dickson,tt0000005,A patkolókovács (1893) - IMDb,18,3,6.2,1499,1893-05-09,9 May 1893 (USA) See more »,1893,1860,"cinematographer,director,producer","tt0219560,tt6687694,tt1428455,tt1496763",Male,1890,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325370,nm0267512,Valerie Faris,tt6720730,Red Hot Chili Peppers: Californication (Video ...,0,0,7.3,10,2000-06-05,5 June 2000 (USA) See more »,2000,1958,"director,producer,writer","tt4648296,tt0449059,tt1839492,tt4622512",Female,1986,15
325371,nm1982597,Rihanna,tt6723088,Rihanna: Disturbia (Video 2008) - IMDb,0,0,6.6,14,2008-07-22,22 July 2008 (USA) See more »,2008,1988,"soundtrack,actress,composer","tt1440129,tt2239822,tt1245492,tt2224026",Female,2008,1
325372,nm0000005,Ingmar Bergman,tt6725014,Scenes from a Marriage: Theatrical Version (19...,0,0,7.8,30,1974-09-21,21 September 1974 (USA) See more »,1974,1918,"writer,director,actor","tt0083922,tt0069467,tt0050986,tt0050976",Male,1946,29
325373,nm8895714,Grant Bullert,tt6732168,In the Dark (2016) - IMDb,0,0,5.0,6,,,2016,1996,"writer,cinematographer,producer","tt7440708,tt6732168,tt8400134,tt7924700",Male,2016,1


In [27]:
withFirstMovieYearDF.nunique()

director_ids            41629
director_names          41486
movie_id               311939
title                  311544
review_count_user         987
review_count_critic       534
rating_value               91
rating_count            12362
date_published          33506
release_date           132612
movie_year                131
birthYear                 163
primaryProfession        2386
knownForTitles          41614
Gender                      2
CareerStartYear           128
Career_Year_Number         88
dtype: int64

### Generate inter_event_time column which contains the time between consecutive movies

In [28]:
print("Shape of dataset : ",withFirstMovieYearDF.shape)

# Sort by director_ids and movie_year so that each director's movies are in chronological order
withFirstMovieYearDF = withFirstMovieYearDF.sort_values(by=['director_ids', 'movie_year'],ascending=[True,True])

# Years elapsed since the same director's previous movie. The difference is taken
# within each director group, so it never mixes two directors, and the first movie
# of every director is left missing (NaN) because it has no predecessor.
withFirstMovieYearDF['inter_event_time'] = withFirstMovieYearDF.groupby('director_ids')['movie_year'].diff()

# Restore the original row order
withFirstMovieYearDF = withFirstMovieYearDF.sort_index()
print("Shape of dataset after adding the inter_event_time : ",withFirstMovieYearDF.shape)

Shape of dataset :  (325375, 17)
Shape of dataset after adding the inter_event_time :  (325375, 18)


In [30]:
withFirstMovieYearDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 325375 entries, 0 to 325374
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   director_ids         325375 non-null  object 
 1   director_names       325375 non-null  object 
 2   movie_id             325375 non-null  object 
 3   title                325375 non-null  object 
 4   review_count_user    325375 non-null  int64  
 5   review_count_critic  325375 non-null  int64  
 6   rating_value         325375 non-null  float64
 7   rating_count         325375 non-null  int64  
 8   date_published       325375 non-null  object 
 9   release_date         325375 non-null  object 
 10  movie_year           325375 non-null  int32  
 11  birthYear            325375 non-null  int32  
 12  primaryProfession    325375 non-null  object 
 13  knownForTitles       325375 non-null  object 
 14  Gender               325375 non-null  object 
 15  CareerStartYear  

In [29]:
# The year columns are cast to integers before they are used in calculations
withFirstMovieYearDF['movie_year'] = withFirstMovieYearDF['movie_year'].astype(int)
withFirstMovieYearDF['birthYear'] = withFirstMovieYearDF['birthYear'].astype(int)

#### Right censoring or Posthumous Recognition

In [None]:
# # Posthumous Recoginition
# # The records which we believe contain Posthumous data. i.e. movies released after n years of directors death
# years_after_death = 2
# dropIncorrectlyRecordedYears = withFirstMovieYearDF.loc[(withFirstMovieYearDF.movie_year - withFirstMovieYearDF.deathYear >= years_after_death) & (withFirstMovieYearDF.deathYear !=0)]
# dropIncorrectlyRecordedYears

# # Drop all values containing movies released after n years as per the above assumption

# withFirstMovieYearDF.drop(dropIncorrectlyRecordedYears.index, inplace= True)

In [31]:
withFirstMovieYearDF.head(3)

Unnamed: 0,director_ids,director_names,movie_id,title,review_count_user,review_count_critic,rating_value,rating_count,date_published,release_date,movie_year,birthYear,primaryProfession,knownForTitles,Gender,CareerStartYear,Career_Year_Number,inter_event_time
0,nm0005690,William K.L. Dickson,tt0000001,Carmencita - spanyol tánc (1894) - IMDb,12,2,5.8,1255,1894-03-10,10 March 1894 (USA) See more »,1894,1860,"cinematographer,director,producer","tt0219560,tt6687694,tt1428455,tt1496763",Male,1890,5,1.0
1,nm0721526,Émile Reynaud,tt0000002,A bohóc és kutyái (1892) - IMDb,0,0,6.5,148,1892-10-28,28 October 1892 (France) See more »,1892,1844,director,"tt2184231,tt0000003,tt2184201,tt0413219",Male,1892,1,
2,nm0721526,Émile Reynaud,tt0000003,Szegény Pierrot (1892) - IMDb,12,5,6.6,819,1892-10-28,28 October 1892 (France) See more »,1892,1844,director,"tt2184231,tt0000003,tt2184201,tt0413219",Male,1892,1,0.0


In [32]:
withFirstMovieYearDF.Career_Year_Number.value_counts(dropna=False)

1     50622
3     16007
2     15721
4     15390
5     14545
      ...  
83        1
73        1
85        1
94        1
95        1
Name: Career_Year_Number, Length: 88, dtype: int64

## Director Specific collection

### Dropping Rating value records with None or NA

In [33]:
# dropRatingLessRecords = withFirstMovieYearDF.loc[withFirstMovieYearDF['rating_value'] == 'None']
# len(dropRatingLessRecords)

In [34]:
# # Drop all values containing incorrect recorded years as per the above assumption
# print("Before dropping records : ",len(withFirstMovieYearDF))
# withFirstMovieYearDF.drop(dropRatingLessRecords.index, inplace= True)
# print("After dropping records : ",len(withFirstMovieYearDF))

### Dropping metascore records with None or NA

In [35]:
# withFirstMovieYearDF.Gender.value_counts()

In [36]:
# print(" Movies created by Female percentage before dropping Metascore None records: ",withFirstMovieYearDF.Gender.value_counts()[1]/(withFirstMovieYearDF.Gender.value_counts()[0] + withFirstMovieYearDF.Gender.value_counts()[1]))

In [37]:
# moviesWithMetaScoreDF = withFirstMovieYearDF.copy()
# dropNoneMetaScoreRecords = moviesWithMetaScoreDF.loc[moviesWithMetaScoreDF['metascore'] == 'None']
# print(len(dropNoneMetaScoreRecords))

# # Drop all values containing incorrect recorded years as per the above assumption

# print("Before dropping none Metascore records : ",len(moviesWithMetaScoreDF))
# moviesWithMetaScoreDF.drop(dropNoneMetaScoreRecords.index, inplace= True)
# print("After dropping none Metascorerecords : ",len(moviesWithMetaScoreDF))

In [38]:
# Cast the rating columns before aggregating: rating_value to float, rating_count to int
withFirstMovieYearDF['rating_value'] = withFirstMovieYearDF['rating_value'].astype(float)
withFirstMovieYearDF['rating_count'] = withFirstMovieYearDF['rating_count'].astype(int)

In [39]:
withFirstMovieYearDF.Gender.value_counts()

Male      271714
Female     53661
Name: Gender, dtype: int64

In [40]:
withFirstMovieYearDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 325375 entries, 0 to 325374
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   director_ids         325375 non-null  object 
 1   director_names       325375 non-null  object 
 2   movie_id             325375 non-null  object 
 3   title                325375 non-null  object 
 4   review_count_user    325375 non-null  int64  
 5   review_count_critic  325375 non-null  int64  
 6   rating_value         325375 non-null  float64
 7   rating_count         325375 non-null  int32  
 8   date_published       325375 non-null  object 
 9   release_date         325375 non-null  object 
 10  movie_year           325375 non-null  int32  
 11  birthYear            325375 non-null  int32  
 12  primaryProfession    325375 non-null  object 
 13  knownForTitles       325375 non-null  object 
 14  Gender               325375 non-null  object 
 15  CareerStartYear  

In [41]:
def roundOfMean(x):
    """Return the arithmetic mean of ``x`` rounded to two decimal places."""
    average = np.mean(x)
    return round(average, 2)

In [42]:
# groupByAggregationDirector = {
#     'movies_produced': ('movie_id','count'),
#     'Average_rating_value': ('rating_value',roundOfMean),
#     'Average_rating_count': ('rating_count',roundOfMean),
#     'career_Length': ('Career_Year_Number','max')
# }

# groupByListDirectors = ['director_ids','birthYear','deathYear','primaryProfession','knownForTitles','Gender']

### Choose if you want director specific data for all records or TV Episodes only records or movie only records

In [43]:
safeCopy = withFirstMovieYearDF.copy()

In [45]:
# tv_data = withFirstMovieYearDF[withFirstMovieYearDF.tv_or_movie=='TV'].reset_index(drop = True)
# movie_data = withFirstMovieYearDF[withFirstMovieYearDF.tv_or_movie=='Movie'].reset_index(drop = True)
# withFirstMovieYearDF = movie_data
# # withFirstMovieYearDF = tv_data

## Generate cumulative Director Dataframe 

In [55]:
# Columns that define one group per director
groupByListDirectors = ['director_ids']

print("Shape of dataset begore group by : ",withFirstMovieYearDF.shape)

# Collapse the movie-level rows into one row per director:
#   1. aggregate each column (descriptive columns take the group's first value,
#      ratings are averaged with roundOfMean, movies are counted, the largest
#      Career_Year_Number becomes the career length, inter event times take the median)
#   2. rename the aggregated columns to their per-director names
#   3. derive the two per-director rates
#   4. replace every remaining missing value with 0 (for example the 0/0 result of
#      average_inter_event_time for a director with a single movie)
cumulativeDirectorDF = (
    withFirstMovieYearDF
    .groupby(groupByListDirectors)
    .agg({
        'director_names': 'first',
        'movie_id': 'count',
        'rating_value': roundOfMean,
        'rating_count': roundOfMean,
        'Career_Year_Number': 'max',
        'Gender': 'first',
        'birthYear': 'first',
        'primaryProfession': 'first',
        'knownForTitles': 'first',
        'CareerStartYear': 'first',
        'inter_event_time': 'median',
    })
    .reset_index()
    .rename(columns={
        'movie_id': 'movies_produced',
        'rating_value': 'Average_rating_value',
        'rating_count': 'Average_rating_count',
        'Career_Year_Number': 'career_Length',
        'director_names': 'director_name',
        'inter_event_time': 'median_inter_event_time',
    })
    .assign(
        # movies per career year
        average_movie_per_year=lambda d: (d['movies_produced'] / d['career_Length']).round(2),
        # career years elapsed per gap between consecutive movies
        average_inter_event_time=lambda d: ((d['career_Length'] - 1) / (d['movies_produced'] - 1)).round(2),
    )
    .fillna(0)
)
print("Shape of dataset after group by : ",cumulativeDirectorDF.shape)

Shape of dataset begore group by :  (325375, 18)
Shape of dataset after group by :  (41629, 14)


In [48]:
cumulativeDirectorDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41629 entries, 0 to 41628
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   director_ids              41629 non-null  object 
 1   director_name             41629 non-null  object 
 2   movies_produced           41629 non-null  int64  
 3   Average_rating_value      41629 non-null  float64
 4   Average_rating_count      41629 non-null  float64
 5   career_Length             41629 non-null  int64  
 6   Gender                    41629 non-null  object 
 7   birthYear                 41629 non-null  int32  
 8   primaryProfession         41629 non-null  object 
 9   knownForTitles            41629 non-null  object 
 10  CareerStartYear           41629 non-null  int64  
 11  median_inter_event_time   41629 non-null  float64
 12  average_movie_per_year    41629 non-null  float64
 13  average_inter_event_time  41629 non-null  float64
dtypes: flo

## Generate Director Dataframe - Original data with average metascore value

In [None]:
# # Groupby List
# groupByListDirectors = ['director_ids']

# # Group by aggregation to generate columns
# directorDataDFwithMetascore = moviesWithMetaScoreDF.groupby(groupByListDirectors) \
#                                      .agg({'metascore':roundOfMean,
#                                           'movie_id':'count'}) \
#                                      .reset_index()
# # Columns renamed
# directorDataDFwithMetascore = directorDataDFwithMetascore.rename(columns={'metascore':'average_metascore',
#                                                                          'movie_id':'movies_produced_metascore',})
# # Merge the Directors Career Start Year with the original data set
# # directorDataDFwithMetascore = directorDataDFwithMetascore.merge(directorDataDF, on='director_ids',how = 'left')
# directorDataDFwithMetascore = directorDataDF.merge(directorDataDFwithMetascore, on='director_ids',how = 'left')
# directorDataDFwithMetascore['total_metascore'] = round(directorDataDFwithMetascore.average_metascore * directorDataDFwithMetascore.movies_produced_metascore,2)
# directorDataDFwithMetascore

In [50]:
cumulativeDirectorDF.Gender.value_counts()

Male      32638
Female     8991
Name: Gender, dtype: int64

In [51]:
# Share of directors recorded as Female. The value is looked up by its label:
# value_counts() orders by frequency, so a fixed position is not guaranteed to be
# Female, and integer keys on a label-indexed Series are a deprecated positional
# fallback. normalize=True divides every count by the total, and .get() yields 0.0
# if there is no Female row at all.
print(" Female Gender percentage of the cumulative dataframe : ",cumulativeDirectorDF.Gender.value_counts(normalize=True).get('Female', 0.0))

 Female Gender percentage of the cumulative dataframe :  0.21597924523769488


In [52]:
cumulativeDirectorDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41629 entries, 0 to 41628
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   director_ids              41629 non-null  object 
 1   director_name             41629 non-null  object 
 2   movies_produced           41629 non-null  int64  
 3   Average_rating_value      41629 non-null  float64
 4   Average_rating_count      41629 non-null  float64
 5   career_Length             41629 non-null  int64  
 6   Gender                    41629 non-null  object 
 7   birthYear                 41629 non-null  int32  
 8   primaryProfession         41629 non-null  object 
 9   knownForTitles            41629 non-null  object 
 10  CareerStartYear           41629 non-null  int64  
 11  median_inter_event_time   41629 non-null  float64
 12  average_movie_per_year    41629 non-null  float64
 13  average_inter_event_time  41629 non-null  float64
dtypes: flo

In [53]:
cumulativeDirectorDF.director_ids.nunique()

41629

In [54]:
save_to_file(cumulativeDirectorDF,saveFileToPath)

# End

# Plotting and Visualization

### Number of movies released along the Career Year Number

In [None]:
# Plotting a bar graph based on the first 20 value counts

def plot_value_count(df,columnName,toCount,plot_title = 'Value count of data'):
    """Draw a bar chart of the most frequent values of ``df[columnName]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to count.
    columnName : str
        Column whose values are counted; also used as the x-axis label.
    toCount : int
        Number of most frequent values to show.
    plot_title : str, optional
        Title of the figure.
    """
    # value_counts() sorts by descending frequency, so the first toCount entries
    # are the most frequent ones; .iloc makes the slice explicitly positional
    topCounts = df[columnName].value_counts().iloc[:toCount]
    plt.figure(figsize=(10,5))
    # Data vectors are passed by keyword: newer seaborn releases no longer accept them positionally
    sns.barplot(x=topCounts.index, y=topCounts.values, alpha=0.8)
    plt.title(plot_title)
    plt.ylabel('Number of occurence', fontsize=12)
    plt.xlabel(columnName, fontsize=12)
    plt.show()
    

In [None]:
plot_value_count(withFirstMovieYearDF,'Career_Year_Number',20,'Career Year Number and the movies produced')

### Plot features based on Gender

In [None]:
def plot_features_WRT_gender(df,columnCriteriaName,columnFeatureName):
    """Overlay the distribution of one feature for every category of a criteria column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    columnCriteriaName : str
        Column whose distinct values define the groups (for example "Gender").
    columnFeatureName : str
        Column whose values are plotted for each group.

    Returns
    -------
    pandas.DataFrame
        One column per group, named "<columnFeatureName>_<category>", holding
        that group's feature values.
    """
    diction = {}
    # value_counts().index lists the categories, most frequent first
    for category in df[columnCriteriaName].value_counts().index:
        groupName = "{0}_{1}".format(columnFeatureName, category)
        diction[groupName] = df.loc[df[columnCriteriaName] == category, columnFeatureName]
    for groupName, groupValues in diction.items():
        # The label is the plain string; wrapping it in a list made the legend
        # show the list's repr (brackets and quotes) instead of the name.
        # NOTE(review): distplot is deprecated in recent seaborn releases; it is
        # kept so the figure stays the same - consider histplot/kdeplot.
        sns.distplot(groupValues, label=groupName)
    if diction:
        # Build the legend once, after every group has been drawn
        plt.legend(bbox_to_anchor=(1.05, 0.6), loc=2, borderaxespad=0.)
    return pd.DataFrame.from_dict(diction)


In [None]:
x = plot_features_WRT_gender(data,"Gender","rating_value")

In [None]:
x = plot_features_WRT_gender(cumulativeDirectorDF,"Gender","median_inter_event_time")

In [None]:
plot_value_count(cumulativeDirectorDF[cumulativeDirectorDF.Gender == 'Female'],'median_inter_event_time',20)

In [None]:
# # Plotting a bar graph of the number of movies released in each Career Year Number, for the first 20 Career years listed

# career_year_count  = withFirstMovieYearDF['Career_Year_Number'].value_counts()
# career_year_count = career_year_count[:20,]
# plt.figure(figsize=(10,5))
# sns.barplot(career_year_count.index, career_year_count.values, alpha=0.8)
# plt.title('Career Year Number and the movies produced')
# plt.ylabel('Number of Movies', fontsize=12)
# plt.xlabel('Career Year Number', fontsize=12)
# plt.show()

In [None]:
# cumulativeDirectorDF.corr(method = 'pearson')
# cumulativeDirectorDF.corr(method = 'kendall')

In [None]:
cumulativeDirectorDF.columns

In [None]:
# Directors with this many movies or fewer are left out
n_movies_produced = 1

# Keep only the directors who produced more than n_movies_produced movies
removedDF = cumulativeDirectorDF.loc[cumulativeDirectorDF['movies_produced'].gt(n_movies_produced)]

In [None]:
# Plot Log normal Distribution

def log_normal_plot(df,columnName,binSize="auto"):
    """Draw a density-normalised histogram of ``df[columnName]`` with a logarithmic x-axis.

    ``binSize`` is forwarded unchanged as the ``bins`` argument of the
    histogram. The x-axis is labelled with the column name and the y-axis
    with "Frequency".
    """
    columnValues = df[columnName]
    columnValues.hist(bins=binSize, grid=False, density=True)
    plt.xlabel(columnName, fontsize=15)
    plt.ylabel("Frequency", fontsize=15)
    # Switch the x-axis to a log scale after the histogram has been drawn
    plt.xscale('log')

In [None]:
log_normal_plot(removedDF,"movies_produced")

In [None]:
# Normal distribution
def distribution_fit(df,columnName):
    """Plot a density histogram of ``df[columnName]`` with a fitted Gaussian on top.

    The Gaussian uses the empirical mean and the variance returned by
    ``np.var`` for the column. The curve is only drawn when that variance is
    positive; for a constant column the density formula would divide by zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column.
    columnName : str
        Numeric column to plot; also used as x-axis label and legend entry.
    """
    values = df[columnName]

    # Empirical average and variance are computed
    avg = np.mean(values)
    var = np.var(values)

    plt.figure()
    # Labelling the drawn artists lets the legend show their real colour and
    # line style instead of hand-built handles that could disagree with the plot
    plt.hist(values, 30, density=True, label=columnName)

    if var > 0:
        # Evaluate the fitted Gaussian density on 100 points spanning the data range
        pdf_x = np.linspace(np.min(values), np.max(values), 100)
        pdf_y = 1.0/np.sqrt(2*np.pi*var)*np.exp(-0.5*(pdf_x-avg)**2/var)
        plt.plot(pdf_x, pdf_y, 'k--', label='Fit')

    plt.xlabel(columnName)
    plt.legend()
    plt.show()

In [None]:
distribution_fit(removedDF,"movies_produced")

In [None]:
# def multi_features_distribution_fit_plot(df,dataTypesToInclude):
#     for column in df.select_dtypes(include=['int64','double']).columns:
#         distribution_fit(df,column)
        
# def multi_features_value_counts_plot(df,dataTypesToInclude):
#     for column in df.select_dtypes(include=['int64','double']).columns | ['Gender']:
#         plot_value_count(df,column,20)

In [None]:
def multi_features_plot(df,dataTypesToInclude,plotType = 'value_count'):
    """Draw one plot per selected column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted.
    dataTypesToInclude : list of str
        Dtypes handed to ``df.select_dtypes(include=...)`` to choose the
        columns, e.g. ``['int64', 'double']``.
    plotType : str, optional
        'value_count' (default) calls plot_value_count for every selected
        column plus 'Gender' when the frame has that column; 'distribution'
        calls distribution_fit; 'log_normal' calls log_normal_plot. Case and
        whitespace are ignored. Any other value prints "Oops".
    """
    plotType = plotType.lower().translate(str.maketrans('', '', string.whitespace))
    if plotType == 'value_count':
        # Use the caller's dtype selection instead of a hard-coded list
        selectedColumns = df.select_dtypes(include=dataTypesToInclude).columns
        if 'Gender' in df.columns:
            # Index.union is the explicit set union; the | operator on an Index
            # no longer performs one in current pandas
            selectedColumns = selectedColumns.union(['Gender'])
        for column in selectedColumns:
            plot_value_count(df,column,100)
    elif plotType == 'distribution':
        for column in df.select_dtypes(include=dataTypesToInclude).columns:
            distribution_fit(df,column)
    elif plotType == 'log_normal':
        for column in df.select_dtypes(include=dataTypesToInclude).columns:
            log_normal_plot(df,column)
    else:
        print("Oops")

In [None]:
cumulativeDirectorDF.average_movie_per_year.value_counts()

In [None]:
multi_features_plot(removedDF,['int64','double'])
# multi_features_plot(removedDF,['int64','double'],'distribution')
# multi_features_plot(removedDF,['int64','double'],'log_normal')

In [None]:
multi_features_plot(cumulativeDirectorDF,['int64','double'])
# multi_features_plot(directorDataDF,['int64','double'],'distribution')
# multi_features_plot(removedDF,['int64','double'],'log_normal')