In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the three source workbooks in one pass; the order of the file names
# matches the order of the names on the left-hand side.
basic, breakthrough, last_work = map(
    pd.read_excel,
    (
        'Entertainer - Basic Info.xlsx',
        'Entertainer - Breakthrough Info.xlsx',
        'Entertainer - Last work Info.xlsx',
    ),
)

In [3]:
basic.head()

Unnamed: 0,Entertainer,Gender (traditional),Birth Year
0,Adele,F,1988
1,Angelina Jolie,F,1975
2,Aretha Franklin,F,1942
3,Bette Davis,F,1908
4,Betty White,F,1922


In [4]:
breakthrough.head()

Unnamed: 0,Entertainer,Year of Breakthrough/#1 Hit/Award Nomination,Breakthrough Name,Year of First Oscar/Grammy/Emmy
0,Adele,2008,19,2009.0
1,Angelina Jolie,1999,"Girl, Interrupted",1999.0
2,Aretha Franklin,1967,I Never Loved a Man (The Way I Love You),1968.0
3,Bette Davis,1934,Of Human Bondage,1935.0
4,Betty White,1952,Life with Elilzabeth,1976.0


In [5]:
last_work.head()

Unnamed: 0,Entertainer,Year of Last Major Work (arguable),Year of Death
0,Adele,2016,
1,Angelina Jolie,2016,
2,Aretha Franklin,2014,
3,Bette Davis,1989,1989.0
4,Betty White,2016,


In [3]:
# Combine the three sheets on the entertainer's name, one merge at a time
# (pd.merge defaults to an inner join, same as DataFrame.merge).
basic_with_breakthrough = pd.merge(basic, breakthrough, on='Entertainer')
df = pd.merge(basic_with_breakthrough, last_work, on='Entertainer')
df.head()

Unnamed: 0,Entertainer,Gender (traditional),Birth Year,Year of Breakthrough/#1 Hit/Award Nomination,Breakthrough Name,Year of First Oscar/Grammy/Emmy,Year of Last Major Work (arguable),Year of Death
0,Adele,F,1988,2008,19,2009.0,2016,
1,Angelina Jolie,F,1975,1999,"Girl, Interrupted",1999.0,2016,
2,Aretha Franklin,F,1942,1967,I Never Loved a Man (The Way I Love You),1968.0,2014,
3,Bette Davis,F,1908,1934,Of Human Bondage,1935.0,1989,1989.0
4,Betty White,F,1922,1952,Life with Elilzabeth,1976.0,2016,


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 70 entries, 0 to 69
Data columns (total 8 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Entertainer                                   70 non-null     object 
 1   Gender (traditional)                          70 non-null     object 
 2   Birth Year                                    70 non-null     int64  
 3   Year of Breakthrough/#1 Hit/Award Nomination  70 non-null     int64  
 4   Breakthrough Name                             70 non-null     object 
 5   Year of First Oscar/Grammy/Emmy               64 non-null     float64
 6   Year of Last Major Work (arguable)            70 non-null     int64  
 7   Year of Death                                 30 non-null     float64
dtypes: float64(2), int64(3), object(3)
memory usage: 4.9+ KB


## No Duplicate Values

In [8]:
df.duplicated().sum()

0

## Entertainer (every entertainer is unique)

In [9]:
df['Entertainer'].nunique()

70

## Gender

In [10]:
# Clean values
df['Gender (traditional)'].value_counts()

M    50
F    20
Name: Gender (traditional), dtype: int64

## Birth Year

In [11]:
# also clean
df['Birth Year'].value_counts().head()

1925    3
1954    3
1942    3
1908    3
1901    3
Name: Birth Year, dtype: int64

## Year of Breakthrough/#1 Hit/Award Nomination

In [12]:
# also clean
df['Year of Breakthrough/#1 Hit/Award Nomination'].value_counts().head()

1967    4
1984    3
1934    3
1963    3
1989    2
Name: Year of Breakthrough/#1 Hit/Award Nomination, dtype: int64

## Breakthrough Name

In [13]:
# Why does the same breakthrough title appear for two different entertainers?
df['Breakthrough Name'].value_counts().head()

I Want to Hold Your Hand    2
19                          1
Champion                    1
Mariah Carey                1
Like A Virgin               1
Name: Breakthrough Name, dtype: int64

In [14]:
# Both of these entertainers were members of "The Beatles", which is why they share the same breakthrough song
df[df['Breakthrough Name']== 'I Want to Hold Your Hand']

Unnamed: 0,Entertainer,Gender (traditional),Birth Year,Year of Breakthrough/#1 Hit/Award Nomination,Breakthrough Name,Year of First Oscar/Grammy/Emmy,Year of Last Major Work (arguable),Year of Death
37,John Lennon,M,1940,1963,I Want to Hold Your Hand,1965.0,1980,1980.0
57,Paul McCartney,M,1942,1963,I Want to Hold Your Hand,1965.0,2016,


## Year of First Oscar/Grammy/Emmy

In [15]:
df['Year of First Oscar/Grammy/Emmy'].isnull().sum()

6

In [16]:
# The column is float64 because the missing years are stored as NaN.
# Build the cleaned column in a single chain and assign it back to df:
# calling fillna()/replace() with inplace=True on df[...] operates on an
# intermediate Series and does not update df under pandas Copy-on-Write.
award_col = 'Year of First Oscar/Grammy/Emmy'
df[award_col] = (
    df[award_col]
    .fillna(0)                        # temporary sentinel so the int cast succeeds
    .astype('int')                    # e.g. 2009.0 -> 2009
    .replace({0: 'No Awards Won'})    # swap the sentinel for a readable label
)

In [17]:
df['Year of First Oscar/Grammy/Emmy'].value_counts().head()

No Awards Won    6
1962             4
1964             3
1976             3
1987             3
Name: Year of First Oscar/Grammy/Emmy, dtype: int64

## Year of Last Major Work (arguable)

In [18]:
df["Year of Last Major Work (arguable)"].value_counts().head()

2016    30
2015     3
2004     3
2014     3
1980     3
Name: Year of Last Major Work (arguable), dtype: int64

## Year of Death

In [19]:
df['Year of Death'].isnull().sum()

40

In [20]:
# The column is float64 because living entertainers have NaN here.
# Build the cleaned column in a single chain and assign it back to df:
# calling fillna()/replace() with inplace=True on df[...] operates on an
# intermediate Series and does not update df under pandas Copy-on-Write.
death_col = 'Year of Death'
df[death_col] = (
    df[death_col]
    .fillna(0)                   # temporary sentinel so the int cast succeeds
    .astype(int)                 # e.g. 1989.0 -> 1989
    .replace({0: 'Not Dead'})    # swap the sentinel for a readable label
)

In [21]:
df['Year of Death'].value_counts().head()

Not Dead    40
2016         4
1977         4
2003         3
1990         2
Name: Year of Death, dtype: int64

# Cleaned Data 

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 70 entries, 0 to 69
Data columns (total 8 columns):
 #   Column                                        Non-Null Count  Dtype 
---  ------                                        --------------  ----- 
 0   Entertainer                                   70 non-null     object
 1   Gender (traditional)                          70 non-null     object
 2   Birth Year                                    70 non-null     int64 
 3   Year of Breakthrough/#1 Hit/Award Nomination  70 non-null     int64 
 4   Breakthrough Name                             70 non-null     object
 5   Year of First Oscar/Grammy/Emmy               70 non-null     object
 6   Year of Last Major Work (arguable)            70 non-null     int64 
 7   Year of Death                                 70 non-null     object
dtypes: int64(3), object(5)
memory usage: 4.9+ KB


# Adding Images To the dataset with the help of Wikipedia API

In [23]:
!pip install wikipedia




[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [37]:
import wikipedia
import requests
import json

# MediaWiki "pageimages" query that asks for the original (full-size) image of
# a page; the percent-encoded page title is appended to this prefix.
# HTTPS keeps the query from being sent in clear text.
WIKI_REQUEST = 'https://en.wikipedia.org/w/api.php?action=query&prop=pageimages&format=json&piprop=original&titles='


def get_wiki_image(search_term, timeout=10):
    """Return the URL of the main image of the best-matching Wikipedia page.

    Parameters
    ----------
    search_term : str
        Free-text query; the first search hit is used.
    timeout : float, optional
        Seconds to wait for the image-lookup HTTP request before giving up.

    Returns
    -------
    str or int
        The image URL, or ``0`` when nothing could be found (no search hit,
        page lookup failed, HTTP error, or the page has no original image).
        The ``0`` sentinel is kept so existing callers keep working.
    """
    from urllib.parse import quote  # local import keeps this cell self-contained

    try:
        # Select the language edition before searching, not after.
        wikipedia.set_lang('en')
        result = wikipedia.search(search_term, results=1)
        if not result:
            return 0

        title = wikipedia.WikipediaPage(title=result[0]).title

        # Percent-encode the title so characters such as '&', '?' or '#'
        # cannot be interpreted as part of the query string.
        response = requests.get(WIKI_REQUEST + quote(title, safe=''), timeout=timeout)
        response.raise_for_status()

        pages = response.json()['query']['pages']
        # The response is keyed by page id; only one title was requested.
        return next(iter(pages.values()))['original']['source']
    except Exception:
        # Best-effort lookup: any library/network/parsing failure yields the
        # sentinel. Unlike a bare `except:`, KeyboardInterrupt and SystemExit
        # still propagate, so a long batch of lookups can be interrupted.
        return 0

wiki_image = get_wiki_image('Angelina Jolie')

In [38]:
wiki_image

'https://upload.wikimedia.org/wikipedia/commons/c/c3/Angelina_Jolie_at_the_U.S._Department_of_State_in_Washington%2C_D.C._in_2022.jpg'

In [25]:
# One lookup per entertainer; the function is passed directly, so no
# wrapping lambda is needed.
df['images'] = df['Entertainer'].apply(get_wiki_image)

In [58]:
df.head()

Unnamed: 0,Entertainer,Gender (traditional),Birth Year,Year of Breakthrough/#1 Hit/Award Nomination,Breakthrough Name,Year of First Oscar/Grammy/Emmy,Year of Last Major Work (arguable),Year of Death,images
0,Adele,F,1988,2008,19,2009,2016,Not Dead,https://upload.wikimedia.org/wikipedia/commons...
1,Angelina Jolie,F,1975,1999,"Girl, Interrupted",1999,2016,Not Dead,https://upload.wikimedia.org/wikipedia/commons...
2,Aretha Franklin,F,1942,1967,I Never Loved a Man (The Way I Love You),1968,2014,Not Dead,https://upload.wikimedia.org/wikipedia/commons...
3,Bette Davis,F,1908,1934,Of Human Bondage,1935,1989,1989,https://upload.wikimedia.org/wikipedia/commons...
4,Betty White,F,1922,1952,Life with Elilzabeth,1976,2016,Not Dead,https://upload.wikimedia.org/wikipedia/commons...


In [59]:
# Save the cleaned frame (with the image-link column) as CSV; index=False
# leaves the row index out of the file.
df.to_csv('Entertainer.csv', index=False)