In [184]:
import pandas as pd

# Das DataFrame Objekt

In [185]:
# Fetch the Daily Show guest list CSV from the FiveThirtyEight data repository on GitHub.
ds_guests = pd.read_csv(
    "https://raw.githubusercontent.com/fivethirtyeight/data/master/daily-show-guests/daily_show_guests.csv",
    header=0,  # take the column names from the first row of the file
    encoding="unicode_escape",
)

In [186]:
# Preview the first rows of the freshly loaded frame.
ds_guests.head()

Unnamed: 0,YEAR,GoogleKnowlege_Occupation,Show,Group,Raw_Guest_List
0,1999,actor,1/11/99,Acting,Michael J. Fox
1,1999,Comedian,1/12/99,Comedy,Sandra Bernhard
2,1999,television actress,1/13/99,Acting,Tracey Ullman
3,1999,film actress,1/14/99,Acting,Gillian Anderson
4,1999,actor,1/18/99,Acting,David Alan Grier


In [187]:
# rename() returns a new DataFrame unless inplace=True is passed; here the
# existing object is modified directly, so no reassignment is needed.
ds_guests.rename(
    {
        'Raw_Guest_List': 'Guest',
        'GoogleKnowlege_Occupation': 'Occupation',
        'Show': 'Date',
    },
    axis='columns',
    inplace=True,
)

In [188]:
# Check the result of the rename: the columns now read Occupation, Date and Guest.
ds_guests.head()

Unnamed: 0,YEAR,Occupation,Date,Group,Guest
0,1999,actor,1/11/99,Acting,Michael J. Fox
1,1999,Comedian,1/12/99,Comedy,Sandra Bernhard
2,1999,television actress,1/13/99,Acting,Tracey Ullman
3,1999,film actress,1/14/99,Acting,Gillian Anderson
4,1999,actor,1/18/99,Acting,David Alan Grier


In [189]:
# The column labels are themselves stored in an Index object.
ds_guests.columns

Index(['YEAR', 'Occupation', 'Date', 'Group', 'Guest'], dtype='object')

In [190]:
# The row labels: no index column was set, so this is a plain RangeIndex.
ds_guests.index

RangeIndex(start=0, stop=2693, step=1)

In [191]:
# Summary of dtypes and non-null counts per column (reveals missing values).
ds_guests.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2693 entries, 0 to 2692
Data columns (total 5 columns):
YEAR          2693 non-null int64
Occupation    2667 non-null object
Date          2693 non-null object
Group         2662 non-null object
Guest         2693 non-null object
dtypes: int64(1), object(4)
memory usage: 105.3+ KB


## Selektion (Teil 1)

In [192]:
# Attribute access: a handy shorthand, but it needs care when a column name clashes
# with an existing DataFrame attribute/method (e.g. min, index) or contains whitespace.
ds_guests.Group.head()

0    Acting
1    Comedy
2    Acting
3    Acting
4    Acting
Name: Group, dtype: object

In [193]:
# Standard access: bracket notation with the column name as a string.
ds_guests["Group"].head()

0    Acting
1    Comedy
2    Acting
3    Acting
4    Acting
Name: Group, dtype: object

In [196]:
# Multiple columns: pass a list of column names.
ds_guests[["Group", "Date"]].head()

Unnamed: 0,Group,Date
0,Acting,1/11/99
1,Comedy,1/12/99
2,Acting,1/13/99
3,Acting,1/14/99
4,Acting,1/18/99


In [197]:
# Integer-location index: select by position (here positions 1 to 4; the stop value 5 is excluded).
ds_guests["Group"].iloc[1:5]

1    Comedy
2    Acting
3    Acting
4    Acting
Name: Group, dtype: object

In [198]:
# As with Series objects, rows can also be selected with a True/False (boolean) Series.
ds_guests[ds_guests["Group"] == "Acting"].head()

Unnamed: 0,YEAR,Occupation,Date,Group,Guest
0,1999,actor,1/11/99,Acting,Michael J. Fox
2,1999,television actress,1/13/99,Acting,Tracey Ullman
3,1999,film actress,1/14/99,Acting,Gillian Anderson
4,1999,actor,1/18/99,Acting,David Alan Grier
5,1999,actor,1/19/99,Acting,William Baldwin


In [199]:
# Boolean selections can express compound conditions: each comparison is wrapped
# in parentheses and the masks are combined with & (element-wise AND).
ds_guests[(ds_guests["Group"] == "Acting") & (ds_guests["Occupation"] == "actor")].head()

Unnamed: 0,YEAR,Occupation,Date,Group,Guest
0,1999,actor,1/11/99,Acting,Michael J. Fox
4,1999,actor,1/18/99,Acting,David Alan Grier
5,1999,actor,1/19/99,Acting,William Baldwin
8,1999,actor,1/25/99,Acting,Matthew Lillard
11,1999,actor,1/28/99,Acting,D. L. Hughley


In [200]:
# Count the distinct values in the column; dropna=False counts a missing
# value (NaN) as its own entry.
ds_guests.Occupation.nunique(dropna=False)

399

In [201]:
# Number of distinct groups; dropna=False counts a missing value (NaN) as its own entry.
ds_guests.Group.nunique(dropna=False)

18

## DataFrame Modifikationen

In [202]:
# Like rename, drop returns a new DataFrame unless inplace=True is passed.
# errors="ignore" keeps the cell re-runnable: without it a second execution
# raises a KeyError because 'YEAR' has already been removed.
ds_guests.drop(columns=['YEAR'], inplace=True, errors="ignore")

In [203]:
# Estimate the number of forenames per guest as the number of spaces in the name.
# The vectorised .str.count replaces a per-row Python lambda and yields NaN for a
# missing name instead of raising an AttributeError.
ds_guests["Forename Number"] = ds_guests.Guest.str.count(" ")

In [204]:
# Descriptive statistics for the forename estimate; an unusually large max may
# point to entries that are not plain person names.
ds_guests["Forename Number"].describe()

count    2693.000000
mean        1.242481
std         0.893401
min         0.000000
25%         1.000000
50%         1.000000
75%         1.000000
max        15.000000
Name: Forename Number, dtype: float64

### Which guest name has 15 forenames?

In [205]:
# Pull the Guest values of every row whose forename estimate equals 15.
ds_guests.loc[ds_guests["Forename Number"] == 15, "Guest"].values

array(['Iraq - A Look Baq (or how we learned to stop reporting and love the war)',
       'Iraq - A Look Baq (or how we learned to stop reporting and love the war)'],
      dtype=object)

In [206]:
# Filter out the rows holding the identified 16-word string. The explicit .copy()
# makes the result an independent frame, so later column assignments on it do not
# trigger a SettingWithCopyWarning.
ds_guests = ds_guests[
    ds_guests.Guest != 'Iraq - A Look Baq (or how we learned to stop reporting and love the war)'
].copy()

In [207]:
# Repeat the summary after the filter: two rows fewer, and the maximum changes.
ds_guests["Forename Number"].describe()

count    2691.000000
mean        1.232256
std         0.811131
min         0.000000
25%         1.000000
50%         1.000000
75%         1.000000
max        12.000000
Name: Forename Number, dtype: float64

## Aufgaben:

    - Welcher Gast hat 12 Vornamen?
    - Identifiziere anhand der Vornamenanzahl alle Gastnamen, die keine natürlichen Personen darstellen (setze 5 als obere Grenze für die Vornamenanzahl).
    - Gibt es Duplikate in unserem Datensatz? Nutze hierzu die Funktionen "duplicated" und/oder "drop_duplicates".
    - Welcher Gast gehört keiner Gruppe an? Welcher Gast hat keine Occupation? Nutze hierzu die Funktionen pd.isnull und/oder pd.notnull.
    - Welche Spalte bietet sich als Index an?
    - Wie viele unterschiedliche Gäste gab es in der Gruppe ("Group") Politician?
    - Wann war Vin Diesel zu Gast?
    - Wann und wie oft war Will Ferrell zu Gast? Tipp: Er war öfter als zehnmal da.
        

    

## Selektion (Teil 2)

In [111]:
# Parse the date strings and use them as the row index. assign/set_index build a
# new frame instead of writing into the previously filtered one, which avoids a
# SettingWithCopyWarning and the in-place mutation.
ds_guests = ds_guests.assign(Date=pd.to_datetime(ds_guests.Date)).set_index("Date")

In [112]:
# Index-based (label) access via loc; a date string selects the rows of that day.
ds_guests.loc['2011-08-10']

Unnamed: 0_level_0,Occupation,Group,Guest,Forename Number
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011-08-10,lawyer,Misc,John Coffee,1


In [113]:
# Dates can also be used with the default [] notation, but only as a slice:
# ds_guests['2011-08-10'] is treated as a column lookup and raises a KeyError in
# pandas >= 2.0 (selecting rows that way was deprecated in pandas 1.2).
ds_guests['2011-08-10':'2011-08-10']

Unnamed: 0_level_0,Occupation,Group,Guest,Forename Number
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011-08-10,lawyer,Misc,John Coffee,1


In [114]:
# Partial-string slicing on the DatetimeIndex: select the rows from 2010 through 2011.
ds_guests.loc['2010':'2011']

Unnamed: 0_level_0,Occupation,Group,Guest,Forename Number
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-11,lawyer,Misc,John Yoo,1
2010-01-12,Editor,Media,Paul Ingrassia,1
2010-01-13,musician,Musician,Ringo Starr and The Ben Harper Band,6
2010-01-13,rock band,Musician,Ringo Starr and The Ben Harper Band,6
2010-01-14,Journalist,Media,Tom Brokaw,1
2010-01-18,comptroller of the us,Government,David M. Walker,2
2010-01-19,actor,Acting,Colin Firth,1
2010-01-20,writer,Media,Jim Wallis,1
2010-01-21,actress,Acting,Julie Andrews,1
2010-01-25,busines magnate,Business,Bill Gates,1


## GroupBy

In [121]:
# Built-in groupby aggregations include sum, mean, max, min, count, first and last.
# count() reports, per group, the number of non-null values in each remaining column.
ds_guests.groupby("Group").count()

Unnamed: 0_level_0,Occupation,Guest,Forename Number
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Academic,103,103,103
Acting,930,930,930
Advocacy,24,24,24
Athletics,52,52,52
Business,25,25,25
Clergy,8,8,8
Comedy,150,150,150
Consultant,18,18,18
Government,40,40,40
Media,751,751,751


# Aufgaben

- Erstelle eine Übersicht, die angibt, wie oft die einzelnen Gäste zu Besuch waren.
- Wie lauten die Top 3 Gäste pro Kategorie?
- Welche 3 Kategorien waren in der Zeitspanne 01.01.2006 - 01.01.2010 am häufigsten vertreten?
- Was waren die Top 3 Kategorien pro Jahr in der gesamten Zeitspanne? (Tipp: Verwende hierzu pd.Grouper)
- In welchen 10 Monaten waren die meisten Gäste zu Besuch?

