In [9]:
# Import Dependencies
import pandas as pd

In [10]:
# Location of the TED Talks CSV, relative to the notebook's working directory
csv_path = "Resources/ted_talks.csv"

# Parse the CSV into a DataFrame and preview its first five rows
ted_df = pd.read_csv(filepath_or_buffer=csv_path)
ted_df.head(n=5)

Unnamed: 0,comments,description,duration,event,languages,main_speaker,name,title,views
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,Do schools kill creativity?,47227110
1,265,With the same humor and humanity he exuded in ...,977,TED2006,43,Al Gore,Al Gore: Averting the climate crisis,Averting the climate crisis,3200520
2,124,New York Times columnist David Pogue takes aim...,1286,TED2006,26,David Pogue,David Pogue: Simplicity sells,Simplicity sells,1636292
3,200,"In an emotionally charged talk, MacArthur-winn...",1116,TED2006,35,Majora Carter,Majora Carter: Greening the ghetto,Greening the ghetto,1697550
4,593,You've never seen data presented like this. Wi...,1190,TED2006,48,Hans Rosling,Hans Rosling: The best stats you've ever seen,The best stats you've ever seen,12005869


In [15]:
# Show the largest view count, then the smallest, each on its own line
print(ted_df["views"].max(), ted_df["views"].min(), sep="\n")

47227110
50443


In [20]:
# Bin edges for TED Talk view counts; each pair of neighbouring edges bounds one bin
bins = (
    0,
    199999,
    399999,
    599999,
    799999,
    999999,
    1999999,
    2999999,
    3999999,
    4999999,
    50000000,
)

# One display label per bin, listed in the same order as the bins themselves
group_labels = [
    "0 to 199k",
    "200k to 399k",
    "400k to 599k",
    "600k to 799k",
    "800k to 999k",
    "1mil to 2mil",
    "2mil to 3mil",
    "3mil to 4mil",
    "4mil to 5mil",
    "5mil to 50mil",
]

In [21]:
# Preview the binning: label each talk's view count with its bin and show the first five
ted_df["views"].pipe(pd.cut, bins=bins, labels=group_labels).head()

0    5mil to 50mil
1     3mil to 4mil
2     1mil to 2mil
3     1mil to 2mil
4    5mil to 50mil
Name: views, dtype: category
Categories (10, object): [0 to 199k < 200k to 399k < 400k to 599k < 600k to 799k ... 2mil to 3mil < 3mil to 4mil < 4mil to 5mil < 5mil to 50mil]

In [22]:
# Store each talk's bin label in a new "View Group" column, then preview the frame
ted_df["View Group"] = pd.cut(x=ted_df["views"], bins=bins, labels=group_labels)
ted_df.head(n=5)

Unnamed: 0,comments,description,duration,event,languages,main_speaker,name,title,views,View Group
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,Do schools kill creativity?,47227110,5mil to 50mil
1,265,With the same humor and humanity he exuded in ...,977,TED2006,43,Al Gore,Al Gore: Averting the climate crisis,Averting the climate crisis,3200520,3mil to 4mil
2,124,New York Times columnist David Pogue takes aim...,1286,TED2006,26,David Pogue,David Pogue: Simplicity sells,Simplicity sells,1636292,1mil to 2mil
3,200,"In an emotionally charged talk, MacArthur-winn...",1116,TED2006,35,Majora Carter,Majora Carter: Greening the ghetto,Greening the ghetto,1697550,1mil to 2mil
4,593,You've never seen data presented like this. Wi...,1190,TED2006,48,Hans Rosling,Hans Rosling: The best stats you've ever seen,The best stats you've ever seen,12005869,5mil to 50mil


In [30]:
# Group the talks by their "View Group" label, then count the non-null views in each group
ted_group = ted_df.groupby(by="View Group")
ted_group["views"].agg("count")

View Group
0 to 199k          32
200k to 399k      135
400k to 599k      234
600k to 799k      307
800k to 999k      339
1mil to 2mil     1004
2mil to 3mil      239
3mil to 4mil       93
4mil to 5mil       68
5mil to 50mil      99
Name: views, dtype: int64

In [23]:
# Find how many rows fall into each bin. size() counts every row in a group,
# whereas count() on a single column would skip rows where that column is missing.
print(ted_group.size())

# Get the average of each selected numeric column within the GroupBy object
ted_group[["comments", "duration", "languages"]].mean()

View Group
0 to 199k          32
200k to 399k      135
400k to 599k      234
600k to 799k      307
800k to 999k      339
1mil to 2mil     1004
2mil to 3mil      239
3mil to 4mil       93
4mil to 5mil       68
5mil to 50mil      99
Name: comments, dtype: int64


Unnamed: 0_level_0,comments,duration,languages
View Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0 to 199k,76.9375,898.1875,4.0625
200k to 399k,81.992593,832.192593,18.785185
400k to 599k,107.162393,870.517094,22.940171
600k to 799k,118.912052,829.039088,24.400651
800k to 999k,119.628319,798.772861,25.678466
1mil to 2mil,168.136454,809.899402,27.899402
2mil to 3mil,299.481172,832.430962,32.807531
3mil to 4mil,360.870968,809.505376,34.258065
4mil to 5mil,507.088235,920.514706,35.720588
5mil to 50mil,650.393939,884.282828,40.252525


In [35]:
# Per-bin summary statistics of views; describe() on the grouped column already
# returns a DataFrame, so no extra constructor call is needed
summary_df = ted_group["views"].describe()

In [37]:
# Move "View Group" out of the index into a regular column.
# The frame is rebuilt from ted_group instead of from summary_df itself so the
# cell can be re-run safely: resetting an already-reset frame would insert a
# stray "index" column.
summary_df = ted_group["views"].describe().reset_index()

In [38]:
# Display the per-bin summary statistics of views
summary_df

Unnamed: 0,View Group,count,mean,std,min,25%,50%,75%,max
0,0 to 199k,32.0,149801.0,37781.2,50443.0,124084.0,155296.5,177998.75,197139.0
1,200k to 399k,135.0,322119.1,55168.18,200726.0,288000.5,326663.0,368872.0,399332.0
2,400k to 599k,234.0,503840.3,59470.05,400082.0,449478.25,505111.0,555985.5,599444.0
3,600k to 799k,307.0,705836.7,54308.79,602024.0,659557.0,707788.0,750702.0,799891.0
4,800k to 999k,339.0,902295.9,58176.72,800001.0,852103.0,904520.0,953748.0,999700.0
5,1mil to 2mil,1004.0,1368263.0,264676.9,1000194.0,1138367.25,1332758.0,1558242.5,1999097.0
6,2mil to 3mil,239.0,2390719.0,272713.0,2004123.0,2155786.5,2362727.0,2596972.5,2991225.0
7,3mil to 4mil,93.0,3440350.0,290722.6,3005687.0,3183828.0,3399887.0,3685420.0,3982352.0
8,4mil to 5mil,68.0,4468781.0,295426.5,4016531.0,4201555.5,4479838.5,4693843.5,4984884.0
9,5mil to 50mil,99.0,10838010.0,7371699.0,5006241.0,6480067.5,8218896.0,12597728.0,47227110.0
