diff --git a/pandas.ipynb b/pandas.ipynb index f0a8a33..475ad0f 100644 --- a/pandas.ipynb +++ b/pandas.ipynb @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -32,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -96,9 +96,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([5, 7, 9])" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Numpy array addition: \n", "\n", @@ -108,9 +119,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([1, 2, 3, 4, 5, 6])" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Numpy array concatenation: \n", "\n", @@ -153,9 +175,108 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
total_billtipsexsmokerdaytimesize
016.991.01FemaleNoSunDinner2
110.341.66MaleNoSunDinner3
221.013.50MaleNoSunDinner3
323.683.31MaleNoSunDinner2
424.593.61FemaleNoSunDinner4
\n", + "
" + ], + "text/plain": [ + " total_bill tip sex smoker day time size\n", + "0 16.99 1.01 Female No Sun Dinner 2\n", + "1 10.34 1.66 Male No Sun Dinner 3\n", + "2 21.01 3.50 Male No Sun Dinner 3\n", + "3 23.68 3.31 Male No Sun Dinner 2\n", + "4 24.59 3.61 Female No Sun Dinner 4" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import pandas as pd\n", "\n", @@ -164,6 +285,28 @@ "tips.head(5)" ] }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pandas.core.series.Series" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(tips)\n", + "\n", + "type(tips.tip)" + ] + }, { "cell_type": "markdown", "metadata": { @@ -203,7 +346,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -213,9 +356,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "1.0" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Accessing a certain value via the index\n", "\n", @@ -224,9 +378,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 1., 15., -5., nan, 4., 123., 0., 78., 0., 5., -4.])" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Note that there are a bunch of attributes.\n", "# .values returns a numpy ndarray of the values! \n", @@ -236,21 +401,58 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "RangeIndex(start=0, stop=11, step=1)" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Take a look at the index. What type is it? \n", "# You convert itto a numpy ndarray by adding \".values\" again!\n", "\n", - "my_series.index" + "my_series.index\n", + "\n", + "#type(my_series.index)\n", + "\n", + "#np.arange([1,2,3])" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "om 1.0\n", + "ir 15.0\n", + "os -5.0\n", + "pap NaN\n", + "pas 4.0\n", + "pil 123.0\n", + "io 0.0\n", + "po 78.0\n", + "ulos 0.0\n", + "is 5.0\n", + "best -4.0\n", + "dtype: float64" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# You can overwrite the index directly: \n", "\n", @@ -276,9 +478,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(1.0, 1.0)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Series that have string indices can also be accessed via a RangeIndex\n", "# (which is similar to the index of a regular Python list)\n", @@ -288,9 +501,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(1.0, -5.0)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Note that indices can get moved around, by sorting for example!\n", "# iloc gives you the element you would get if the Series\n", @@ -301,7 +525,9 @@ "\n", "x = my_series.sort_values()\n", "\n", - "x[0], x.iloc[0]" + "x\n", + "\n", + "x[0], x.iloc[0] # iloc returns the first item in the column, not the value with index 0" ] }, { @@ -321,13 +547,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": { "slideshow": { "slide_type": "slide" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 True\n", + "1 False\n", + "2 True\n", + "3 True\n", + "dtype: bool\n" + ] + } + ], "source": [ "Series1 = pd.Series([1,3,5,7])\n", "Series2 = pd.Series([0,10,-1,6])\n", @@ -336,6 +574,7 @@ "\n", "Series4 = Series1 > Series2 \n", "\n", + "print(Series4)\n", "# Take a look at the different Series objects!" ] }, @@ -364,17 +603,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": { "slideshow": { "slide_type": "slide" } }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "nan" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "Series1 = pd.Series([1,10],index=[\"om\",\"iros\"])\n", "Series2 = pd.Series([4,-1],index=[\"pap\",\"as\"])\n", - "Series3 = Series1 + Series2" + "Series3 = Series1 + Series2\n", + "\n", + "#Series2['om']\n", + "Series3.iloc[0]/5" ] }, { @@ -388,6 +641,32 @@ "This aspect makes it very easy to work with series that we have sorted or manipulated otherwise; there is always the address to access a value. This helps prevent accidentally combining values we didn't mean to combine!" ] }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "foo 1\n", + "fog 2\n", + "dtype: int64" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "my_series=pd.Series([1,2,3],index=['foo','bar','baz'])\n", + "\n", + "my_series=pd.Series({'foo':1,'fog':2})\n", + "\n", + "my_series" + ] + }, { "cell_type": "markdown", "metadata": { @@ -414,21 +693,58 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "om 1.0\n", + "pap NaN\n", + "dtype: float64" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# accesing by list of index labels\n", "\n", "my_series.index = [\"om\",\"ir\",\"os\",\"pap\",\"pas\",\"pil\",\"io\",\"po\",\"ulos\",\"is\",\"best\"]\n", - "x = my_series[[\"om\",\"pap\"]]" + "x = my_series[[\"om\",\"pap\"]]\n", + "\n", + "x" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "om False\n", + "ir False\n", + "os False\n", + "pap False\n", + "pas False\n", + "pil False\n", + "io True\n", + "po False\n", + "ulos True\n", + "is False\n", + "best False\n", + "dtype: bool" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# getting a boolean-valued series by checking a condition\n", "\n", @@ -438,14 +754,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 41, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "io 0.0\n", + "ulos 0.0\n", + "dtype: float64\n", + "io 0.0\n", + "ulos 0.0\n", + "dtype: float64\n" + ] + } + ], "source": [ "# Notice the index of x is a SUBSET of the index of \"my_series\"\n", "# This can be useful when needing to relate values back to the original \"my_series\"!\n", "\n", - "x = my_series[choose]" + "x = my_series[choose]\n", + "\n", + "y=my_series[my_series==0]\n", + "\n", + "print(y)\n", + "print(x)" ] }, { @@ -468,16 +802,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 46, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "om 1.0\n", + "pas 4.0\n", + "is 5.0\n", + "dtype: float64" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Challenge: \n", "\n", "# Filter \"my_series\" to be all the elements that are NOT\n", "# equal to 0, using the \"choose\" boolean mask below: \n", "\n", - "choose = my_series == 0.0\n" + "choose = my_series == 0.0\n", + "\n", + "x=my_series[~choose]\n", + "\n", + "x\n", + "\n", + "my_series[(my_series<10.)& (my_series>0.)]\n" ] }, { @@ -543,9 +897,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 47, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0 foo\n", + "1 bar\n", + "3 baz\n", + "4 qux\n", + "dtype: object" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Challenge: \n", "# Get a list of names, without the Null values!\n", @@ -554,7 +923,11 @@ "# 1. Create a boolean mask by using the .notna() method.\n", "# 2. Use the mask to subset the Series.\n", "\n", - "names = pd.Series(['foo','bar',None,'baz','qux',None])\n" + "names = pd.Series(['foo','bar',None,'baz','qux',None])\n", + "\n", + "mask=names.notna()\n", + "\n", + "names[mask]\n" ] }, { @@ -583,9 +956,44 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 57, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "foo\n", + "bar\n", + "None\n", + "foo\n", + "None\n", + "bar\n", + "bar\n", + "foo\n", + "None\n" + ] + }, + { + "data": { + "text/plain": [ + "0 foo\n", + "1 bar\n", + "2 None\n", + "3 foo\n", + "4 None\n", + "5 bar\n", + "6 bar\n", + "7 foo\n", + "8 None\n", + "dtype: object" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Challenge: \n", "\n", @@ -595,26 +1003,57 @@ "\n", "\n", "def lower(s):\n", - " # Your code here\n", - " # HINT: delete the \"pass\" when your done\n", - " # HINT2: handle None values!\n", - " pass\n", + " \n", + " return s.map(lambda x:x.lower(), na_action='ignore')\n", + "\n", + "\n", + "# Easier to test\n", + "def lower(s):\n", + " try:\n", + " return s.lower()\n", + " except AttributeError:\n", + " return None\n", + " \n", + "\n", "\n", + "names = pd.Series(['Foo', 'BAR', None, 'foo', None, 'bar', 'bAR', 'foo', None])\n", "\n", - "names = pd.Series(['Foo', 'BAR', None, 'foo', None, 'bar', 'bAR', 'foo', None])" + "lower(names)\n", + "names.map(lower)\n", + "\n", + "for i in names:\n", + " print(lower(i))\n", + " \n", + "pd.Series([lower(n) for n in names])" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 65, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([0.33333333])" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Challenge: \n", "\n", "# Using the series from above, now lowercased, count the occurences of each name\n", "# Hint: It's simple, just use .value_counts()!\n", - "\n" + "\n", + "names.map(lower).value_counts(dropna=False)\n", + "\n", + "names.map(lower).isna().sum()/names.shape\n", + "\n", + "type(names.shape)\n" ] }, { @@ -636,9 +1075,163 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 66, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
total_billtipsexsmokerdaytimesize
016.991.01FemaleNoSunDinner2
110.341.66MaleNoSunDinner3
221.013.50MaleNoSunDinner3
323.683.31MaleNoSunDinner2
424.593.61FemaleNoSunDinner4
525.294.71MaleNoSunDinner4
68.772.00MaleNoSunDinner2
726.883.12MaleNoSunDinner4
815.041.96MaleNoSunDinner2
914.783.23MaleNoSunDinner2
\n", + "
" + ], + "text/plain": [ + " total_bill tip sex smoker day time size\n", + "0 16.99 1.01 Female No Sun Dinner 2\n", + "1 10.34 1.66 Male No Sun Dinner 3\n", + "2 21.01 3.50 Male No Sun Dinner 3\n", + "3 23.68 3.31 Male No Sun Dinner 2\n", + "4 24.59 3.61 Female No Sun Dinner 4\n", + "5 25.29 4.71 Male No Sun Dinner 4\n", + "6 8.77 2.00 Male No Sun Dinner 2\n", + "7 26.88 3.12 Male No Sun Dinner 4\n", + "8 15.04 1.96 Male No Sun Dinner 2\n", + "9 14.78 3.23 Male No Sun Dinner 2" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "tips = pd.read_csv(\"tips.csv\")\n", "tips.head(10) # the first method of our dataframe object! " @@ -646,15 +1239,58 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 67, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# the other important attribute: name of rows and columns\n", "tips.index\n", "tips.columns" ] }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 Female\n", + "1 Male\n", + "2 Male\n", + "3 Male\n", + "4 Female\n", + " ... \n", + "239 Male\n", + "240 Female\n", + "241 Male\n", + "242 Male\n", + "243 Female\n", + "Name: sex, Length: 244, dtype: object" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tips.size\n", + "tips.sex" + ] + }, { "cell_type": "markdown", "metadata": { @@ -679,6 +1315,33 @@ "```" ] }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "foo 1\n", + "foo 2\n", + "dtype: int64" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tips[\"size\"].corr(tips.tip)\n", + "\n", + "# Not unique indexes\n", + "s=pd.Series([1,2],index=['foo','foo'])\n", + "\n", + "s['foo']" + ] + }, { "cell_type": "markdown", "metadata": { @@ -733,9 +1396,60 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 70, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sexsmoker
1MaleNo
3MaleNo
\n", + "
" + ], + "text/plain": [ + " sex smoker\n", + "1 Male No\n", + "3 Male No" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Accessing rows AND columns!\n", "# Example of 2-dimension loc\n", @@ -745,27 +1459,309 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 82, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sexsmokerdaytimesize
0FemaleNoSunDinner2
3MaleNoSunDinner2
\n", + "
" + ], + "text/plain": [ + " sex smoker day time size\n", + "0 Female No Sun Dinner 2\n", + "3 Male No Sun Dinner 2" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Accessing rows AND columns!\n", "# Example of 2-dimensional iloc\n", "\n", - "tips.iloc[[1,3], 2:]" + "tips.iloc[[1,3], 2:]\n", + "tips.iloc[[0,3], 2:]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 83, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sextipday
20Male4.08Sat
21Female2.75Sat
22Female2.23Sat
23Male7.58Sat
24Male3.18Sat
25Male2.34Sat
26Male2.00Sat
27Male2.00Sat
28Male4.30Sat
29Female3.00Sat
30Male1.45Sat
31Male2.50Sat
32Female3.00Sat
33Female2.45Sat
34Male3.27Sat
35Male3.60Sat
36Male2.00Sat
37Female3.07Sat
38Male2.31Sat
39Male5.00Sat
40Male2.24Sat
41Male2.54Sun
42Male3.06Sun
43Male1.32Sun
44Male5.60Sun
\n", + "
" + ], + "text/plain": [ + " sex tip day\n", + "20 Male 4.08 Sat\n", + "21 Female 2.75 Sat\n", + "22 Female 2.23 Sat\n", + "23 Male 7.58 Sat\n", + "24 Male 3.18 Sat\n", + "25 Male 2.34 Sat\n", + "26 Male 2.00 Sat\n", + "27 Male 2.00 Sat\n", + "28 Male 4.30 Sat\n", + "29 Female 3.00 Sat\n", + "30 Male 1.45 Sat\n", + "31 Male 2.50 Sat\n", + "32 Female 3.00 Sat\n", + "33 Female 2.45 Sat\n", + "34 Male 3.27 Sat\n", + "35 Male 3.60 Sat\n", + "36 Male 2.00 Sat\n", + "37 Female 3.07 Sat\n", + "38 Male 2.31 Sat\n", + "39 Male 5.00 Sat\n", + "40 Male 2.24 Sat\n", + "41 Male 2.54 Sun\n", + "42 Male 3.06 Sun\n", + "43 Male 1.32 Sun\n", + "44 Male 5.60 Sun" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Challenge:\n", "\n", "# Using the tips dataframe, create a new one that contains the \n", "# information contained in all rows between the 20th (inclusive) \n", - "# and the 45th (exclusive) and only the columns: tip, sex, day" + "# and the 45th (exclusive) and only the columns: tip, sex, day\n", + "\n", + "#t=tips[[\"sex\",\"tip\",\"day\"]]\n", + "\n", + "#t.iloc[20:45,:]\n", + "\n", + "tips.loc[20:44,['sex','tip','day']]" ] }, { @@ -785,6 +1781,193 @@ "etc\n" ] }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
total_billtipsexsmokerdaytimesize
016.991.01FemaleNoSunDinner2
110.341.66MaleNoSunDinner3
221.013.50FemaleNoSunDinner3
323.683.31fooNoSunDinner2
424.593.61FemaleNoSunDinner4
........................
23929.035.92fooNoSatDinner3
24027.182.00FemaleYesSatDinner2
24122.672.00MaleYesSatDinner2
24217.821.75MaleNoSatDinner2
24318.783.00FemaleNoThurDinner2
\n", + "

244 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " total_bill tip sex smoker day time size\n", + "0 16.99 1.01 Female No Sun Dinner 2\n", + "1 10.34 1.66 Male No Sun Dinner 3\n", + "2 21.01 3.50 Female No Sun Dinner 3\n", + "3 23.68 3.31 foo No Sun Dinner 2\n", + "4 24.59 3.61 Female No Sun Dinner 4\n", + ".. ... ... ... ... ... ... ...\n", + "239 29.03 5.92 foo No Sat Dinner 3\n", + "240 27.18 2.00 Female Yes Sat Dinner 2\n", + "241 22.67 2.00 Male Yes Sat Dinner 2\n", + "242 17.82 1.75 Male No Sat Dinner 2\n", + "243 18.78 3.00 Female No Thur Dinner 2\n", + "\n", + "[244 rows x 7 columns]" + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tips.loc[2]\n", + "tips.loc[2,[\"sex\",\"smoker\"]]\n", + "\n", + "# Set something in the dataframe\n", + "tips.loc[2,\"sex\"]=\"Female\"\n", + "\n", + "tips.loc[2,[\"sex\",\"smoker\"]]\n", + "\n", + "tips.loc[(tips.sex=='Male')&(tips.tip>2.),'sex']=\"foo\"\n", + "\n", + "tips" + ] + }, { "cell_type": "markdown", "metadata": { @@ -834,17 +2017,669 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 96, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
total_billtipsexsmokerdaytimesize
323.683.31fooNoSunDinner2
18323.176.50fooYesSunDinner4
10725.214.29fooYesSatDinner2
2613.372.00MaleNoSatDinner2
22516.272.50FemaleYesFriLunch2
525.294.71fooNoSunDinner4
1816.973.50FemaleNoSunDinner3
23024.012.00MaleYesSatDinner4
21125.895.16fooYesSatDinner4
13816.002.00MaleYesThurLunch2
5726.411.50FemaleNoSatDinner2
3417.783.27fooNoSatDinner2
17934.633.55fooYesSunDinner2
15529.855.14FemaleNoSunDinner5
1920.653.35fooNoSatDinner3
15324.552.00MaleNoSunDinner4
1489.781.73MaleNoThurLunch2
10011.352.50FemaleYesFriDinner2
23510.071.25MaleNoSatDinner2
016.991.01FemaleNoSunDinner2
2017.924.08fooNoSatDinner2
21628.153.00fooYesSatDinner5
13714.152.00FemaleNoThurLunch2
16620.762.24fooNoSunDinner2
12214.262.50fooNoThurLunch2
2215.772.23FemaleNoSatDinner2
2228.581.92MaleYesFriLunch1
3215.063.00FemaleNoSatDinner2
3716.933.07FemaleNoSatDinner3
21512.901.10FemaleYesSatDinner2
7117.073.00FemaleNoSatDinner3
4918.043.00fooNoSunDinner2
13120.272.83FemaleNoThurLunch2
221.013.50FemaleNoSunDinner3
439.681.32MaleNoSunDinner2
4828.552.05fooNoSunDinner3
23310.771.47MaleNoSatDinner2
1358.511.25FemaleNoThurLunch2
5519.493.51fooNoSunDinner2
1957.561.44MaleNoThurLunch2
10914.314.00FemaleYesSatDinner2
23929.035.92fooNoSatDinner3
11629.935.07fooNoSunDinner4
2919.653.00FemaleNoSatDinner2
9316.324.30FemaleYesFriDinner2
11323.952.55fooNoSunDinner2
6820.232.01fooNoSatDinner2
4213.943.06fooNoSunDinner2
7917.292.71fooNoThurLunch2
\n", + "
" + ], + "text/plain": [ + " total_bill tip sex smoker day time size\n", + "3 23.68 3.31 foo No Sun Dinner 2\n", + "183 23.17 6.50 foo Yes Sun Dinner 4\n", + "107 25.21 4.29 foo Yes Sat Dinner 2\n", + "26 13.37 2.00 Male No Sat Dinner 2\n", + "225 16.27 2.50 Female Yes Fri Lunch 2\n", + "5 25.29 4.71 foo No Sun Dinner 4\n", + "18 16.97 3.50 Female No Sun Dinner 3\n", + "230 24.01 2.00 Male Yes Sat Dinner 4\n", + "211 25.89 5.16 foo Yes Sat Dinner 4\n", + "138 16.00 2.00 Male Yes Thur Lunch 2\n", + "57 26.41 1.50 Female No Sat Dinner 2\n", + "34 17.78 3.27 foo No Sat Dinner 2\n", + "179 34.63 3.55 foo Yes Sun Dinner 2\n", + "155 29.85 5.14 Female No Sun Dinner 5\n", + "19 20.65 3.35 foo No Sat Dinner 3\n", + "153 24.55 2.00 Male No Sun Dinner 4\n", + "148 9.78 1.73 Male No Thur Lunch 2\n", + "100 11.35 2.50 Female Yes Fri Dinner 2\n", + "235 10.07 1.25 Male No Sat Dinner 2\n", + "0 16.99 1.01 Female No Sun Dinner 2\n", + "20 17.92 4.08 foo No Sat Dinner 2\n", + "216 28.15 3.00 foo Yes Sat Dinner 5\n", + "137 14.15 2.00 Female No Thur Lunch 2\n", + "166 20.76 2.24 foo No Sun Dinner 2\n", + "122 14.26 2.50 foo No Thur Lunch 2\n", + "22 15.77 2.23 Female No Sat Dinner 2\n", + "222 8.58 1.92 Male Yes Fri Lunch 1\n", + "32 15.06 3.00 Female No Sat Dinner 2\n", + "37 16.93 3.07 Female No Sat Dinner 3\n", + "215 12.90 1.10 Female Yes Sat Dinner 2\n", + "71 17.07 3.00 Female No Sat Dinner 3\n", + "49 18.04 3.00 foo No Sun Dinner 2\n", + "131 20.27 2.83 Female No Thur Lunch 2\n", + "2 21.01 3.50 Female No Sun Dinner 3\n", + "43 9.68 1.32 Male No Sun Dinner 2\n", + "48 28.55 2.05 foo No Sun Dinner 3\n", + "233 10.77 1.47 Male No Sat Dinner 2\n", + "135 8.51 1.25 Female No Thur Lunch 2\n", + "55 19.49 3.51 foo No Sun Dinner 2\n", + "195 7.56 1.44 Male No Thur Lunch 2\n", + "109 14.31 4.00 Female Yes Sat Dinner 2\n", + "239 29.03 5.92 foo No Sat Dinner 3\n", + "116 29.93 5.07 foo No Sun Dinner 4\n", + "29 19.65 3.00 Female No Sat Dinner 2\n", + "93 16.32 4.30 Female Yes Fri Dinner 2\n", + "113 23.95 2.55 foo No Sun Dinner 2\n", + "68 20.23 2.01 foo No Sat Dinner 2\n", + "42 13.94 3.06 foo No Sun Dinner 2\n", + "79 17.29 2.71 foo No Thur Lunch 2" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tips.sample(frac=0.2)" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tipsize
tip1.0000000.387542
size0.3875421.000000
\n", + "
" + ], + "text/plain": [ + " tip size\n", + "tip 1.000000 0.387542\n", + "size 0.387542 1.000000" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Challenge:\n", "\n", "# Using the tips dataframe, calculate the correlation between\n", "# tip and size for only Male clients during Dinner. \n", "\n", + "#tips[(tips.time==\"Dinner\") &(tips.sex==\"Male\")][[\"tip\",\"size\"]].corr()\n", + "\n", "# HINT: Remember that \"size\" cannot be accessed via dot notation, as it's an \n", - "# attribute of the series!" + "# attribute of the series!\n", + "\n", + "#tips.head()\n", + "# Split code in multiple lines and put it all in paranthesis\n", + "(tips[(tips.time==\"Dinner\") &(tips.sex==\"Male\")]\n", + " [[\"tip\",\"size\"]]\n", + " .corr()\n", + ")" ] }, { @@ -864,21 +2699,149 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 111, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 111, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Group tips dataframe by size of table\n", "by_size = tips.groupby(\"size\")\n", "\n", - "by_size" + "by_size\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 98, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[(1, total_bill tip sex smoker day time size\n", + " 67 3.07 1.00 Female Yes Sat Dinner 1\n", + " 82 10.07 1.83 Female No Thur Lunch 1\n", + " 111 7.25 1.00 Female No Sat Dinner 1\n", + " 222 8.58 1.92 Male Yes Fri Lunch 1),\n", + " (2, total_bill tip sex smoker day time size\n", + " 0 16.99 1.01 Female No Sun Dinner 2\n", + " 3 23.68 3.31 foo No Sun Dinner 2\n", + " 6 8.77 2.00 Male No Sun Dinner 2\n", + " 8 15.04 1.96 Male No Sun Dinner 2\n", + " 9 14.78 3.23 foo No Sun Dinner 2\n", + " .. ... ... ... ... ... ... ...\n", + " 237 32.83 1.17 Male Yes Sat Dinner 2\n", + " 240 27.18 2.00 Female Yes Sat Dinner 2\n", + " 241 22.67 2.00 Male Yes Sat Dinner 2\n", + " 242 17.82 1.75 Male No Sat Dinner 2\n", + " 243 18.78 3.00 Female No Thur Dinner 2\n", + " \n", + " [156 rows x 7 columns]),\n", + " (3, total_bill tip sex smoker day time size\n", + " 1 10.34 1.66 Male No Sun Dinner 3\n", + " 2 21.01 3.50 Female No Sun Dinner 3\n", + " 16 10.33 1.67 Female No Sun Dinner 3\n", + " 17 16.29 3.71 foo No Sun Dinner 3\n", + " 18 16.97 3.50 Female No Sun Dinner 3\n", + " 19 20.65 3.35 foo No Sat Dinner 3\n", + " 35 24.06 3.60 foo No Sat Dinner 3\n", + " 36 16.31 2.00 Male No Sat Dinner 3\n", + " 37 16.93 3.07 Female No Sat Dinner 3\n", + " 38 18.69 2.31 foo No Sat Dinner 3\n", + " 39 31.27 5.00 foo No Sat Dinner 3\n", + " 40 16.04 2.24 foo No Sat Dinner 3\n", + " 48 28.55 2.05 foo No Sun Dinner 3\n", + " 64 17.59 2.64 foo No Sat Dinner 3\n", + " 65 20.08 3.15 foo No Sat Dinner 3\n", + " 71 17.07 3.00 Female No Sat Dinner 3\n", + " 102 44.30 2.50 Female Yes Sat Dinner 3\n", + " 112 38.07 4.00 foo No Sun Dinner 3\n", + " 114 25.71 4.00 Female No Sun Dinner 3\n", + " 129 22.82 2.18 foo No Thur Lunch 3\n", + " 146 18.64 1.36 Female No Thur Lunch 3\n", + " 152 17.26 2.74 foo No Sun Dinner 3\n", + " 162 16.21 2.00 Female No Sun Dinner 3\n", + " 165 24.52 3.48 foo No Sun Dinner 3\n", + " 170 50.81 10.00 foo Yes Sat Dinner 3\n", + " 182 45.35 3.50 foo Yes Sun Dinner 3\n", + " 186 20.90 3.50 Female Yes Sun Dinner 3\n", + " 188 18.15 3.50 Female Yes Sun Dinner 3\n", + " 189 23.10 4.00 foo Yes Sun Dinner 3\n", + " 200 18.71 4.00 foo Yes Thur Lunch 3\n", + " 205 16.47 3.23 Female Yes Thur Lunch 3\n", + " 206 26.59 3.41 foo Yes Sat Dinner 3\n", + " 210 30.06 2.00 Male Yes Sat Dinner 3\n", + " 214 28.17 6.50 Female Yes Sat Dinner 3\n", + " 223 15.98 3.00 Female No Fri Lunch 3\n", + " 231 15.69 3.00 foo Yes Sat Dinner 3\n", + " 238 35.83 4.67 Female No Sat Dinner 3\n", + " 239 29.03 5.92 foo No Sat Dinner 3),\n", + " (4, total_bill tip sex smoker day time size\n", + " 4 24.59 3.61 Female No Sun Dinner 4\n", + " 5 25.29 4.71 foo No Sun Dinner 4\n", + " 7 26.88 3.12 foo No Sun Dinner 4\n", + " 11 35.26 5.00 Female No Sun Dinner 4\n", + " 13 18.43 3.00 foo No Sun Dinner 4\n", + " 23 39.42 7.58 foo No Sat Dinner 4\n", + " 25 17.81 2.34 foo No Sat Dinner 4\n", + " 31 18.35 2.50 foo No Sat Dinner 4\n", + " 33 20.69 2.45 Female No Sat Dinner 4\n", + " 44 30.40 5.60 foo No Sun Dinner 4\n", + " 47 32.40 6.00 foo No Sun Dinner 4\n", + " 52 34.81 5.20 Female No Sun Dinner 4\n", + " 54 25.56 4.34 foo No Sun Dinner 4\n", + " 56 38.01 3.00 foo Yes Sat Dinner 4\n", + " 59 48.27 6.73 foo No Sat Dinner 4\n", + " 63 18.29 3.76 foo Yes Sat Dinner 4\n", + " 77 27.20 4.00 foo No Thur Lunch 4\n", + " 85 34.83 5.17 Female No Thur Lunch 4\n", + " 95 40.17 4.73 foo Yes Fri Dinner 4\n", + " 116 29.93 5.07 foo No Sun Dinner 4\n", + " 119 24.08 2.92 Female No Thur Lunch 4\n", + " 153 24.55 2.00 Male No Sun Dinner 4\n", + " 154 19.77 2.00 Male No Sun Dinner 4\n", + " 157 25.00 3.75 Female No Sun Dinner 4\n", + " 159 16.49 2.00 Male No Sun Dinner 4\n", + " 160 21.50 3.50 foo No Sun Dinner 4\n", + " 167 31.71 4.50 foo No Sun Dinner 4\n", + " 180 34.65 3.68 foo Yes Sun Dinner 4\n", + " 183 23.17 6.50 foo Yes Sun Dinner 4\n", + " 197 43.11 5.00 Female Yes Thur Lunch 4\n", + " 204 20.53 4.00 foo Yes Thur Lunch 4\n", + " 207 38.73 3.00 foo Yes Sat Dinner 4\n", + " 211 25.89 5.16 foo Yes Sat Dinner 4\n", + " 212 48.33 9.00 foo No Sat Dinner 4\n", + " 219 30.14 3.09 Female Yes Sat Dinner 4\n", + " 227 20.45 3.00 foo No Sat Dinner 4\n", + " 230 24.01 2.00 Male Yes Sat Dinner 4),\n", + " (5, total_bill tip sex smoker day time size\n", + " 142 41.19 5.00 foo No Thur Lunch 5\n", + " 155 29.85 5.14 Female No Sun Dinner 5\n", + " 185 20.69 5.00 foo No Sun Dinner 5\n", + " 187 30.46 2.00 Male Yes Sun Dinner 5\n", + " 216 28.15 3.00 foo Yes Sat Dinner 5),\n", + " (6, total_bill tip sex smoker day time size\n", + " 125 29.80 4.2 Female No Thur Lunch 6\n", + " 141 34.30 6.7 foo No Thur Lunch 6\n", + " 143 27.05 5.0 Female No Thur Lunch 6\n", + " 156 48.17 5.0 foo No Sun Dinner 6)]" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# If we coerce it to a list, we see something interesting: \n", "# It's basically a list of tuples! \n", @@ -890,9 +2853,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 99, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Female\n", + "total_bill 18.090455\n", + "tip 2.841023\n", + "size 2.465909\n", + "dtype: float64\n", + "Male\n", + "total_bill 14.3908\n", + "tip 1.7512\n", + "size 2.2600\n", + "dtype: float64\n", + "foo\n", + "total_bill 23.738396\n", + "tip 3.717075\n", + "size 2.801887\n", + "dtype: float64\n" + ] + } + ], "source": [ "# We can iterate through the groupby just like we would a list of tuples!\n", "\n", @@ -921,9 +2906,26 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 100, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "sex\n", + "Female 44.30\n", + "Male 32.83\n", + "foo 50.81\n", + "dtype: float64" + ] + }, + "execution_count": 100, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Get the maximum bill by gender: \n", "\n", @@ -935,14 +2937,63 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 105, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "9.0" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tips.tip.sort_values(ascending=False).iloc[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "sex\n", + "Female 43.11\n", + "Male 30.46\n", + "foo 48.33\n", + "dtype: float64" + ] + }, + "execution_count": 132, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Challenge: \n", "\n", + "\n", + "from toolz import curry\n", "# Get the second largest bill by gender!\n", - "# HINT: use sort_values and iloc!" + "# HINT: use sort_values and iloc!\n", + "\n", + "def max_nbill(df,n):\n", + " return df.total_bill.sort_values(ascending=False).iloc[n-1]\n", + "\n", + "tips.groupby(\"sex\").apply(lambda df: max_nbill(df,n=2))\n", + "\n", + "#def partial (fn,*args):\n", + "# return lambda x: fn(x, *args)\n", + "\n", + "#tips.groupby('sex').apply(partial(max_nbill,n=3))\n", + "\n", + "from functools import partial\n" ] }, { @@ -961,9 +3012,180 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 135, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
total_billtipsexsmokerdaytimesize
sex
Female44.306.5FemaleYesThurLunch6
Male32.832.0MaleYesThurLunch5
foo50.8110.0fooYesThurLunch6
\n", + "
" + ], + "text/plain": [ + " total_bill tip sex smoker day time size\n", + "sex \n", + "Female 44.30 6.5 Female Yes Thur Lunch 6\n", + "Male 32.83 2.0 Male Yes Thur Lunch 5\n", + "foo 50.81 10.0 foo Yes Thur Lunch 6" + ] + }, + "execution_count": 135, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tips.groupby(\"sex\").max()\n", + "tips.groupby(\"sex\").apply(lambda c: c.max())" + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dayFriSatSunThur
sex
Female2.7811112.8017863.3742112.575625
Male1.6250001.6815001.8300001.815000
foo3.4050003.8030773.7560983.563000
\n", + "
" + ], + "text/plain": [ + "day Fri Sat Sun Thur\n", + "sex \n", + "Female 2.781111 2.801786 3.374211 2.575625\n", + "Male 1.625000 1.681500 1.830000 1.815000\n", + "foo 3.405000 3.803077 3.756098 3.563000" + ] + }, + "execution_count": 136, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Challenge: \n", "# What is the mean tip, per day, for male vs. female?\n", @@ -972,7 +3194,7 @@ "def day_mean(df):\n", " # Hint: you will need to group by \"day\"\n", " # in this function, then get the mean tip. \n", - " pass\n", + " return df.groupby(\"day\").tip.mean()\n", "\n", "\n", "tips.groupby(\"sex\").apply(day_mean)" @@ -998,9 +3220,434 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 138, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
daysextotal_billtipsize
0FriFemale14.1455562.7811112.111111
1FriMale11.6225001.6250001.750000
2Frifoo25.3466673.4050002.333333
3SatFemale19.6803572.8017862.250000
4SatMale15.3020001.6815002.200000
5Satfoo23.6233333.8030772.871795
6SunFemale19.9321053.3742112.947368
7SunMale15.3581251.8300002.625000
8Sunfoo24.4565853.7560982.878049
9ThurFemale16.7153122.5756252.468750
10ThurMale12.1280001.8150002.000000
11Thurfoo22.0080003.5630002.650000
\n", + "
" + ], + "text/plain": [ + " day sex total_bill tip size\n", + "0 Fri Female 14.145556 2.781111 2.111111\n", + "1 Fri Male 11.622500 1.625000 1.750000\n", + "2 Fri foo 25.346667 3.405000 2.333333\n", + "3 Sat Female 19.680357 2.801786 2.250000\n", + "4 Sat Male 15.302000 1.681500 2.200000\n", + "5 Sat foo 23.623333 3.803077 2.871795\n", + "6 Sun Female 19.932105 3.374211 2.947368\n", + "7 Sun Male 15.358125 1.830000 2.625000\n", + "8 Sun foo 24.456585 3.756098 2.878049\n", + "9 Thur Female 16.715312 2.575625 2.468750\n", + "10 Thur Male 12.128000 1.815000 2.000000\n", + "11 Thur foo 22.008000 3.563000 2.650000" + ] + }, + "execution_count": 138, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tips.groupby(['day','sex']).mean().reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 137, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(('Female', 'Fri'), total_bill tip sex smoker day time size\n", + " 92 5.75 1.00 Female Yes Fri Dinner 2\n", + " 93 16.32 4.30 Female Yes Fri Dinner 2\n", + " 94 22.75 3.25 Female No Fri Dinner 2\n", + " 100 11.35 2.50 Female Yes Fri Dinner 2\n", + " 101 15.38 3.00 Female Yes Fri Dinner 2\n", + " 221 13.42 3.48 Female Yes Fri Lunch 2\n", + " 223 15.98 3.00 Female No Fri Lunch 3\n", + " 225 16.27 2.50 Female Yes Fri Lunch 2\n", + " 226 10.09 2.00 Female Yes Fri Lunch 2),\n", + " (('Female', 'Sat'), total_bill tip sex smoker day time size\n", + " 21 20.29 2.75 Female No Sat Dinner 2\n", + " 22 15.77 2.23 Female No Sat Dinner 2\n", + " 29 19.65 3.00 Female No Sat Dinner 2\n", + " 32 15.06 3.00 Female No Sat Dinner 2\n", + " 33 20.69 2.45 Female No Sat Dinner 4\n", + " 37 16.93 3.07 Female No Sat Dinner 3\n", + " 57 26.41 1.50 Female No Sat Dinner 2\n", + " 66 16.45 2.47 Female No Sat Dinner 2\n", + " 67 3.07 1.00 Female Yes Sat Dinner 1\n", + " 71 17.07 3.00 Female No Sat Dinner 3\n", + " 72 26.86 3.14 Female Yes Sat Dinner 2\n", + " 73 25.28 5.00 Female Yes Sat Dinner 2\n", + " 74 14.73 2.20 Female No Sat Dinner 2\n", + " 102 44.30 2.50 Female Yes Sat Dinner 3\n", + " 103 22.42 3.48 Female Yes Sat Dinner 2\n", + " 104 20.92 4.08 Female No Sat Dinner 2\n", + " 109 14.31 4.00 Female Yes Sat Dinner 2\n", + " 111 7.25 1.00 Female No Sat Dinner 1\n", + " 168 10.59 1.61 Female Yes Sat Dinner 2\n", + " 169 10.63 2.00 Female Yes Sat Dinner 2\n", + " 209 12.76 2.23 Female Yes Sat Dinner 2\n", + " 213 13.27 2.50 Female Yes Sat Dinner 2\n", + " 214 28.17 6.50 Female Yes Sat Dinner 3\n", + " 215 12.90 1.10 Female Yes Sat Dinner 2\n", + " 219 30.14 3.09 Female Yes Sat Dinner 4\n", + " 229 22.12 2.88 Female Yes Sat Dinner 2\n", + " 238 35.83 4.67 Female No Sat Dinner 3\n", + " 240 27.18 2.00 Female Yes Sat Dinner 2),\n", + " (('Female', 'Sun'), total_bill tip sex smoker day time size\n", + " 0 16.99 1.01 Female No Sun Dinner 2\n", + " 2 21.01 3.50 Female No Sun Dinner 3\n", + " 4 24.59 3.61 Female No Sun Dinner 4\n", + " 11 35.26 5.00 Female No Sun Dinner 4\n", + " 14 14.83 3.02 Female No Sun Dinner 2\n", + " 16 10.33 1.67 Female No Sun Dinner 3\n", + " 18 16.97 3.50 Female No Sun Dinner 3\n", + " 51 10.29 2.60 Female No Sun Dinner 2\n", + " 52 34.81 5.20 Female No Sun Dinner 4\n", + " 114 25.71 4.00 Female No Sun Dinner 3\n", + " 115 17.31 3.50 Female No Sun Dinner 2\n", + " 155 29.85 5.14 Female No Sun Dinner 5\n", + " 157 25.00 3.75 Female No Sun Dinner 4\n", + " 158 13.39 2.61 Female No Sun Dinner 2\n", + " 162 16.21 2.00 Female No Sun Dinner 3\n", + " 164 17.51 3.00 Female Yes Sun Dinner 2\n", + " 178 9.60 4.00 Female Yes Sun Dinner 2\n", + " 186 20.90 3.50 Female Yes Sun Dinner 3\n", + " 188 18.15 3.50 Female Yes Sun Dinner 3),\n", + " (('Female', 'Thur'), total_bill tip sex smoker day time size\n", + " 82 10.07 1.83 Female No Thur Lunch 1\n", + " 85 34.83 5.17 Female No Thur Lunch 4\n", + " 117 10.65 1.50 Female No Thur Lunch 2\n", + " 118 12.43 1.80 Female No Thur Lunch 2\n", + " 119 24.08 2.92 Female No Thur Lunch 4\n", + " 121 13.42 1.68 Female No Thur Lunch 2\n", + " 124 12.48 2.52 Female No Thur Lunch 2\n", + " 125 29.80 4.20 Female No Thur Lunch 6\n", + " 127 14.52 2.00 Female No Thur Lunch 2\n", + " 128 11.38 2.00 Female No Thur Lunch 2\n", + " 131 20.27 2.83 Female No Thur Lunch 2\n", + " 132 11.17 1.50 Female No Thur Lunch 2\n", + " 133 12.26 2.00 Female No Thur Lunch 2\n", + " 134 18.26 3.25 Female No Thur Lunch 2\n", + " 135 8.51 1.25 Female No Thur Lunch 2\n", + " 136 10.33 2.00 Female No Thur Lunch 2\n", + " 137 14.15 2.00 Female No Thur Lunch 2\n", + " 139 13.16 2.75 Female No Thur Lunch 2\n", + " 140 17.47 3.50 Female No Thur Lunch 2\n", + " 143 27.05 5.00 Female No Thur Lunch 6\n", + " 144 16.43 2.30 Female No Thur Lunch 2\n", + " 145 8.35 1.50 Female No Thur Lunch 2\n", + " 146 18.64 1.36 Female No Thur Lunch 3\n", + " 147 11.87 1.63 Female No Thur Lunch 2\n", + " 191 19.81 4.19 Female Yes Thur Lunch 2\n", + " 197 43.11 5.00 Female Yes Thur Lunch 4\n", + " 198 13.00 2.00 Female Yes Thur Lunch 2\n", + " 201 12.74 2.01 Female Yes Thur Lunch 2\n", + " 202 13.00 2.00 Female Yes Thur Lunch 2\n", + " 203 16.40 2.50 Female Yes Thur Lunch 2\n", + " 205 16.47 3.23 Female Yes Thur Lunch 3\n", + " 243 18.78 3.00 Female No Thur Dinner 2),\n", + " (('Male', 'Fri'), total_bill tip sex smoker day time size\n", + " 97 12.03 1.50 Male Yes Fri Dinner 2\n", + " 99 12.46 1.50 Male No Fri Dinner 2\n", + " 222 8.58 1.92 Male Yes Fri Lunch 1\n", + " 224 13.42 1.58 Male Yes Fri Lunch 2),\n", + " (('Male', 'Sat'), total_bill tip sex smoker day time size\n", + " 26 13.37 2.00 Male No Sat Dinner 2\n", + " 27 12.69 2.00 Male No Sat Dinner 2\n", + " 30 9.55 1.45 Male No Sat Dinner 2\n", + " 36 16.31 2.00 Male No Sat Dinner 3\n", + " 58 11.24 1.76 Male Yes Sat Dinner 2\n", + " 61 13.81 2.00 Male Yes Sat Dinner 2\n", + " 62 11.02 1.98 Male Yes Sat Dinner 2\n", + " 70 12.02 1.97 Male No Sat Dinner 2\n", + " 75 10.51 1.25 Male No Sat Dinner 2\n", + " 105 15.36 1.64 Male Yes Sat Dinner 2\n", + " 210 30.06 2.00 Male Yes Sat Dinner 3\n", + " 217 11.59 1.50 Male Yes Sat Dinner 2\n", + " 218 7.74 1.44 Male Yes Sat Dinner 2\n", + " 230 24.01 2.00 Male Yes Sat Dinner 4\n", + " 233 10.77 1.47 Male No Sat Dinner 2\n", + " 235 10.07 1.25 Male No Sat Dinner 2\n", + " 236 12.60 1.00 Male Yes Sat Dinner 2\n", + " 237 32.83 1.17 Male Yes Sat Dinner 2\n", + " 241 22.67 2.00 Male Yes Sat Dinner 2\n", + " 242 17.82 1.75 Male No Sat Dinner 2),\n", + " (('Male', 'Sun'), total_bill tip sex smoker day time size\n", + " 1 10.34 1.66 Male No Sun Dinner 3\n", + " 6 8.77 2.00 Male No Sun Dinner 2\n", + " 8 15.04 1.96 Male No Sun Dinner 2\n", + " 10 10.27 1.71 Male No Sun Dinner 2\n", + " 12 15.42 1.57 Male No Sun Dinner 2\n", + " 43 9.68 1.32 Male No Sun Dinner 2\n", + " 53 9.94 1.56 Male No Sun Dinner 2\n", + " 151 13.13 2.00 Male No Sun Dinner 2\n", + " 153 24.55 2.00 Male No Sun Dinner 4\n", + " 154 19.77 2.00 Male No Sun Dinner 4\n", + " 159 16.49 2.00 Male No Sun Dinner 4\n", + " 163 13.81 2.00 Male No Sun Dinner 2\n", + " 176 17.89 2.00 Male Yes Sun Dinner 2\n", + " 177 14.48 2.00 Male Yes Sun Dinner 2\n", + " 187 30.46 2.00 Male Yes Sun Dinner 5\n", + " 190 15.69 1.50 Male Yes Sun Dinner 2),\n", + " (('Male', 'Thur'), total_bill tip sex smoker day time size\n", + " 86 13.03 2.00 Male No Thur Lunch 2\n", + " 123 15.95 2.00 Male No Thur Lunch 2\n", + " 126 8.52 1.48 Male No Thur Lunch 2\n", + " 130 19.08 1.50 Male No Thur Lunch 2\n", + " 138 16.00 2.00 Male Yes Thur Lunch 2\n", + " 148 9.78 1.73 Male No Thur Lunch 2\n", + " 149 7.51 2.00 Male No Thur Lunch 2\n", + " 195 7.56 1.44 Male No Thur Lunch 2\n", + " 196 10.34 2.00 Male Yes Thur Lunch 2\n", + " 199 13.51 2.00 Male Yes Thur Lunch 2),\n", + " (('foo', 'Fri'), total_bill tip sex smoker day time size\n", + " 90 28.97 3.00 foo Yes Fri Dinner 2\n", + " 91 22.49 3.50 foo No Fri Dinner 2\n", + " 95 40.17 4.73 foo Yes Fri Dinner 4\n", + " 96 27.28 4.00 foo Yes Fri Dinner 2\n", + " 98 21.01 3.00 foo Yes Fri Dinner 2\n", + " 220 12.16 2.20 foo Yes Fri Lunch 2),\n", + " (('foo', 'Sat'), total_bill tip sex smoker day time size\n", + " 19 20.65 3.35 foo No Sat Dinner 3\n", + " 20 17.92 4.08 foo No Sat Dinner 2\n", + " 23 39.42 7.58 foo No Sat Dinner 4\n", + " 24 19.82 3.18 foo No Sat Dinner 2\n", + " 25 17.81 2.34 foo No Sat Dinner 4\n", + " 28 21.70 4.30 foo No Sat Dinner 2\n", + " 31 18.35 2.50 foo No Sat Dinner 4\n", + " 34 17.78 3.27 foo No Sat Dinner 2\n", + " 35 24.06 3.60 foo No Sat Dinner 3\n", + " 38 18.69 2.31 foo No Sat Dinner 3\n", + " 39 31.27 5.00 foo No Sat Dinner 3\n", + " 40 16.04 2.24 foo No Sat Dinner 3\n", + " 56 38.01 3.00 foo Yes Sat Dinner 4\n", + " 59 48.27 6.73 foo No Sat Dinner 4\n", + " 60 20.29 3.21 foo Yes Sat Dinner 2\n", + " 63 18.29 3.76 foo Yes Sat Dinner 4\n", + " 64 17.59 2.64 foo No Sat Dinner 3\n", + " 65 20.08 3.15 foo No Sat Dinner 3\n", + " 68 20.23 2.01 foo No Sat Dinner 2\n", + " 69 15.01 2.09 foo Yes Sat Dinner 2\n", + " 76 17.92 3.08 foo Yes Sat Dinner 2\n", + " 106 20.49 4.06 foo Yes Sat Dinner 2\n", + " 107 25.21 4.29 foo Yes Sat Dinner 2\n", + " 108 18.24 3.76 foo No Sat Dinner 2\n", + " 110 14.00 3.00 foo No Sat Dinner 2\n", + " 170 50.81 10.00 foo Yes Sat Dinner 3\n", + " 171 15.81 3.16 foo Yes Sat Dinner 2\n", + " 206 26.59 3.41 foo Yes Sat Dinner 3\n", + " 207 38.73 3.00 foo Yes Sat Dinner 4\n", + " 208 24.27 2.03 foo Yes Sat Dinner 2\n", + " 211 25.89 5.16 foo Yes Sat Dinner 4\n", + " 212 48.33 9.00 foo No Sat Dinner 4\n", + " 216 28.15 3.00 foo Yes Sat Dinner 5\n", + " 227 20.45 3.00 foo No Sat Dinner 4\n", + " 228 13.28 2.72 foo No Sat Dinner 2\n", + " 231 15.69 3.00 foo Yes Sat Dinner 3\n", + " 232 11.61 3.39 foo No Sat Dinner 2\n", + " 234 15.53 3.00 foo Yes Sat Dinner 2\n", + " 239 29.03 5.92 foo No Sat Dinner 3),\n", + " (('foo', 'Sun'), total_bill tip sex smoker day time size\n", + " 3 23.68 3.31 foo No Sun Dinner 2\n", + " 5 25.29 4.71 foo No Sun Dinner 4\n", + " 7 26.88 3.12 foo No Sun Dinner 4\n", + " 9 14.78 3.23 foo No Sun Dinner 2\n", + " 13 18.43 3.00 foo No Sun Dinner 4\n", + " 15 21.58 3.92 foo No Sun Dinner 2\n", + " 17 16.29 3.71 foo No Sun Dinner 3\n", + " 41 17.46 2.54 foo No Sun Dinner 2\n", + " 42 13.94 3.06 foo No Sun Dinner 2\n", + " 44 30.40 5.60 foo No Sun Dinner 4\n", + " 45 18.29 3.00 foo No Sun Dinner 2\n", + " 46 22.23 5.00 foo No Sun Dinner 2\n", + " 47 32.40 6.00 foo No Sun Dinner 4\n", + " 48 28.55 2.05 foo No Sun Dinner 3\n", + " 49 18.04 3.00 foo No Sun Dinner 2\n", + " 50 12.54 2.50 foo No Sun Dinner 2\n", + " 54 25.56 4.34 foo No Sun Dinner 4\n", + " 55 19.49 3.51 foo No Sun Dinner 2\n", + " 112 38.07 4.00 foo No Sun Dinner 3\n", + " 113 23.95 2.55 foo No Sun Dinner 2\n", + " 116 29.93 5.07 foo No Sun Dinner 4\n", + " 150 14.07 2.50 foo No Sun Dinner 2\n", + " 152 17.26 2.74 foo No Sun Dinner 3\n", + " 156 48.17 5.00 foo No Sun Dinner 6\n", + " 160 21.50 3.50 foo No Sun Dinner 4\n", + " 161 12.66 2.50 foo No Sun Dinner 2\n", + " 165 24.52 3.48 foo No Sun Dinner 3\n", + " 166 20.76 2.24 foo No Sun Dinner 2\n", + " 167 31.71 4.50 foo No Sun Dinner 4\n", + " 172 7.25 5.15 foo Yes Sun Dinner 2\n", + " 173 31.85 3.18 foo Yes Sun Dinner 2\n", + " 174 16.82 4.00 foo Yes Sun Dinner 2\n", + " 175 32.90 3.11 foo Yes Sun Dinner 2\n", + " 179 34.63 3.55 foo Yes Sun Dinner 2\n", + " 180 34.65 3.68 foo Yes Sun Dinner 4\n", + " 181 23.33 5.65 foo Yes Sun Dinner 2\n", + " 182 45.35 3.50 foo Yes Sun Dinner 3\n", + " 183 23.17 6.50 foo Yes Sun Dinner 4\n", + " 184 40.55 3.00 foo Yes Sun Dinner 2\n", + " 185 20.69 5.00 foo No Sun Dinner 5\n", + " 189 23.10 4.00 foo Yes Sun Dinner 3),\n", + " (('foo', 'Thur'), total_bill tip sex smoker day time size\n", + " 77 27.20 4.00 foo No Thur Lunch 4\n", + " 78 22.76 3.00 foo No Thur Lunch 2\n", + " 79 17.29 2.71 foo No Thur Lunch 2\n", + " 80 19.44 3.00 foo Yes Thur Lunch 2\n", + " 81 16.66 3.40 foo No Thur Lunch 2\n", + " 83 32.68 5.00 foo Yes Thur Lunch 2\n", + " 84 15.98 2.03 foo No Thur Lunch 2\n", + " 87 18.28 4.00 foo No Thur Lunch 2\n", + " 88 24.71 5.85 foo No Thur Lunch 2\n", + " 89 21.16 3.00 foo No Thur Lunch 2\n", + " 120 11.69 2.31 foo No Thur Lunch 2\n", + " 122 14.26 2.50 foo No Thur Lunch 2\n", + " 129 22.82 2.18 foo No Thur Lunch 3\n", + " 141 34.30 6.70 foo No Thur Lunch 6\n", + " 142 41.19 5.00 foo No Thur Lunch 5\n", + " 192 28.44 2.56 foo Yes Thur Lunch 2\n", + " 193 15.48 2.02 foo Yes Thur Lunch 2\n", + " 194 16.58 4.00 foo Yes Thur Lunch 2\n", + " 200 18.71 4.00 foo Yes Thur Lunch 3\n", + " 204 20.53 4.00 foo Yes Thur Lunch 4)]" + ] + }, + "execution_count": 137, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Take a look at the structure of the multiple groupby!\n", "\n", @@ -1073,6 +3720,212 @@ "\n" ] }, + { + "cell_type": "code", + "execution_count": 149, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "A 1.0\n", + "B 4.0\n", + "C NaN\n", + "Name: 0, dtype: float64\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:3: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n", + "of pandas will change to not sort by default.\n", + "\n", + "To accept the future behavior, pass 'sort=False'.\n", + "\n", + "To retain the current behavior and silence the warning, pass 'sort=True'.\n", + "\n", + " This is separate from the ipykernel package so we can avoid doing imports until\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AB
014
125
236
\n", + "
" + ], + "text/plain": [ + " A B\n", + "0 1 4\n", + "1 2 5\n", + "2 3 6" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AC
047
\n", + "
" + ], + "text/plain": [ + " A C\n", + "0 4 7" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABC
014.0NaN
125.0NaN
236.0NaN
34NaN7.0
\n", + "
" + ], + "text/plain": [ + " A B C\n", + "0 1 4.0 NaN\n", + "1 2 5.0 NaN\n", + "2 3 6.0 NaN\n", + "3 4 NaN 7.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df1 = pd.DataFrame({\"A\": pd.Series([1,2,3]), \"B\": pd.Series([4,5,6])})\n", + "df2 = pd.DataFrame({\"A\": pd.Series([4]), \"C\": pd.Series([7])})\n", + "df = pd.concat([df1,df2]).reset_index(drop=True)\n", + "df = pd.concat([df1,df2],ignore_index=True,axis=0,sort=False)\n", + "\n", + "\n", + "print(df.loc[0])\n", + "\n", + "display(df1,df2,df)" + ] + }, { "cell_type": "markdown", "metadata": { @@ -1108,6 +3961,30 @@ "and what will happen if \"how\" changes to each of the other options?" ] }, + { + "cell_type": "code", + "execution_count": 148, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " A B C\n", + "0 1 4.0 NaN\n", + "1 2 5.0 NaN\n", + "2 3 6.0 NaN\n", + "3 4 NaN 7.0\n" + ] + } + ], + "source": [ + "df1 = pd.DataFrame({\"A\": pd.Series([1,2,3]), \"B\": pd.Series([4,5,6])})\n", + "df2 = pd.DataFrame({\"A\": pd.Series([4]), \"C\": pd.Series([7])})\n", + "df = pd.merge(df1,df2,on = \"A\", how = \"outer\")\n", + "print(df)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1133,9 +4010,70 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 200, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
screennameid_strtexthashtags
0nandanrao928374987Woah, pandas is so much fun #worldrocked #jawd...[worldrocked, jawdrop, ml]
1om98214039I eat linear models for breakfast #datascience...[datascience, ml, crossfit]
\n", + "
" + ], + "text/plain": [ + " screenname id_str text \\\n", + "0 nandanrao 928374987 Woah, pandas is so much fun #worldrocked #jawd... \n", + "1 om 98214039 I eat linear models for breakfast #datascience... \n", + "\n", + " hashtags \n", + "0 [worldrocked, jawdrop, ml] \n", + "1 [datascience, ml, crossfit] " + ] + }, + "execution_count": 200, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "raw_tweets = [{ \"screenname\": \"nandanrao\",\n", " \"id_str\": \"928374987\",\n", @@ -1165,9 +4103,63 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 201, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
screennameid_strtext
0nandanrao928374987Woah, pandas is so much fun #worldrocked #jawd...
1om98214039I eat linear models for breakfast #datascience...
\n", + "
" + ], + "text/plain": [ + " screenname id_str text\n", + "0 nandanrao 928374987 Woah, pandas is so much fun #worldrocked #jawd...\n", + "1 om 98214039 I eat linear models for breakfast #datascience..." + ] + }, + "execution_count": 201, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "tweets = pd.DataFrame(raw_tweets, columns = [\"screenname\", \"id_str\", \"text\"])\n", "tweets" @@ -1175,10 +4167,87 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 202, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
id_strhashtag
0928374987worldrocked
1928374987jawdrop
2928374987ml
398214039datascience
498214039ml
598214039crossfit
\n", + "
" + ], + "text/plain": [ + " id_str hashtag\n", + "0 928374987 worldrocked\n", + "1 928374987 jawdrop\n", + "2 928374987 ml\n", + "3 98214039 datascience\n", + "4 98214039 ml\n", + "5 98214039 crossfit" + ] + }, + "execution_count": 202, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "# REALLY COOL IDEA!!\n", + "\n", "tags_and_ids = [(t['id_str'], tag) \n", " for t in raw_tweets \n", " for tag in t['hashtags']]\n", @@ -1190,15 +4259,139 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 203, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
screennameid_strtexthashtag
0nandanrao928374987Woah, pandas is so much fun #worldrocked #jawd...worldrocked
1nandanrao928374987Woah, pandas is so much fun #worldrocked #jawd...jawdrop
2nandanrao928374987Woah, pandas is so much fun #worldrocked #jawd...ml
3om98214039I eat linear models for breakfast #datascience...datascience
4om98214039I eat linear models for breakfast #datascience...ml
5om98214039I eat linear models for breakfast #datascience...crossfit
\n", + "
" + ], + "text/plain": [ + " screenname id_str text \\\n", + "0 nandanrao 928374987 Woah, pandas is so much fun #worldrocked #jawd... \n", + "1 nandanrao 928374987 Woah, pandas is so much fun #worldrocked #jawd... \n", + "2 nandanrao 928374987 Woah, pandas is so much fun #worldrocked #jawd... \n", + "3 om 98214039 I eat linear models for breakfast #datascience... \n", + "4 om 98214039 I eat linear models for breakfast #datascience... \n", + "5 om 98214039 I eat linear models for breakfast #datascience... \n", + "\n", + " hashtag \n", + "0 worldrocked \n", + "1 jawdrop \n", + "2 ml \n", + "3 datascience \n", + "4 ml \n", + "5 crossfit " + ] + }, + "execution_count": 203, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df = tweets.merge(hashtags, how='left')\n", "\n", "df" ] }, + { + "cell_type": "code", + "execution_count": 205, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ml 2\n", + "datascience 1\n", + "crossfit 1\n", + "jawdrop 1\n", + "worldrocked 1\n", + "Name: hashtag, dtype: int64" + ] + }, + "execution_count": 205, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.hashtag.value_counts()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1218,6 +4411,958 @@ "\n", "*Needless to say that eyeballing is OK for making sure your code makes sense, but will not result in full credits for the project. We want a fully automated code. To carry out the project successfully you need to use most the attributes and methods described earlier. The last one is a little tricky*" ] + }, + { + "cell_type": "code", + "execution_count": 156, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ProductPrice
0tomato2.1
1potato3.4
2apple1.2
3orange4.3
4banana5.2
\n", + "
" + ], + "text/plain": [ + " Product Price\n", + "0 tomato 2.1\n", + "1 potato 3.4\n", + "2 apple 1.2\n", + "3 orange 4.3\n", + "4 banana 5.2" + ] + }, + "execution_count": 156, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prices=pd.read_csv(\"supermarket_prices.csv\")\n", + "prices.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 157, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BuyerProductQuantity
0Jacksonapple4
1Jacksonapple9
2Johnorange9
3Johnpotato10
4Tomtomato4
\n", + "
" + ], + "text/plain": [ + " Buyer Product Quantity\n", + "0 Jackson apple 4\n", + "1 Jackson apple 9\n", + "2 John orange 9\n", + "3 John potato 10\n", + "4 Tom tomato 4" + ] + }, + "execution_count": 157, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transactions=pd.read_csv(\"supermarket_transactions.csv\")\n", + "transactions.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 158, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BuyerProductQuantityPrice
0Jacksonapple41.2
1Jacksonapple91.2
2Johnorange94.3
3Johnpotato103.4
4Tomtomato42.1
\n", + "
" + ], + "text/plain": [ + " Buyer Product Quantity Price\n", + "0 Jackson apple 4 1.2\n", + "1 Jackson apple 9 1.2\n", + "2 John orange 9 4.3\n", + "3 John potato 10 3.4\n", + "4 Tom tomato 4 2.1" + ] + }, + "execution_count": 158, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df=transactions.join(prices.set_index('Product'), on='Product')\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 162, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Buyer\n", + "Emma 81\n", + "Jackson 70\n", + "John 122\n", + "Liam 81\n", + "Lucas 62\n", + "Sandra 78\n", + "Sophia 61\n", + "Tom 49\n", + "Name: Quantity, dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "\n", + "# How many items\n", + "display(df.groupby(\"Buyer\").Quantity.sum())\n", + "#display(df.columns)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 163, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Buyer Product\n", + "Emma apple 25\n", + " banana 26\n", + " potato 14\n", + " tomato 16\n", + "Jackson apple 18\n", + " orange 28\n", + " potato 8\n", + " tomato 16\n", + "John apple 7\n", + " banana 28\n", + " orange 46\n", + " potato 18\n", + " tomato 23\n", + "Liam apple 21\n", + " banana 16\n", + " orange 16\n", + " potato 21\n", + " tomato 7\n", + "Lucas apple 14\n", + " banana 3\n", + " orange 17\n", + " potato 9\n", + " tomato 19\n", + "Sandra banana 2\n", + " orange 37\n", + " potato 38\n", + " tomato 1\n", + "Sophia apple 14\n", + " banana 13\n", + " orange 7\n", + " potato 14\n", + " tomato 13\n", + "Tom apple 18\n", + " banana 6\n", + " potato 16\n", + " tomato 9\n", + "Name: Quantity, dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# How many items\n", + "display(df.groupby([\"Buyer\",\"Product\"]).Quantity.sum())\n", + "#display(df.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 164, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"bill\"]=df[\"Quantity\"]*df[\"Price\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 165, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Buyer', 'Product', 'Quantity', 'Price', 'bill'], dtype='object')" + ] + }, + "execution_count": 165, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 166, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BuyerProductQuantityPricebill
0Jacksonapple41.24.8
1Jacksonapple91.210.8
2Johnorange94.338.7
3Johnpotato103.434.0
4Tomtomato42.18.4
\n", + "
" + ], + "text/plain": [ + " Buyer Product Quantity Price bill\n", + "0 Jackson apple 4 1.2 4.8\n", + "1 Jackson apple 9 1.2 10.8\n", + "2 John orange 9 4.3 38.7\n", + "3 John potato 10 3.4 34.0\n", + "4 Tom tomato 4 2.1 8.4" + ] + }, + "execution_count": 166, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 181, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Buyer\n", + "Emma 246.4\n", + "Jackson 202.8\n", + "John 461.3\n", + "Liam 263.3\n", + "Lucas 176.0\n", + "Sandra 300.8\n", + "Sophia 189.4\n", + "Tom 126.1\n", + "Name: bill, dtype: float64" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "total_spent=df.groupby([\"Buyer\"]).bill.sum()\n", + "display(total_spent)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 185, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Buyer\n", + "Emma 135.2\n", + "John 145.6\n", + "Liam 83.2\n", + "Lucas 15.6\n", + "Sandra 10.4\n", + "Sophia 67.6\n", + "Tom 31.2\n", + "Name: bill, dtype: float64" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "banana_spent=df[df.Product==\"banana\"].groupby([\"Buyer\"]).bill.sum()\n", + "display(banana_spent)" + ] + }, + { + "cell_type": "code", + "execution_count": 188, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['bill'], dtype='object')" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Index(['Emma', 'Jackson', 'John', 'Liam', 'Lucas', 'Sandra', 'Sophia', 'Tom'], dtype='object', name='Buyer')" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Index(['Emma', 'John', 'Liam', 'Lucas', 'Sandra', 'Sophia', 'Tom'], dtype='object', name='Buyer')" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
bill_xbill_y
Buyer
Emma246.4135.2
Jackson202.8NaN
John461.3145.6
Liam263.383.2
Lucas176.015.6
Sandra300.810.4
Sophia189.467.6
Tom126.131.2
\n", + "
" + ], + "text/plain": [ + " bill_x bill_y\n", + "Buyer \n", + "Emma 246.4 135.2\n", + "Jackson 202.8 NaN\n", + "John 461.3 145.6\n", + "Liam 263.3 83.2\n", + "Lucas 176.0 15.6\n", + "Sandra 300.8 10.4\n", + "Sophia 189.4 67.6\n", + "Tom 126.1 31.2" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "total_spent=pd.DataFrame(total_spent)\n", + "banana_spent=pd.DataFrame(banana_spent)\n", + "\n", + "display(total_spent.columns)\n", + "display(total_spent.index)\n", + "display(banana_spent.index)\n", + "\n", + "\n", + "\n", + "spent_table=total_spent.merge(banana_spent,how='outer',left_index=True, right_index=True)\n", + "display(spent_table)" + ] + }, + { + "cell_type": "code", + "execution_count": 190, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Total_spentBanana_spent
Buyer
Emma246.4135.2
Jackson202.8NaN
John461.3145.6
Liam263.383.2
Lucas176.015.6
Sandra300.810.4
Sophia189.467.6
Tom126.131.2
\n", + "
" + ], + "text/plain": [ + " Total_spent Banana_spent\n", + "Buyer \n", + "Emma 246.4 135.2\n", + "Jackson 202.8 NaN\n", + "John 461.3 145.6\n", + "Liam 263.3 83.2\n", + "Lucas 176.0 15.6\n", + "Sandra 300.8 10.4\n", + "Sophia 189.4 67.6\n", + "Tom 126.1 31.2" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "spent_table.columns=[\"Total_spent\",\"Banana_spent\"]\n", + "\n", + "display(spent_table)" + ] + }, + { + "cell_type": "code", + "execution_count": 191, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Total_spentBanana_spentBanana_share
Buyer
Emma246.4135.20.548701
Jackson202.8NaNNaN
John461.3145.60.315630
Liam263.383.20.315989
Lucas176.015.60.088636
Sandra300.810.40.034574
Sophia189.467.60.356917
Tom126.131.20.247423
\n", + "
" + ], + "text/plain": [ + " Total_spent Banana_spent Banana_share\n", + "Buyer \n", + "Emma 246.4 135.2 0.548701\n", + "Jackson 202.8 NaN NaN\n", + "John 461.3 145.6 0.315630\n", + "Liam 263.3 83.2 0.315989\n", + "Lucas 176.0 15.6 0.088636\n", + "Sandra 300.8 10.4 0.034574\n", + "Sophia 189.4 67.6 0.356917\n", + "Tom 126.1 31.2 0.247423" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "spent_table[\"Banana_share\"]=spent_table[\"Banana_spent\"]/spent_table[\"Total_spent\"]\n", + "\n", + "display(spent_table)" + ] + }, + { + "cell_type": "code", + "execution_count": 193, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Buyer\n", + "Liam 0.315989\n", + "Sophia 0.356917\n", + "Emma 0.548701\n", + "Name: Banana_share, dtype: float64" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "banana_kings=spent_table.Banana_share.sort_values().iloc[-4:-1]\n", + "display(banana_kings)" + ] + }, + { + "cell_type": "code", + "execution_count": 199, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Buyer Product\n", + "Emma apple 30.0\n", + " banana 135.2\n", + " potato 47.6\n", + " tomato 33.6\n", + "Liam apple 25.2\n", + " banana 83.2\n", + " orange 68.8\n", + " potato 71.4\n", + " tomato 14.7\n", + "Sophia apple 16.8\n", + " banana 67.6\n", + " orange 30.1\n", + " potato 47.6\n", + " tomato 27.3\n", + "Name: bill, dtype: float64" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#display(df[df[\"Buyer\"].isin(banana_kings.index)])\n", + "\n", + "\n", + "display(df[df[\"Buyer\"].isin(banana_kings.index)].groupby([\"Buyer\",\"Product\"]).bill.sum())\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -1237,7 +5382,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.3" + "version": "3.7.3" } }, "nbformat": 4,