In [272]:
import timeit
import os

import numpy as np
import pandas as pd
import datetime as dt

from sqlalchemy import create_engine

# 20. Series - Part One

In [2]:
myindex = ['USA', 'Canada', 'Mexico']

In [3]:
mydata = [1776, 1867, 1821]

In [4]:
myser = pd.Series(data=mydata, index=myindex)

In [5]:
myser

USA       1776
Canada    1867
Mexico    1821
dtype: int64

In [6]:
myser.iloc[0]

1776

In [7]:
myser.loc['USA']

1776

In [8]:
ages = {'Sam':5, 'Frank':10, 'Spike':7}

In [9]:
pd.Series(ages)

Sam       5
Frank    10
Spike     7
dtype: int64

# Coding Exercise 4: Check-in: Labeled Index in Pandas Series

In [10]:
# TASK: Use pandas to grab the expenses paid by Bob.
# MAKE SURE TO READ THE FULL INSTRUCTIONS ABOVE CAREFULLY, AS THE EVALUATION SCRIPT IS VERY STRICT.
#  Link to Solution: https://gist.github.com/Pierian-Data/3d7f7cb3528f015d9584d04a7168b97f
expenses = pd.Series({'Andrew':200,'Bob':150,'Claire':450})

bob_expense = expenses.loc['Bob']
bob_expense

150

# 21. Series - Part Two

In [11]:
q1 = {'Japan': 80, 'China': 450, 'India': 200, 'USA': 250}
q2 = {'Brazil': 100, 'China': 500, 'India': 210, 'USA': 260}

In [12]:
sales_q1 = pd.Series(q1)

In [13]:
sales_q2 = pd.Series(q2)

In [14]:
sales_q1

Japan     80
China    450
India    200
USA      250
dtype: int64

In [15]:
sales_q2

Brazil    100
China     500
India     210
USA       260
dtype: int64

In [16]:
sales_q1['Japan']
sales_q1.iloc[0]

80

In [17]:
sales_q1.keys()

Index(['Japan', 'China', 'India', 'USA'], dtype='object')

In [18]:
sales_q1.values

array([ 80, 450, 200, 250], dtype=int64)

In [19]:
# [1, 2] * 2
np.array([1, 2]) * 2

array([2, 4])

In [20]:
# sales_q1 * 2
sales_q1 / 100

Japan    0.8
China    4.5
India    2.0
USA      2.5
dtype: float64

In [21]:
sales_q1 + sales_q2 # NaN is added for the missing values in the series

Brazil      NaN
China     950.0
India     410.0
Japan       NaN
USA       510.0
dtype: float64

In [22]:
first_half = sales_q1.add(sales_q2, fill_value=0) # fill_value=0 is used to fill the missing values with 0

In [23]:
sales_q1.dtypes

dtype('int64')

In [24]:
first_half.dtypes

dtype('float64')

# 22. DataFrames - Part One - Creating a DataFrame

In [25]:
np.random.seed(101) # to get the same random numbers
mydata = np.random.randint(0, 101, (4, 3))
mydata

array([[95, 11, 81],
       [70, 63, 87],
       [75,  9, 77],
       [40,  4, 63]])

In [26]:
myindex = ['CA', 'NY', 'AZ', 'TX']

In [27]:
mycolumns = ['Jan', 'Feb', 'Mar']

In [28]:
df = pd.DataFrame(data=mydata, index=myindex, columns=mycolumns)

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, CA to TX
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Jan     4 non-null      int32
 1   Feb     4 non-null      int32
 2   Mar     4 non-null      int32
dtypes: int32(3)
memory usage: 80.0+ bytes


In [30]:
df = pd.read_csv(r'data_frame\tips.csv')
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.50,Male,No,Sun,Dinner,3,7.00,Travis Walters,6011812112971322,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251
...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,9.68,Michael Avila,5296068606052842,Sat2657
240,27.18,2.00,Female,Yes,Sat,Dinner,2,13.59,Monica Sanders,3506806155565404,Sat1766
241,22.67,2.00,Male,Yes,Sat,Dinner,2,11.34,Keith Wong,6011891618747196,Sat3880
242,17.82,1.75,Male,No,Sat,Dinner,2,8.91,Dennis Dixon,4375220550950,Sat17


# 23. DataFrames - Part Two - Basic Properties

In [31]:
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size',
       'price_per_person', 'Payer Name', 'CC Number', 'Payment ID'],
      dtype='object')

In [32]:
df.index

RangeIndex(start=0, stop=244, step=1)

In [33]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251


In [34]:
df.tail()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
239,29.03,5.92,Male,No,Sat,Dinner,3,9.68,Michael Avila,5296068606052842,Sat2657
240,27.18,2.0,Female,Yes,Sat,Dinner,2,13.59,Monica Sanders,3506806155565404,Sat1766
241,22.67,2.0,Male,Yes,Sat,Dinner,2,11.34,Keith Wong,6011891618747196,Sat3880
242,17.82,1.75,Male,No,Sat,Dinner,2,8.91,Dennis Dixon,4375220550950,Sat17
243,18.78,3.0,Female,No,Thur,Dinner,2,9.39,Michelle Hardin,3511451626698139,Thur672


In [35]:
df.describe().transpose() # transpose() is used to make the output more readable by switching the rows and columns

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
total_bill,244.0,19.78594,8.902412,3.07,13.3475,17.795,24.1275,50.81
tip,244.0,2.998279,1.383638,1.0,2.0,2.9,3.5625,10.0
size,244.0,2.569672,0.9510998,1.0,2.0,2.0,3.0,6.0
price_per_person,244.0,7.888197,2.914234,2.88,5.8,7.255,9.39,20.27
CC Number,244.0,2563496000000000.0,2369340000000000.0,60406790000.0,30407310000000.0,3525318000000000.0,4553675000000000.0,6596454000000000.0


# 24. DataFrames - Part Three - Working with Columns

# Columns

In [36]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251


In [37]:
df['total_bill']

0      16.99
1      10.34
2      21.01
3      23.68
4      24.59
       ...  
239    29.03
240    27.18
241    22.67
242    17.82
243    18.78
Name: total_bill, Length: 244, dtype: float64

In [38]:
type(df['total_bill'])

pandas.core.series.Series

In [39]:
mycols = ['total_bill', 'tip']
df[mycols]

Unnamed: 0,total_bill,tip
0,16.99,1.01
1,10.34,1.66
2,21.01,3.50
3,23.68,3.31
4,24.59,3.61
...,...,...
239,29.03,5.92
240,27.18,2.00
241,22.67,2.00
242,17.82,1.75


In [40]:
df[['total_bill', 'tip']]

Unnamed: 0,total_bill,tip
0,16.99,1.01
1,10.34,1.66
2,21.01,3.50
3,23.68,3.31
4,24.59,3.61
...,...,...
239,29.03,5.92
240,27.18,2.00
241,22.67,2.00
242,17.82,1.75


In [41]:
# df_percentage = df['tip'] / df['total_bill']  * 100
df_percentage = df.apply(lambda row: (row['tip'] / row['total_bill']) * 100, axis=1)
df['tip_percentage'] = df_percentage

In [42]:
df['price_per_person'] = df['size'] / df['total_bill']
# If overate the column name, name will be changed

In [43]:
df['price_per_person'] = df['size'] / df['total_bill']
# If overate the column name, name will be changed

In [44]:
np.round(df['price_per_person'], 2)

0      0.12
1      0.29
2      0.14
3      0.08
4      0.16
       ... 
239    0.10
240    0.07
241    0.09
242    0.11
243    0.11
Name: price_per_person, Length: 244, dtype: float64

In [45]:
df['price_per_person'] = np.round(df['price_per_person'], 2)

In [46]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,tip_percentage
0,16.99,1.01,Female,No,Sun,Dinner,2,0.12,Christy Cunningham,3560325168603410,Sun2959,5.944673
1,10.34,1.66,Male,No,Sun,Dinner,3,0.29,Douglas Tucker,4478071379779230,Sun4608,16.054159
2,21.01,3.50,Male,No,Sun,Dinner,3,0.14,Travis Walters,6011812112971322,Sun4458,16.658734
3,23.68,3.31,Male,No,Sun,Dinner,2,0.08,Nathaniel Harris,4676137647685994,Sun5260,13.978041
4,24.59,3.61,Female,No,Sun,Dinner,4,0.16,Tonya Carter,4832732618637221,Sun2251,14.680765
...,...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,0.10,Michael Avila,5296068606052842,Sat2657,20.392697
240,27.18,2.00,Female,Yes,Sat,Dinner,2,0.07,Monica Sanders,3506806155565404,Sat1766,7.358352
241,22.67,2.00,Male,Yes,Sat,Dinner,2,0.09,Keith Wong,6011891618747196,Sat3880,8.822232
242,17.82,1.75,Male,No,Sat,Dinner,2,0.11,Dennis Dixon,4375220550950,Sat17,9.820426


In [47]:
df.drop('tip_percentage', axis=1, inplace=True)
df = df.drop('tip_percentage', axis=1) # This is the same as the above line recommended

KeyError: "['tip_percentage'] not found in axis"

In [None]:
df.shape # axis = 1 is for columns and axis = 0 is for rows because the shape is (rows, columns)

# 25. DataFrames - Part Four - Working with Rows

In [None]:
df = df.set_index('Payment ID')

In [None]:
df.reset_index()

In [None]:
df.iloc[0]

In [None]:
df.loc['Sun2959']

In [None]:
df.iloc[1:4]

In [48]:
df.loc[['Sun2959', 'Sun4608']]

KeyError: "None of [Index(['Sun2959', 'Sun4608'], dtype='object')] are in the [index]"

In [49]:
df = df.drop('Sun2959', axis=0)

KeyError: "['Sun2959'] not found in axis"

In [50]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,0.12,Christy Cunningham,3560325168603410,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,0.29,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.5,Male,No,Sun,Dinner,3,0.14,Travis Walters,6011812112971322,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,0.08,Nathaniel Harris,4676137647685994,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,0.16,Tonya Carter,4832732618637221,Sun2251


In [51]:
df = df.iloc[1:]

In [52]:
df # select lows is more efficient than drop

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
1,10.34,1.66,Male,No,Sun,Dinner,3,0.29,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.50,Male,No,Sun,Dinner,3,0.14,Travis Walters,6011812112971322,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,0.08,Nathaniel Harris,4676137647685994,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,0.16,Tonya Carter,4832732618637221,Sun2251
5,25.29,4.71,Male,No,Sun,Dinner,4,0.16,Erik Smith,213140353657882,Sun9679
...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,0.10,Michael Avila,5296068606052842,Sat2657
240,27.18,2.00,Female,Yes,Sat,Dinner,2,0.07,Monica Sanders,3506806155565404,Sat1766
241,22.67,2.00,Male,Yes,Sat,Dinner,2,0.09,Keith Wong,6011891618747196,Sat3880
242,17.82,1.75,Male,No,Sat,Dinner,2,0.11,Dennis Dixon,4375220550950,Sat17


In [53]:
one_row = df.iloc[0]

In [54]:
one_row

total_bill                     10.34
tip                             1.66
sex                             Male
smoker                            No
day                              Sun
time                          Dinner
size                               3
price_per_person                0.29
Payer Name            Douglas Tucker
CC Number           4478071379779230
Payment ID                   Sun4608
Name: 1, dtype: object

In [55]:
# df = df.append(one_row)

# 26. Pandas - Conditional Filtering

In [56]:
df[df['total_bill'] > 40]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
59,48.27,6.73,Male,No,Sat,Dinner,4,0.08,Brian Ortiz,6596453823950595,Sat8139
95,40.17,4.73,Male,Yes,Fri,Dinner,4,0.1,Aaron Bentley,180026611638690,Fri9628
102,44.3,2.5,Female,Yes,Sat,Dinner,3,0.07,Heather Cohen,379771118886604,Sat6240
142,41.19,5.0,Male,No,Thur,Lunch,5,0.12,Eric Andrews,4356531761046453,Thur3621
156,48.17,5.0,Male,No,Sun,Dinner,6,0.12,Ryan Gonzales,3523151482063321,Sun7518
170,50.81,10.0,Male,Yes,Sat,Dinner,3,0.06,Gregory Clark,5473850968388236,Sat1954
182,45.35,3.5,Male,Yes,Sun,Dinner,3,0.07,Jose Parsons,4112207559459910,Sun2337
184,40.55,3.0,Male,Yes,Sun,Dinner,2,0.05,Stephen Cox,3547798222044029,Sun5140
197,43.11,5.0,Female,Yes,Thur,Lunch,4,0.09,Brooke Soto,5544902205760175,Thur9313
212,48.33,9.0,Male,No,Sat,Dinner,4,0.08,Alex Williamson,676218815212,Sat4590


In [57]:
df[df['sex'] == 'Male']

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
1,10.34,1.66,Male,No,Sun,Dinner,3,0.29,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.50,Male,No,Sun,Dinner,3,0.14,Travis Walters,6011812112971322,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,0.08,Nathaniel Harris,4676137647685994,Sun5260
5,25.29,4.71,Male,No,Sun,Dinner,4,0.16,Erik Smith,213140353657882,Sun9679
6,8.77,2.00,Male,No,Sun,Dinner,2,0.23,Kristopher Johnson,2223727524230344,Sun5985
...,...,...,...,...,...,...,...,...,...,...,...
236,12.60,1.00,Male,Yes,Sat,Dinner,2,0.16,Matthew Myers,3543676378973965,Sat5032
237,32.83,1.17,Male,Yes,Sat,Dinner,2,0.06,Thomas Brown,4284722681265508,Sat2929
239,29.03,5.92,Male,No,Sat,Dinner,3,0.10,Michael Avila,5296068606052842,Sat2657
241,22.67,2.00,Male,Yes,Sat,Dinner,2,0.09,Keith Wong,6011891618747196,Sat3880


In [58]:
df[df['size'] > 3]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
4,24.59,3.61,Female,No,Sun,Dinner,4,0.16,Tonya Carter,4832732618637221,Sun2251
5,25.29,4.71,Male,No,Sun,Dinner,4,0.16,Erik Smith,213140353657882,Sun9679
7,26.88,3.12,Male,No,Sun,Dinner,4,0.15,Robert Buck,3514785077705092,Sun8157
11,35.26,5.0,Female,No,Sun,Dinner,4,0.11,Diane Macias,4577817359320969,Sun6686
13,18.43,3.0,Male,No,Sun,Dinner,4,0.22,Joshua Jones,6011163105616890,Sun2971
23,39.42,7.58,Male,No,Sat,Dinner,4,0.1,Lance Peterson,3542584061609808,Sat239
25,17.81,2.34,Male,No,Sat,Dinner,4,0.22,Robert Perkins,30502930499388,Sat907
31,18.35,2.5,Male,No,Sat,Dinner,4,0.22,Danny Santiago,630415546013,Sat4947
33,20.69,2.45,Female,No,Sat,Dinner,4,0.19,Amber Francis,377742985258914,Sat6649
44,30.4,5.6,Male,No,Sun,Dinner,4,0.13,Todd Cooper,503846761263,Sun2274


In [59]:
df[(df['total_bill'] > 30) & (df['sex'] == 'Male')]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
23,39.42,7.58,Male,No,Sat,Dinner,4,0.1,Lance Peterson,3542584061609808,Sat239
39,31.27,5.0,Male,No,Sat,Dinner,3,0.1,Mr. Brandon Berry,6011525851069856,Sat6373
44,30.4,5.6,Male,No,Sun,Dinner,4,0.13,Todd Cooper,503846761263,Sun2274
47,32.4,6.0,Male,No,Sun,Dinner,4,0.12,James Barnes,3552002592874186,Sun9677
56,38.01,3.0,Male,Yes,Sat,Dinner,4,0.11,James Christensen DDS,349793629453226,Sat8903
59,48.27,6.73,Male,No,Sat,Dinner,4,0.08,Brian Ortiz,6596453823950595,Sat8139
83,32.68,5.0,Male,Yes,Thur,Lunch,2,0.06,Daniel Murphy,5356177501009133,Thur8801
95,40.17,4.73,Male,Yes,Fri,Dinner,4,0.1,Aaron Bentley,180026611638690,Fri9628
112,38.07,4.0,Male,No,Sun,Dinner,3,0.08,Jeff Lopez,3572865915176463,Sun591
141,34.3,6.7,Male,No,Thur,Lunch,6,0.17,Steven Carlson,3526515703718508,Thur1025


In [60]:
df[(df['day'] == 'Sun') | (df['day'] == 'Sat') | (df['day'] == 'Fri')]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
1,10.34,1.66,Male,No,Sun,Dinner,3,0.29,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.50,Male,No,Sun,Dinner,3,0.14,Travis Walters,6011812112971322,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,0.08,Nathaniel Harris,4676137647685994,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,0.16,Tonya Carter,4832732618637221,Sun2251
5,25.29,4.71,Male,No,Sun,Dinner,4,0.16,Erik Smith,213140353657882,Sun9679
...,...,...,...,...,...,...,...,...,...,...,...
238,35.83,4.67,Female,No,Sat,Dinner,3,0.08,Kimberly Crane,676184013727,Sat9777
239,29.03,5.92,Male,No,Sat,Dinner,3,0.10,Michael Avila,5296068606052842,Sat2657
240,27.18,2.00,Female,Yes,Sat,Dinner,2,0.07,Monica Sanders,3506806155565404,Sat1766
241,22.67,2.00,Male,Yes,Sat,Dinner,2,0.09,Keith Wong,6011891618747196,Sat3880


In [61]:
options = ['Sat', 'Sun', 'Fri']
df[df['day'].isin(options)]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
1,10.34,1.66,Male,No,Sun,Dinner,3,0.29,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.50,Male,No,Sun,Dinner,3,0.14,Travis Walters,6011812112971322,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,0.08,Nathaniel Harris,4676137647685994,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,0.16,Tonya Carter,4832732618637221,Sun2251
5,25.29,4.71,Male,No,Sun,Dinner,4,0.16,Erik Smith,213140353657882,Sun9679
...,...,...,...,...,...,...,...,...,...,...,...
238,35.83,4.67,Female,No,Sat,Dinner,3,0.08,Kimberly Crane,676184013727,Sat9777
239,29.03,5.92,Male,No,Sat,Dinner,3,0.10,Michael Avila,5296068606052842,Sat2657
240,27.18,2.00,Female,Yes,Sat,Dinner,2,0.07,Monica Sanders,3506806155565404,Sat1766
241,22.67,2.00,Male,Yes,Sat,Dinner,2,0.09,Keith Wong,6011891618747196,Sat3880


# 27. Pandas - Useful Methods - Apply on Single Column

In [62]:
def last_four(number):
  return int(str(number)[-4:])

df['last_four']= df['CC Number'].apply(last_four)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['last_four']= df['CC Number'].apply(last_four)


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,last_four
1,10.34,1.66,Male,No,Sun,Dinner,3,0.29,Douglas Tucker,4478071379779230,Sun4608,9230
2,21.01,3.50,Male,No,Sun,Dinner,3,0.14,Travis Walters,6011812112971322,Sun4458,1322
3,23.68,3.31,Male,No,Sun,Dinner,2,0.08,Nathaniel Harris,4676137647685994,Sun5260,5994
4,24.59,3.61,Female,No,Sun,Dinner,4,0.16,Tonya Carter,4832732618637221,Sun2251,7221
5,25.29,4.71,Male,No,Sun,Dinner,4,0.16,Erik Smith,213140353657882,Sun9679,7882
...,...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,0.10,Michael Avila,5296068606052842,Sat2657,2842
240,27.18,2.00,Female,Yes,Sat,Dinner,2,0.07,Monica Sanders,3506806155565404,Sat1766,5404
241,22.67,2.00,Male,Yes,Sat,Dinner,2,0.09,Keith Wong,6011891618747196,Sat3880,7196
242,17.82,1.75,Male,No,Sat,Dinner,2,0.11,Dennis Dixon,4375220550950,Sat17,950


In [63]:
df['total_bill']

1      10.34
2      21.01
3      23.68
4      24.59
5      25.29
       ...  
239    29.03
240    27.18
241    22.67
242    17.82
243    18.78
Name: total_bill, Length: 243, dtype: float64

In [64]:
def yelp(price):
  if price < 10: return '$'
  elif 10 <= price <= 30: return '$$'
  else: return '$$$'

In [65]:
df['yelp'] = df['total_bill'].apply(yelp)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['yelp'] = df['total_bill'].apply(yelp)


In [66]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,last_four,yelp
1,10.34,1.66,Male,No,Sun,Dinner,3,0.29,Douglas Tucker,4478071379779230,Sun4608,9230,$$
2,21.01,3.50,Male,No,Sun,Dinner,3,0.14,Travis Walters,6011812112971322,Sun4458,1322,$$
3,23.68,3.31,Male,No,Sun,Dinner,2,0.08,Nathaniel Harris,4676137647685994,Sun5260,5994,$$
4,24.59,3.61,Female,No,Sun,Dinner,4,0.16,Tonya Carter,4832732618637221,Sun2251,7221,$$
5,25.29,4.71,Male,No,Sun,Dinner,4,0.16,Erik Smith,213140353657882,Sun9679,7882,$$
...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,0.10,Michael Avila,5296068606052842,Sat2657,2842,$$
240,27.18,2.00,Female,Yes,Sat,Dinner,2,0.07,Monica Sanders,3506806155565404,Sat1766,5404,$$
241,22.67,2.00,Male,Yes,Sat,Dinner,2,0.09,Keith Wong,6011891618747196,Sat3880,7196,$$
242,17.82,1.75,Male,No,Sat,Dinner,2,0.11,Dennis Dixon,4375220550950,Sat17,950,$$


# 28. Pandas - Useful Methods - Apply on Multiple Columns

In [67]:
def simple(number):
  return number *2

In [68]:
# lambda number: number * 2

In [69]:
simple(2)

4

In [70]:
df['total_bill'].apply(simple)

1      20.68
2      42.02
3      47.36
4      49.18
5      50.58
       ...  
239    58.06
240    54.36
241    45.34
242    35.64
243    37.56
Name: total_bill, Length: 243, dtype: float64

In [71]:
df['total_bill'].apply(lambda number: number * 2)

1      20.68
2      42.02
3      47.36
4      49.18
5      50.58
       ...  
239    58.06
240    54.36
241    45.34
242    35.64
243    37.56
Name: total_bill, Length: 243, dtype: float64

In [72]:
def quality(total_bill, tip):
  if tip / total_bill > 0.25: return 'Generous'
  else: return 'Others'

In [73]:
quality(16.99, 1.01)

'Others'

In [74]:
df['quality'] = df[['total_bill', 'tip']].apply(lambda df: quality(df['total_bill'], df['tip']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['quality'] = df[['total_bill', 'tip']].apply(lambda df: quality(df['total_bill'], df['tip']), axis=1)


In [75]:
df['quality'] = np.vectorize(quality)(df['total_bill'], df['tip'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['quality'] = np.vectorize(quality)(df['total_bill'], df['tip'])


In [76]:
setup = '''
import numpy as np
import pandas as pd
df = pd.read_csv(r'data_frame\\tips.csv')
def quality(total_bill,tip):
    if tip/total_bill  > 0.25:
        return "Generous"
    else:
        return "Other"
'''

# code snippet whose execution time is to be measured 
stmt_one = ''' 
df['Tip Quality'] = df[['total_bill','tip']].apply(lambda df: quality(df['total_bill'],df['tip']),axis=1)
'''

stmt_two = '''
df['Tip Quality'] = np.vectorize(quality)(df['total_bill'], df['tip'])
'''
  

In [77]:
timeit.timeit(setup = setup,
              stmt = stmt_one,
              number = 1000) 

1.2527011999991373

In [78]:
timeit.timeit(setup = setup,
              stmt = stmt_two,
              number = 1000) 

0.18446989999938523

# 29. Pandas - Useful Methods - Statistical Information and Sorting

In [79]:
df = pd.read_csv(r'data_frame\tips.csv')

In [80]:
df.describe()

Unnamed: 0,total_bill,tip,size,price_per_person,CC Number
count,244.0,244.0,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672,7.888197,2563496000000000.0
std,8.902412,1.383638,0.9511,2.914234,2369340000000000.0
min,3.07,1.0,1.0,2.88,60406790000.0
25%,13.3475,2.0,2.0,5.8,30407310000000.0
50%,17.795,2.9,2.0,7.255,3525318000000000.0
75%,24.1275,3.5625,3.0,9.39,4553675000000000.0
max,50.81,10.0,6.0,20.27,6596454000000000.0


In [81]:
df.sort_values(by=['tip', 'size'], ascending=False)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
170,50.81,10.00,Male,Yes,Sat,Dinner,3,16.94,Gregory Clark,5473850968388236,Sat1954
212,48.33,9.00,Male,No,Sat,Dinner,4,12.08,Alex Williamson,676218815212,Sat4590
23,39.42,7.58,Male,No,Sat,Dinner,4,9.86,Lance Peterson,3542584061609808,Sat239
59,48.27,6.73,Male,No,Sat,Dinner,4,12.07,Brian Ortiz,6596453823950595,Sat8139
141,34.30,6.70,Male,No,Thur,Lunch,6,5.72,Steven Carlson,3526515703718508,Thur1025
...,...,...,...,...,...,...,...,...,...,...,...
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
92,5.75,1.00,Female,Yes,Fri,Dinner,2,2.88,Leah Ramirez,3508911676966392,Fri3780
236,12.60,1.00,Male,Yes,Sat,Dinner,2,6.30,Matthew Myers,3543676378973965,Sat5032
67,3.07,1.00,Female,Yes,Sat,Dinner,1,3.07,Tiffany Brock,4359488526995267,Sat3455


In [82]:
df['total_bill'].max()

50.81

In [83]:
df['total_bill'].idxmax()

170

In [84]:
df.loc[170]

total_bill                     50.81
tip                             10.0
sex                             Male
smoker                           Yes
day                              Sat
time                          Dinner
size                               3
price_per_person               16.94
Payer Name             Gregory Clark
CC Number           5473850968388236
Payment ID                   Sat1954
Name: 170, dtype: object

In [85]:
df['total_bill'].min()

3.07

In [86]:
df['total_bill'].idxmin()

67

In [87]:
df.loc[67]

total_bill                      3.07
tip                              1.0
sex                           Female
smoker                           Yes
day                              Sat
time                          Dinner
size                               1
price_per_person                3.07
Payer Name             Tiffany Brock
CC Number           4359488526995267
Payment ID                   Sat3455
Name: 67, dtype: object

In [88]:
numeric_df = df.select_dtypes(include=[np.number])
correlation_matrix = numeric_df.corr()
correlation_matrix

Unnamed: 0,total_bill,tip,size,price_per_person,CC Number
total_bill,1.0,0.675734,0.598315,0.647554,0.104576
tip,0.675734,1.0,0.489299,0.347405,0.110857
size,0.598315,0.489299,1.0,-0.175359,-0.030239
price_per_person,0.647554,0.347405,-0.175359,1.0,0.13524
CC Number,0.104576,0.110857,-0.030239,0.13524,1.0


In [89]:
df.select_dtypes(include=[np.number]).corr()

Unnamed: 0,total_bill,tip,size,price_per_person,CC Number
total_bill,1.0,0.675734,0.598315,0.647554,0.104576
tip,0.675734,1.0,0.489299,0.347405,0.110857
size,0.598315,0.489299,1.0,-0.175359,-0.030239
price_per_person,0.647554,0.347405,-0.175359,1.0,0.13524
CC Number,0.104576,0.110857,-0.030239,0.13524,1.0


In [90]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251


In [91]:
df['sex'].value_counts()

sex
Male      157
Female     87
Name: count, dtype: int64

In [92]:
df['day'].unique()

array(['Sun', 'Sat', 'Thur', 'Fri'], dtype=object)

In [93]:
df['day'].nunique()

4

In [94]:
df['day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [95]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251


In [96]:
df['sex'].replace(['Female', 'Male'], ['F', 'M'])

0      F
1      M
2      M
3      M
4      F
      ..
239    M
240    F
241    M
242    M
243    F
Name: sex, Length: 244, dtype: object

In [97]:
mymap = {'Female': 'F', 'Male': 'M'}

In [98]:
df['sex'].map(mymap)

0      F
1      M
2      M
3      M
4      F
      ..
239    M
240    F
241    M
242    M
243    F
Name: sex, Length: 244, dtype: object

In [99]:
df[df.duplicated()]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID


In [100]:
simple_df = pd.DataFrame([1, 2, 2], ['a', 'b', 'c'])

In [101]:
simple_df.duplicated()

a    False
b    False
c     True
dtype: bool

In [102]:
simple_df.drop_duplicates()

Unnamed: 0,0
a,1
b,2


In [103]:
df['total_bill'].between(10, 20, inclusive='both')

0       True
1       True
2      False
3      False
4      False
       ...  
239    False
240    False
241    False
242     True
243     True
Name: total_bill, Length: 244, dtype: bool

In [104]:
df['total_bill'].between(10, 20, inclusive='neither')

0       True
1       True
2      False
3      False
4      False
       ...  
239    False
240    False
241    False
242     True
243     True
Name: total_bill, Length: 244, dtype: bool

In [105]:
df['total_bill'].between(10, 20, inclusive='left')

0       True
1       True
2      False
3      False
4      False
       ...  
239    False
240    False
241    False
242     True
243     True
Name: total_bill, Length: 244, dtype: bool

In [106]:
df['total_bill'].between(10, 20, inclusive='right')

0       True
1       True
2      False
3      False
4      False
       ...  
239    False
240    False
241    False
242     True
243     True
Name: total_bill, Length: 244, dtype: bool

In [107]:
df.nlargest(10, 'total_bill')

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
170,50.81,10.0,Male,Yes,Sat,Dinner,3,16.94,Gregory Clark,5473850968388236,Sat1954
212,48.33,9.0,Male,No,Sat,Dinner,4,12.08,Alex Williamson,676218815212,Sat4590
59,48.27,6.73,Male,No,Sat,Dinner,4,12.07,Brian Ortiz,6596453823950595,Sat8139
156,48.17,5.0,Male,No,Sun,Dinner,6,8.03,Ryan Gonzales,3523151482063321,Sun7518
182,45.35,3.5,Male,Yes,Sun,Dinner,3,15.12,Jose Parsons,4112207559459910,Sun2337
102,44.3,2.5,Female,Yes,Sat,Dinner,3,14.77,Heather Cohen,379771118886604,Sat6240
197,43.11,5.0,Female,Yes,Thur,Lunch,4,10.78,Brooke Soto,5544902205760175,Thur9313
142,41.19,5.0,Male,No,Thur,Lunch,5,8.24,Eric Andrews,4356531761046453,Thur3621
184,40.55,3.0,Male,Yes,Sun,Dinner,2,20.27,Stephen Cox,3547798222044029,Sun5140
95,40.17,4.73,Male,Yes,Fri,Dinner,4,10.04,Aaron Bentley,180026611638690,Fri9628


In [108]:
df.nsmallest(10, 'total_bill')

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
67,3.07,1.0,Female,Yes,Sat,Dinner,1,3.07,Tiffany Brock,4359488526995267,Sat3455
92,5.75,1.0,Female,Yes,Fri,Dinner,2,2.88,Leah Ramirez,3508911676966392,Fri3780
111,7.25,1.0,Female,No,Sat,Dinner,1,7.25,Terri Jones,3559221007826887,Sat4801
172,7.25,5.15,Male,Yes,Sun,Dinner,2,3.62,Larry White,30432617123103,Sun9209
149,7.51,2.0,Male,No,Thur,Lunch,2,3.76,Daniel Robbins,4823139288341889,Thur6321
195,7.56,1.44,Male,No,Thur,Lunch,2,3.78,Michael White,4865390263095532,Thur697
218,7.74,1.44,Male,Yes,Sat,Dinner,2,3.87,Nicholas Archer,340517153733524,Sat4772
145,8.35,1.5,Female,No,Thur,Lunch,2,4.18,Amy Young,4285454264477,Thur9331
135,8.51,1.25,Female,No,Thur,Lunch,2,4.26,Rebecca Harris,4320272020376174,Thur6600
126,8.52,1.48,Male,No,Thur,Lunch,2,4.26,Mario Bradshaw,4524404353861811,Thur6719


In [109]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        244 non-null    float64
 1   tip               244 non-null    float64
 2   sex               244 non-null    object 
 3   smoker            244 non-null    object 
 4   day               244 non-null    object 
 5   time              244 non-null    object 
 6   size              244 non-null    int64  
 7   price_per_person  244 non-null    float64
 8   Payer Name        244 non-null    object 
 9   CC Number         244 non-null    int64  
 10  Payment ID        244 non-null    object 
dtypes: float64(3), int64(2), object(6)
memory usage: 88.3 KB


In [110]:
df.sample(n=5) # random sample of 5 rows

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
169,10.63,2.0,Female,Yes,Sat,Dinner,2,5.32,Amy Hill,3536332481454019,Sat1788
161,12.66,2.5,Male,No,Sun,Dinner,2,6.33,Brandon Oconnor,4406882156920533,Sun5879
69,15.01,2.09,Male,Yes,Sat,Dinner,2,7.5,Adam Hall,4700924377057571,Sat855
222,8.58,1.92,Male,Yes,Fri,Lunch,1,8.58,Jason Lawrence,3505302934650403,Fri6624
93,16.32,4.3,Female,Yes,Fri,Dinner,2,8.16,Natalie Nguyen,5181236182893396,Fri6963


In [111]:
df.sample(frac=0.1) # random sample of 10% of the rows

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
32,15.06,3.0,Female,No,Sat,Dinner,2,7.53,Amanda Wilson,213186304291560,Sat1327
215,12.9,1.1,Female,Yes,Sat,Dinner,2,6.45,Jessica Owen,4726904879471,Sat6983
213,13.27,2.5,Female,Yes,Sat,Dinner,2,6.64,Robin Andersen,580140531089,Sat1374
106,20.49,4.06,Male,Yes,Sat,Dinner,2,10.24,Karl Mcdaniel,180024452771522,Sat7865
143,27.05,5.0,Female,No,Thur,Lunch,6,4.51,Regina Jones,4311048695487,Thur6179
98,21.01,3.0,Male,Yes,Fri,Dinner,2,10.5,Michael Li,4831801127457917,Fri144
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251
34,17.78,3.27,Male,No,Sat,Dinner,2,8.89,Jacob Castillo,3551492000704805,Sat8124
18,16.97,3.5,Female,No,Sun,Dinner,3,5.66,Laura Martinez,30422275171379,Sun2789
138,16.0,2.0,Male,Yes,Thur,Lunch,2,8.0,Jason Burgess,3561461821942363,Thur2710


# 30. Missing Data - Overview

# 31. Missing Data - Pandas Operations

In [112]:
np.nan

nan

In [113]:
pd.NA

<NA>

In [114]:
pd.NaT

NaT

In [115]:
np.nan == np.nan # False because NaN is not equal to NaN

False

In [116]:
np.nan is np.nan # True because NaN is NaN

True

In [117]:
myvar = np.nan

In [118]:
df = pd.read_csv(r'data_frame\movie_scores.csv')

In [119]:
df

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,,,,
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [120]:
df.isnull()

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,False,False,False,False,False,False
1,True,True,True,True,True,True
2,False,False,False,False,True,True
3,False,False,False,False,False,False
4,False,False,False,False,False,False


In [121]:
df.isnull().sum()

first_name          1
last_name           1
age                 1
sex                 1
pre_movie_score     2
post_movie_score    2
dtype: int64

In [122]:
df['pre_movie_score'].notnull()

0     True
1    False
2    False
3     True
4     True
Name: pre_movie_score, dtype: bool

In [123]:
df[df['pre_movie_score'].isnull()]

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
1,,,,,,
2,Hugh,Jackman,51.0,m,,


In [124]:
# KEEP THE DATA 
# DROP THE DATA
# FILL THE DATA


In [125]:
help(df.dropna)

Help on method dropna in module pandas.core.frame:

dropna(*, axis: 'Axis' = 0, how: 'AnyAll | lib.NoDefault' = <no_default>, thresh: 'int | lib.NoDefault' = <no_default>, subset: 'IndexLabel | None' = None, inplace: 'bool' = False, ignore_index: 'bool' = False) -> 'DataFrame | None' method of pandas.core.frame.DataFrame instance
    Remove missing values.

    See the :ref:`User Guide <missing_data>` for more on which values are
    considered missing, and how to work with missing data.

    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Determine if rows or columns which contain missing values are
        removed.

        * 0, or 'index' : Drop rows which contain missing values.
        * 1, or 'columns' : Drop columns which contain missing value.

        Only a single axis is allowed.

    how : {'any', 'all'}, default 'any'
        Determine if row or column is removed from DataFrame, when we have
        at least one NA or all NA.

       

In [126]:
df.dropna()

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [127]:
df.dropna(thresh=4)

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [128]:
df.dropna(axis=1) # drop columns with missing values

0
1
2
3
4


In [129]:
df.dropna(axis=1, thresh=4)

Unnamed: 0,first_name,last_name,age,sex
0,Tom,Hanks,63.0,m
1,,,,
2,Hugh,Jackman,51.0,m
3,Oprah,Winfrey,66.0,f
4,Emma,Stone,31.0,f


In [130]:
df.dropna(subset=['pre_movie_score'])

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [131]:
df.fillna('NEW VALUE!')

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,NEW VALUE!,NEW VALUE!,NEW VALUE!,NEW VALUE!,NEW VALUE!,NEW VALUE!
2,Hugh,Jackman,51.0,m,NEW VALUE!,NEW VALUE!
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [132]:
df['pre_movie_score'] = df['pre_movie_score'].fillna(df['pre_movie_score'].mean())

In [133]:
df['pre_movie_score'].fillna(df['pre_movie_score'].mean())

0    8.0
1    7.0
2    7.0
3    6.0
4    7.0
Name: pre_movie_score, dtype: float64

In [134]:
# df.fillna(df.mean())

In [135]:
airline_tix = {'first':100, 'business':np.nan, 'economy-plus':50}

In [136]:
ser = pd.Series(airline_tix)

In [137]:
ser.fillna(ser.mean())

first           100.0
business         75.0
economy-plus     50.0
dtype: float64

In [138]:
ser.interpolate()

first           100.0
business         75.0
economy-plus     50.0
dtype: float64

# 32. GroupBy Operations - Part One

In [139]:
df = pd.read_csv(r'data_frame\mpg.csv')

In [140]:
df['model_year'].unique()

array([70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82], dtype=int64)

In [141]:
df.groupby('model_year')['mpg'].mean()

model_year
70    17.689655
71    21.250000
72    18.714286
73    17.100000
74    22.703704
75    20.266667
76    21.573529
77    23.375000
78    24.061111
79    25.093103
80    33.696552
81    30.334483
82    31.709677
Name: mpg, dtype: float64

In [142]:
numeric_cols = df.select_dtypes(include=[np.number]).columns

df.groupby('model_year')[numeric_cols].mean()

Unnamed: 0_level_0,mpg,cylinders,displacement,weight,acceleration,model_year,origin
model_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
70,17.689655,6.758621,281.413793,3372.793103,12.948276,70.0,1.310345
71,21.25,5.571429,209.75,2995.428571,15.142857,71.0,1.428571
72,18.714286,5.821429,218.375,3237.714286,15.125,72.0,1.535714
73,17.1,6.375,256.875,3419.025,14.3125,73.0,1.375
74,22.703704,5.259259,171.740741,2877.925926,16.203704,74.0,1.666667
75,20.266667,5.6,205.533333,3176.8,16.05,75.0,1.466667
76,21.573529,5.647059,197.794118,3078.735294,15.941176,76.0,1.470588
77,23.375,5.464286,191.392857,2997.357143,15.435714,77.0,1.571429
78,24.061111,5.361111,177.805556,2861.805556,15.805556,78.0,1.611111
79,25.093103,5.827586,206.689655,3055.344828,15.813793,79.0,1.275862


In [143]:
to_numeric = pd.to_numeric(df)

TypeError: arg must be a list, tuple, 1-d array, or Series

In [144]:
df.groupby('model_year')[numeric_cols].describe().transpose()

Unnamed: 0,model_year,70,71,72,73,74,75,76,77,78,79,80,81,82
mpg,count,29.0,28.0,28.0,40.0,27.0,30.0,34.0,28.0,36.0,29.0,29.0,29.0,31.0
mpg,mean,17.689655,21.25,18.714286,17.1,22.703704,20.266667,21.573529,23.375,24.061111,25.093103,33.696552,30.334483,31.709677
mpg,std,5.339231,6.591942,5.435529,4.700245,6.42001,4.940566,5.889297,6.675862,6.898044,6.794217,7.037983,5.591465,5.392548
mpg,min,9.0,12.0,11.0,11.0,13.0,13.0,13.0,15.0,16.2,15.5,19.1,17.6,22.0
mpg,25%,14.0,15.5,13.75,13.0,16.0,16.0,16.75,17.375,19.35,19.2,29.8,26.6,27.0
mpg,50%,16.0,19.0,18.5,16.0,24.0,19.5,21.0,21.75,20.7,23.9,32.7,31.6,32.0
mpg,75%,22.0,27.0,23.0,20.0,27.0,23.0,26.375,30.0,28.0,31.8,38.1,34.4,36.0
mpg,max,27.0,35.0,28.0,29.0,32.0,33.0,33.0,36.0,43.1,37.3,46.6,39.1,44.0
cylinders,count,29.0,28.0,28.0,40.0,27.0,30.0,34.0,28.0,36.0,29.0,29.0,29.0,31.0
cylinders,mean,6.758621,5.571429,5.821429,6.375,5.259259,5.6,5.647059,5.464286,5.361111,5.827586,4.137931,4.62069,4.193548


In [145]:
df.groupby('model_year')['mpg'].mean()

model_year
70    17.689655
71    21.250000
72    18.714286
73    17.100000
74    22.703704
75    20.266667
76    21.573529
77    23.375000
78    24.061111
79    25.093103
80    33.696552
81    30.334483
82    31.709677
Name: mpg, dtype: float64

In [146]:
df.groupby(['cylinders', 'model_year'])['mpg'].mean()

cylinders  model_year
3          72            19.000000
           73            18.000000
           77            21.500000
           80            23.700000
4          70            25.285714
           71            27.461538
           72            23.428571
           73            22.727273
           74            27.800000
           75            25.250000
           76            26.766667
           77            29.107143
           78            29.576471
           79            31.525000
           80            34.612000
           81            32.814286
           82            32.071429
5          78            20.300000
           79            25.400000
           80            36.400000
6          70            20.500000
           71            18.000000
           73            19.000000
           74            17.857143
           75            17.583333
           76            20.000000
           77            19.500000
           78            19.06666

In [147]:
df.groupby('model_year').describe()

Unnamed: 0_level_0,mpg,mpg,mpg,mpg,mpg,mpg,mpg,mpg,cylinders,cylinders,...,acceleration,acceleration,origin,origin,origin,origin,origin,origin,origin,origin
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
model_year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
70,29.0,17.689655,5.339231,9.0,14.0,16.0,22.0,27.0,29.0,6.758621,...,15.0,20.5,29.0,1.310345,0.603765,1.0,1.0,1.0,1.0,3.0
71,28.0,21.25,6.591942,12.0,15.5,19.0,27.0,35.0,28.0,5.571429,...,16.125,20.5,28.0,1.428571,0.741798,1.0,1.0,1.0,2.0,3.0
72,28.0,18.714286,5.435529,11.0,13.75,18.5,23.0,28.0,28.0,5.821429,...,16.625,23.5,28.0,1.535714,0.792658,1.0,1.0,1.0,2.0,3.0
73,40.0,17.1,4.700245,11.0,13.0,16.0,20.0,29.0,40.0,6.375,...,16.0,21.0,40.0,1.375,0.667467,1.0,1.0,1.0,2.0,3.0
74,27.0,22.703704,6.42001,13.0,16.0,24.0,27.0,32.0,27.0,5.259259,...,17.0,21.0,27.0,1.666667,0.83205,1.0,1.0,1.0,2.0,3.0
75,30.0,20.266667,4.940566,13.0,16.0,19.5,23.0,33.0,30.0,5.6,...,17.375,21.0,30.0,1.466667,0.730297,1.0,1.0,1.0,2.0,3.0
76,34.0,21.573529,5.889297,13.0,16.75,21.0,26.375,33.0,34.0,5.647059,...,17.55,22.2,34.0,1.470588,0.706476,1.0,1.0,1.0,2.0,3.0
77,28.0,23.375,6.675862,15.0,17.375,21.75,30.0,36.0,28.0,5.464286,...,16.925,19.0,28.0,1.571429,0.835711,1.0,1.0,1.0,2.0,3.0
78,36.0,24.061111,6.898044,16.2,19.35,20.7,28.0,43.1,36.0,5.361111,...,16.825,21.5,36.0,1.611111,0.837608,1.0,1.0,1.0,2.0,3.0
79,29.0,25.093103,6.794217,15.5,19.2,23.9,31.8,37.3,29.0,5.827586,...,17.3,24.8,29.0,1.275862,0.5914,1.0,1.0,1.0,1.0,3.0


In [148]:
numeric_columns = df.select_dtypes(include=[np.number]).columns

year_cyl = df.groupby(['model_year', 'cylinders'])[numeric_columns].mean()

In [149]:
year_cyl.index.levels

FrozenList([[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82], [3, 4, 5, 6, 8]])

In [150]:
year_cyl.loc[70]

Unnamed: 0_level_0,mpg,cylinders,displacement,weight,acceleration,model_year,origin
cylinders,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
4,25.285714,4.0,107.0,2292.571429,16.0,70.0,2.285714
6,20.5,6.0,199.0,2710.5,15.5,70.0,1.0
8,14.111111,8.0,367.555556,3940.055556,11.194444,70.0,1.0


In [151]:
year_cyl.loc[(70, 4)]

mpg               25.285714
cylinders          4.000000
displacement     107.000000
weight          2292.571429
acceleration      16.000000
model_year        70.000000
origin             2.285714
Name: (70, 4), dtype: float64

# 33. GroupBy Operations - Part Two - MultiIndex

In [152]:
year_cyl.xs(key=70, level='model_year')

Unnamed: 0_level_0,mpg,cylinders,displacement,weight,acceleration,model_year,origin
cylinders,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
4,25.285714,4.0,107.0,2292.571429,16.0,70.0,2.285714
6,20.5,6.0,199.0,2710.5,15.5,70.0,1.0
8,14.111111,8.0,367.555556,3940.055556,11.194444,70.0,1.0


In [153]:
year_cyl.loc[[70, 80]]

Unnamed: 0_level_0,Unnamed: 1_level_0,mpg,cylinders,displacement,weight,acceleration,model_year,origin
model_year,cylinders,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
70,4,25.285714,4.0,107.0,2292.571429,16.0,70.0,2.285714
70,6,20.5,6.0,199.0,2710.5,15.5,70.0,1.0
70,8,14.111111,8.0,367.555556,3940.055556,11.194444,70.0,1.0
80,3,23.7,3.0,70.0,2420.0,12.5,80.0,3.0
80,4,34.612,4.0,111.0,2360.08,17.144,80.0,2.2
80,5,36.4,5.0,121.0,2950.0,19.9,80.0,2.0
80,6,25.9,6.0,196.5,3145.5,15.05,80.0,2.0


In [154]:
year_cyl.xs(key=4, level='cylinders')

Unnamed: 0_level_0,mpg,cylinders,displacement,weight,acceleration,model_year,origin
model_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
70,25.285714,4.0,107.0,2292.571429,16.0,70.0,2.285714
71,27.461538,4.0,101.846154,2056.384615,16.961538,71.0,1.923077
72,23.428571,4.0,111.535714,2382.642857,17.214286,72.0,1.928571
73,22.727273,4.0,109.272727,2338.090909,17.136364,73.0,2.0
74,27.8,4.0,96.533333,2151.466667,16.4,74.0,2.2
75,25.25,4.0,114.833333,2489.25,15.833333,75.0,2.166667
76,26.766667,4.0,106.333333,2306.6,16.866667,76.0,1.866667
77,29.107143,4.0,106.5,2205.071429,16.064286,77.0,1.857143
78,29.576471,4.0,112.117647,2296.764706,16.282353,78.0,2.117647
79,31.525,4.0,113.583333,2357.583333,15.991667,79.0,1.583333


In [155]:
df[df['cylinders'].isin([4, 6])].groupby(['model_year', 'cylinders'])['mpg'].mean()

model_year  cylinders
70          4            25.285714
            6            20.500000
71          4            27.461538
            6            18.000000
72          4            23.428571
73          4            22.727273
            6            19.000000
74          4            27.800000
            6            17.857143
75          4            25.250000
            6            17.583333
76          4            26.766667
            6            20.000000
77          4            29.107143
            6            19.500000
78          4            29.576471
            6            19.066667
79          4            31.525000
            6            22.950000
80          4            34.612000
            6            25.900000
81          4            32.814286
            6            23.428571
82          4            32.071429
            6            28.333333
Name: mpg, dtype: float64

In [156]:
year_cyl.swaplevel()

Unnamed: 0_level_0,Unnamed: 1_level_0,mpg,cylinders,displacement,weight,acceleration,model_year,origin
cylinders,model_year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
4,70,25.285714,4.0,107.0,2292.571429,16.0,70.0,2.285714
6,70,20.5,6.0,199.0,2710.5,15.5,70.0,1.0
8,70,14.111111,8.0,367.555556,3940.055556,11.194444,70.0,1.0
4,71,27.461538,4.0,101.846154,2056.384615,16.961538,71.0,1.923077
6,71,18.0,6.0,243.375,3171.875,14.75,71.0,1.0
8,71,13.428571,8.0,371.714286,4537.714286,12.214286,71.0,1.0
3,72,19.0,3.0,70.0,2330.0,13.5,72.0,3.0
4,72,23.428571,4.0,111.535714,2382.642857,17.214286,72.0,1.928571
8,72,13.615385,8.0,344.846154,4228.384615,13.0,72.0,1.0
3,73,18.0,3.0,70.0,2124.0,13.5,73.0,3.0


In [157]:
year_cyl.sort_index(level='model_year', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,mpg,cylinders,displacement,weight,acceleration,model_year,origin
model_year,cylinders,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
82,6,28.333333,6.0,225.0,2931.666667,16.033333,82.0,1.0
82,4,32.071429,4.0,118.571429,2402.321429,16.703571,82.0,1.714286
81,8,26.6,8.0,350.0,3725.0,19.0,81.0,1.0
81,6,23.428571,6.0,184.0,3093.571429,15.442857,81.0,1.714286
81,4,32.814286,4.0,108.857143,2275.47619,16.466667,81.0,2.095238
80,6,25.9,6.0,196.5,3145.5,15.05,80.0,2.0
80,5,36.4,5.0,121.0,2950.0,19.9,80.0,2.0
80,4,34.612,4.0,111.0,2360.08,17.144,80.0,2.2
80,3,23.7,3.0,70.0,2420.0,12.5,80.0,3.0
79,8,18.63,8.0,321.4,3862.9,15.4,79.0,1.0


In [158]:
numeric_columns1 = df.select_dtypes(include=[np.number]).columns

df[numeric_columns1].agg(['mean', 'std'])

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,model_year,origin
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005,1.572864
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627,0.802055


In [159]:
df.agg({'mpg': ['max', 'mean'], 'weight': ['mean', 'std']})

Unnamed: 0,mpg,weight
max,46.6,
mean,23.514573,2970.424623
std,,846.841774


# 34. Combining DataFrames - Concatenation

In [160]:
data_one = {'A': ['A0', 'A1', 'A2', 'A3'], 'B': ['B0', 'B1', 'B2', 'B3']}
data_two = {'C': ['C0', 'C1', 'C2', 'C3'], 'D': ['D0', 'D1', 'D2', 'D3']}

In [161]:
one = pd.DataFrame(data_one)
two = pd.DataFrame(data_two)

In [162]:
pd.concat([one, two])

Unnamed: 0,A,B,C,D
0,A0,B0,,
1,A1,B1,,
2,A2,B2,,
3,A3,B3,,
0,,,C0,D0
1,,,C1,D1
2,,,C2,D2
3,,,C3,D3


In [163]:
pd.concat([one, two], axis=0)

Unnamed: 0,A,B,C,D
0,A0,B0,,
1,A1,B1,,
2,A2,B2,,
3,A3,B3,,
0,,,C0,D0
1,,,C1,D1
2,,,C2,D2
3,,,C3,D3


In [164]:
pd.concat([one, two], axis=1)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [165]:
two.columns = one.columns # to make the columns the same

In [166]:
mdfy = pd.concat([one, two], axis=0)

In [167]:
mdfy.index = range(len(mdfy))

In [168]:
mdfy

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3
4,C0,D0
5,C1,D1
6,C2,D2
7,C3,D3


# 35. Combining DataFrames - Inner Merge

In [169]:
registrations = pd.DataFrame({'reg_id': [1, 2, 3, 4], 'name': ['Andrew', 'Bob', 'Claire', 'David']})
logins = pd.DataFrame({'log_id': [1, 2, 3, 4], 'name': ['Xavier', 'Andrew', 'Yolanda', 'Bob']})

In [170]:
registrations

Unnamed: 0,reg_id,name
0,1,Andrew
1,2,Bob
2,3,Claire
3,4,David


In [171]:
logins

Unnamed: 0,log_id,name
0,1,Xavier
1,2,Andrew
2,3,Yolanda
3,4,Bob


In [172]:
# help(pd.merge)

In [173]:
pd.merge(registrations, logins, how='inner', on='name')

Unnamed: 0,reg_id,name,log_id
0,1,Andrew,2
1,2,Bob,4


In [174]:
pd.merge(logins, registrations, how='inner', on='name')

Unnamed: 0,log_id,name,reg_id
0,2,Andrew,1
1,4,Bob,2


# 36. Combining DataFrames - Left and Right Merge

In [175]:
pd.merge(registrations, logins, how='left', on='name')

Unnamed: 0,reg_id,name,log_id
0,1,Andrew,2.0
1,2,Bob,4.0
2,3,Claire,
3,4,David,


In [176]:
pd.merge(registrations, logins, how='right', on='name')

Unnamed: 0,reg_id,name,log_id
0,,Xavier,1
1,1.0,Andrew,2
2,,Yolanda,3
3,2.0,Bob,4


# 37. Combining DataFrames - Outer Merge

In [177]:
registrations = pd.DataFrame({'reg_id': [1, 2, 3, 4], 'name': ['Andrew', 'Bob', 'Claire', 'David']})
logins = pd.DataFrame({'log_id': [1, 2, 3, 4], 'name': ['Xavier', 'Andrew', 'Yolanda', 'Bob']})

In [178]:
pd.merge(registrations, logins, how='outer', on='name')

Unnamed: 0,reg_id,name,log_id
0,1.0,Andrew,2.0
1,2.0,Bob,4.0
2,3.0,Claire,
3,4.0,David,
4,,Xavier,1.0
5,,Yolanda,3.0


In [179]:
logins

Unnamed: 0,log_id,name
0,1,Xavier
1,2,Andrew
2,3,Yolanda
3,4,Bob


In [180]:
pd.merge(registrations, logins, left_index=True, right_index=True, how='inner')

Unnamed: 0,reg_id,name_x,log_id,name_y
0,1,Andrew,1,Xavier
1,2,Bob,2,Andrew
2,3,Claire,3,Yolanda
3,4,David,4,Bob


In [181]:
registrations = registrations.set_index('index')

KeyError: "None of ['index'] are in the columns"

In [182]:
registrations.columns = ['reg_id', 'reg_name']

In [183]:
logins.columns = [ 'log_id', 'name']

In [184]:
registrations

Unnamed: 0,reg_id,reg_name
0,1,Andrew
1,2,Bob
2,3,Claire
3,4,David


In [185]:
results = pd.merge(registrations, logins, how='inner', left_on='reg_name', right_on='name')

In [186]:
results

Unnamed: 0,reg_id,reg_name,log_id,name
0,1,Andrew,2,Andrew
1,2,Bob,4,Bob


In [187]:
results.drop('name', axis=1)

Unnamed: 0,reg_id,reg_name,log_id
0,1,Andrew,2
1,2,Bob,4


In [188]:
registrations

Unnamed: 0,reg_id,reg_name
0,1,Andrew
1,2,Bob
2,3,Claire
3,4,David


In [189]:
registrations.columns = ['id', 'name']

In [190]:
logins.columns = ['id', 'name']

In [191]:
registrations

Unnamed: 0,id,name
0,1,Andrew
1,2,Bob
2,3,Claire
3,4,David


In [192]:
logins

Unnamed: 0,id,name
0,1,Xavier
1,2,Andrew
2,3,Yolanda
3,4,Bob


In [193]:
pd.merge(registrations, logins, how='inner', on='name', suffixes=('_reg', '_log'))

Unnamed: 0,id_reg,name,id_log
0,1,Andrew,2
1,2,Bob,4


# 38. Pandas - Text Methods for String Data

In [194]:
email = 'jose@email.com'

In [195]:
email.split('@')

['jose', 'email.com']

In [196]:
email.isdigit()

False

In [197]:
'5'.isdigit()

True

In [198]:
names = pd.Series(['andrew', 'bobo', 'claire', 'david', '5'])

In [199]:
names.str.upper()

0    ANDREW
1      BOBO
2    CLAIRE
3     DAVID
4         5
dtype: object

In [200]:
names.str.isdigit()

0    False
1    False
2    False
3    False
4     True
dtype: bool

In [201]:
tech_finance = ['GOOG, APPL, AMZN', 'JPM, BAC, GS']

In [202]:
len(tech_finance)

2

In [203]:
tickers = pd.Series(tech_finance)

In [204]:
tech = 'GOOG, APPL, AMZN'

In [205]:
tech.split(', ')[0]

'GOOG'

In [206]:
messy_names = pd.Series(['andrew  ', 'bo;bo', '   claire   '])

In [207]:
messy_names.str.replace(';', '').str.strip().str.capitalize()

0    Andrew
1      Bobo
2    Claire
dtype: object

In [208]:
def cleanup(name):
  name = name.replace(';', '')
  name = name.strip()
  name = name.capitalize()
  return name

In [209]:
messy_names.apply(cleanup)

0    Andrew
1      Bobo
2    Claire
dtype: object

In [210]:
import timeit

# code snippet to be executed only once 
setup = '''
import pandas as pd
import numpy as np
messy_names = pd.Series(["andrew  ","bo;bo","  claire  "])
def cleanup(name):
    name = name.replace(";","")
    name = name.strip()
    name = name.capitalize()
    return name
'''

# code snippet whose execution time is to be measured 
stmt_pandas_str = ''' 
messy_names.str.replace(";","").str.strip().str.capitalize()
'''

stmt_pandas_apply = '''
messy_names.apply(cleanup)
'''

stmt_pandas_vectorize='''
np.vectorize(cleanup)(messy_names)
'''

In [211]:
timeit.timeit(setup = setup,
              stmt = stmt_pandas_str,
              number = 10000) 

1.8006501999989268

In [212]:
timeit.timeit(setup = setup,
              stmt = stmt_pandas_apply,
              number = 10000) 

0.2894263999987743

In [213]:
timeit.timeit(setup = setup,
              stmt = stmt_pandas_vectorize,
              number = 10000) 

0.19421719999809284

# 39. Pandas - Time Methods for Date and Time Data

In [214]:
myyear = 2015
mymonth = 1
myday = 1
myhour = 2
mymin = 30
mysec = 15

In [215]:
mydate = dt.datetime(myyear, mymonth, myday)

In [216]:
mydatetime = dt.datetime(myyear, mymonth, myday, myhour, mymin, mysec)

In [217]:
mydatetime.year

2015

In [218]:
myser = pd.Series(['Nov 3, 1990', '2000-01-01', None])

In [219]:
timeser = pd.to_datetime(myser, format="mixed")

In [220]:
timeser

0   1990-11-03
1   2000-01-01
2          NaT
dtype: datetime64[ns]

In [221]:
obvi_euro_date = '31-12-2000'

In [222]:
pd.to_datetime(obvi_euro_date, dayfirst=True)

Timestamp('2000-12-31 00:00:00')

In [223]:
euro_date = '10-12-2000'

In [224]:
pd.to_datetime(euro_date, dayfirst=True)

Timestamp('2000-12-10 00:00:00')

In [225]:
style_date = '12--Dec--2000'

In [226]:
pd.to_datetime(style_date, format='%d--%b--%Y')

Timestamp('2000-12-12 00:00:00')

In [227]:
custom_date = '12th of Dec 2000'

In [228]:
pd.to_datetime(custom_date, format='%dth of %b %Y')

Timestamp('2000-12-12 00:00:00')

In [229]:
sales = pd.read_csv(r'data_frame\RetailSales_BeerWineLiquor.csv')

In [230]:
sales['DATE'] = pd.to_datetime(sales['DATE'])

In [231]:
sales

Unnamed: 0,DATE,MRTSSM4453USN
0,1992-01-01,1509
1,1992-02-01,1541
2,1992-03-01,1597
3,1992-04-01,1675
4,1992-05-01,1822
...,...,...
335,2019-12-01,6630
336,2020-01-01,4388
337,2020-02-01,4533
338,2020-03-01,5562


In [232]:
sales = pd.read_csv(r'data_frame\RetailSales_BeerWineLiquor.csv', parse_dates=['DATE'])

In [233]:
sales = sales.set_index('DATE')

In [234]:
sales.resample(rule='YE').mean()

Unnamed: 0_level_0,MRTSSM4453USN
DATE,Unnamed: 1_level_1
1992-12-31,1807.25
1993-12-31,1794.833333
1994-12-31,1841.75
1995-12-31,1833.916667
1996-12-31,1929.75
1997-12-31,2006.75
1998-12-31,2115.166667
1999-12-31,2206.333333
2000-12-31,2375.583333
2001-12-31,2468.416667


In [235]:
sales = pd.read_csv(r'data_frame\RetailSales_BeerWineLiquor.csv', parse_dates=['DATE'])

In [236]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 340 entries, 0 to 339
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   DATE           340 non-null    datetime64[ns]
 1   MRTSSM4453USN  340 non-null    int64         
dtypes: datetime64[ns](1), int64(1)
memory usage: 5.4 KB


In [237]:
sales['DATE'].dt.year

0      1992
1      1992
2      1992
3      1992
4      1992
       ... 
335    2019
336    2020
337    2020
338    2020
339    2020
Name: DATE, Length: 340, dtype: int32

# 40. Pandas Input and Output - CSV Files

In [238]:
os.getcwd()

'F:\\DataSpell\\data_science_ml_learning\\Section 5 Pandas'

In [239]:
df = pd.read_csv(r'data_frame\example.csv', index_col=0)

In [240]:
new = df.to_csv(r'data_frame\my_output.csv', index=False)

In [241]:
new

# 41. Pandas Input and Output - HTML Tables

In [242]:
url = 'https://en.wikipedia.org/wiki/World_population'

In [243]:
tables = pd.read_html(url)

In [248]:
len(tables)

30

In [250]:
tables[0]

Unnamed: 0,Population,1,2,3,4,5,6,7,8,9,10
0,Year,1804,1927,1960,1974,1987,1999,2011,2022,2037,2057
1,Years elapsed,"200,000+",123,33,14,13,12,12,11,15,20


In [253]:
tables[1]

Unnamed: 0,Region,2022 (percent),2030 (percent),2050 (percent)
0,Sub-Saharan Africa,"1,152 (14.51%)","1,401 (16.46%)","2,094 (21.62%)"
1,Northern Africa and Western Asia,549 (6.91%),617 (7.25%),771 (7.96%)
2,Central Asia and Southern Asia,"2,075 (26.13%)","2,248 (26.41%)","2,575 (26.58%)"
3,Eastern Asia and Southeastern Asia,"2,342 (29.49%)","2,372 (27.87%)","2,317 (23.92%)"
4,Europe and Northern America,"1,120 (14.10%)","1,129 (13.26%)","1,125 (11.61%)"
5,Latin America and the Caribbean,658 (8.29%),695 (8.17%),749 (7.73%)
6,Australia and New Zealand,31 (0.39%),34 (0.40%),38 (0.39%)
7,Oceania,14 (0.18%),15 (0.18%),20 (0.21%)
8,World,7942,8512,9687


In [254]:
world_topten = tables[1]

In [256]:
world_topten

Unnamed: 0,Region,2022 (percent),2030 (percent),2050 (percent)
0,Sub-Saharan Africa,"1,152 (14.51%)","1,401 (16.46%)","2,094 (21.62%)"
1,Northern Africa and Western Asia,549 (6.91%),617 (7.25%),771 (7.96%)
2,Central Asia and Southern Asia,"2,075 (26.13%)","2,248 (26.41%)","2,575 (26.58%)"
3,Eastern Asia and Southeastern Asia,"2,342 (29.49%)","2,372 (27.87%)","2,317 (23.92%)"
4,Europe and Northern America,"1,120 (14.10%)","1,129 (13.26%)","1,125 (11.61%)"
5,Latin America and the Caribbean,658 (8.29%),695 (8.17%),749 (7.73%)
6,Australia and New Zealand,31 (0.39%),34 (0.40%),38 (0.39%)
7,Oceania,14 (0.18%),15 (0.18%),20 (0.21%)
8,World,7942,8512,9687


# 42. Pandas Input and Output - Excel Files

In [260]:
df = pd.read_excel(r'data_frame\my_excel_file.xlsx', sheet_name='First_Sheet')

In [262]:
wb = pd.ExcelFile(r'data_frame\my_excel_file.xlsx')

In [264]:
wb.sheet_names

['First_Sheet']

In [265]:
excel_sheet_dict = pd.read_excel(r'data_frame\my_excel_file.xlsx', sheet_name=None)

In [268]:
excel_sheet_dict

{'First_Sheet':     a   b   c   d
 0   0   1   2   3
 1   4   5   6   7
 2   8   9  10  11
 3  12  13  14  15}

In [269]:
our_df = excel_sheet_dict['First_Sheet']

In [270]:
our_df.to_excel(r'data_frame\example_excel.xlsx', sheet_name='First_Sheet', index=False)

# 43. Pandas Input and Output - SQL Databases

In [274]:
temp_db = create_engine('sqlite:///:memory:')

In [275]:
pd.DataFrame(data=np.random.randint(low=0, high=100, size=(4, 4)), columns=['a', 'b', 'c', 'd']).to_sql(name='new_table', con=temp_db)

4

In [276]:
df.to_sql(name='new_table', con=temp_db)

ValueError: Table 'new_table' already exists.

In [281]:
new_dt = pd.read_sql(sql='new_table', con=temp_db)

In [283]:
new_dt

Unnamed: 0,index,a,b,c,d
0,0,42,66,9,5
1,1,26,66,22,47
2,2,27,46,14,52
3,3,74,54,3,94


In [285]:
result = pd.read_sql_query(sql='SELECT a, c FROM new_table', con=temp_db)

In [287]:
result

Unnamed: 0,a,c
0,42,9
1,26,22
2,27,14
3,74,3


# 44. Pandas Pivot Tables

In [289]:
df = pd.read_csv(r'data_frame\Sales_Funnel_CRM.csv')

In [296]:
df.pivot_table(index='Company', columns='Product', values='Licenses', aggfunc='sum')

Product,Analytics,GPS Positioning,Prediction,Tracking
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Google,150.0,,150.0,300.0
ATT,,,150.0,150.0
Apple,300.0,,,
BOBO,150.0,,,
CVS Health,,,,450.0
Cisco,300.0,300.0,,
Exxon Mobile,150.0,,,
IKEA,300.0,,,
Microsoft,,,,300.0
Salesforce,750.0,,,


In [298]:
pd.pivot_table(df, index='Company', aggfunc='sum', values=['Licenses', 'Units'])

Unnamed: 0_level_0,Account Manager,Account Number,Contact,Licenses,Product,Sale Price,Status
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Google,Edward ThorpEdward ThorpEdward Thorp,6370194,Larry PagerLarry PagerLarry Pager,600,AnalyticsPredictionTracking,3150000,PresentedPresentedUnder Review
ATT,Claude ShannonClaude Shannon,1396064,Cindy PhonerCindy Phoner,300,TrackingPrediction,1050000,Under ReviewPresented
Apple,Claude Shannon,405886,Cindy Phoner,300,Analytics,4550000,Won
BOBO,Edward Thorp,2192650,Larry Pager,150,Analytics,2450000,Lost
CVS Health,Claude Shannon,902797,Emma Gordian,450,Tracking,490000,Won
Cisco,Claude ShannonClaude Shannon,4338998,Emma GordianEmma Gordian,600,AnalyticsGPS Positioning,4900000,LostPresented
Exxon Mobile,Claude Shannon,470248,Cindy Phoner,150,Analytics,2100000,Presented
IKEA,Edward Thorp,420496,Elon Tusk,300,Analytics,4550000,Won
Microsoft,Edward Thorp,1216870,Will Grates,300,Tracking,350000,Under Review
Salesforce,Claude Shannon,2046943,Emma Gordian,750,Analytics,7000000,Won


In [300]:
df.groupby('Company').sum()

Unnamed: 0_level_0,Account Number,Contact,Account Manager,Product,Licenses,Sale Price,Status
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Google,6370194,Larry PagerLarry PagerLarry Pager,Edward ThorpEdward ThorpEdward Thorp,AnalyticsPredictionTracking,600,3150000,PresentedPresentedUnder Review
ATT,1396064,Cindy PhonerCindy Phoner,Claude ShannonClaude Shannon,TrackingPrediction,300,1050000,Under ReviewPresented
Apple,405886,Cindy Phoner,Claude Shannon,Analytics,300,4550000,Won
BOBO,2192650,Larry Pager,Edward Thorp,Analytics,150,2450000,Lost
CVS Health,902797,Emma Gordian,Claude Shannon,Tracking,450,490000,Won
Cisco,4338998,Emma GordianEmma Gordian,Claude ShannonClaude Shannon,AnalyticsGPS Positioning,600,4900000,LostPresented
Exxon Mobile,470248,Cindy Phoner,Claude Shannon,Analytics,150,2100000,Presented
IKEA,420496,Elon Tusk,Edward Thorp,Analytics,300,4550000,Won
Microsoft,1216870,Will Grates,Edward Thorp,Tracking,300,350000,Under Review
Salesforce,2046943,Emma Gordian,Claude Shannon,Analytics,750,7000000,Won


In [307]:
pd.pivot_table(df, index=['Account Manager', 'Contact'], values=['Sale Price'], columns=['Product'], aggfunc='sum', 
               fill_value=0, margins=True, margins_name='Grand Total')

Unnamed: 0_level_0,Unnamed: 1_level_0,Sale Price,Sale Price,Sale Price,Sale Price,Sale Price
Unnamed: 0_level_1,Product,Analytics,GPS Positioning,Prediction,Tracking,Grand Total
Account Manager,Contact,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Claude Shannon,Cindy Phoner,6650000,0,700000,350000,7700000
Claude Shannon,Emma Gordian,11550000,350000,0,490000,12390000
Edward Thorp,Elon Tusk,7350000,0,700000,0,8050000
Edward Thorp,Larry Pager,4550000,0,700000,350000,5600000
Edward Thorp,Will Grates,2450000,0,0,350000,2800000
Grand Total,,32550000,350000,2100000,1540000,36540000


# 45. Pandas Project Exercise Overview