In [5]:
import pandas as pd
# Load the seaborn "tips" sample data set directly from its GitHub CSV.
TIPS_CSV_URL = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv"
df = pd.read_csv(TIPS_CSV_URL)

# Preview the first five rows.
print(df.head())

   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4


# Exploring the Data

In [11]:
# Report the number of rows and columns as a (rows, columns) tuple.
dimensions = df.shape
print('shape', dimensions)

shape (244, 7)


In [13]:
# Show data types, non-null counts, and memory usage.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB
None


# Filtering and Sorting the Data

In [18]:
# Keep only the rows whose total bill is greater than 20.
bill_over_20 = df['total_bill'].gt(20)
filtered_df = df.loc[bill_over_20]
print(filtered_df)

     total_bill   tip     sex smoker  day    time  size
2         21.01  3.50    Male     No  Sun  Dinner     3
3         23.68  3.31    Male     No  Sun  Dinner     2
4         24.59  3.61  Female     No  Sun  Dinner     4
5         25.29  4.71    Male     No  Sun  Dinner     4
7         26.88  3.12    Male     No  Sun  Dinner     4
..          ...   ...     ...    ...  ...     ...   ...
237       32.83  1.17    Male    Yes  Sat  Dinner     2
238       35.83  4.67  Female     No  Sat  Dinner     3
239       29.03  5.92    Male     No  Sat  Dinner     3
240       27.18  2.00  Female    Yes  Sat  Dinner     2
241       22.67  2.00    Male    Yes  Sat  Dinner     2

[97 rows x 7 columns]


In [20]:
# Keep only the rows whose tip is greater than 5.
tip_over_5 = df['tip'].gt(5)
filtered_dff = df.loc[tip_over_5]
print(filtered_dff)

     total_bill    tip     sex smoker   day    time  size
23        39.42   7.58    Male     No   Sat  Dinner     4
44        30.40   5.60    Male     No   Sun  Dinner     4
47        32.40   6.00    Male     No   Sun  Dinner     4
52        34.81   5.20  Female     No   Sun  Dinner     4
59        48.27   6.73    Male     No   Sat  Dinner     4
85        34.83   5.17  Female     No  Thur   Lunch     4
88        24.71   5.85    Male     No  Thur   Lunch     2
116       29.93   5.07    Male     No   Sun  Dinner     4
141       34.30   6.70    Male     No  Thur   Lunch     6
155       29.85   5.14  Female     No   Sun  Dinner     5
170       50.81  10.00    Male    Yes   Sat  Dinner     3
172        7.25   5.15    Male    Yes   Sun  Dinner     2
181       23.33   5.65    Male    Yes   Sun  Dinner     2
183       23.17   6.50    Male    Yes   Sun  Dinner     4
211       25.89   5.16    Male    Yes   Sat  Dinner     4
212       48.33   9.00    Male     No   Sat  Dinner     4
214       28.1

In [40]:
# Order the rows from the smallest tip to the largest.
sort_column = 'tip'
sorted_df = df.sort_values(by=sort_column, ascending=True)
print(sorted_df)

     total_bill    tip     sex smoker   day    time  size
67         3.07   1.00  Female    Yes   Sat  Dinner     1
236       12.60   1.00    Male    Yes   Sat  Dinner     2
92         5.75   1.00  Female    Yes   Fri  Dinner     2
111        7.25   1.00  Female     No   Sat  Dinner     1
0         16.99   1.01  Female     No   Sun  Dinner     2
..          ...    ...     ...    ...   ...     ...   ...
141       34.30   6.70    Male     No  Thur   Lunch     6
59        48.27   6.73    Male     No   Sat  Dinner     4
23        39.42   7.58    Male     No   Sat  Dinner     4
212       48.33   9.00    Male     No   Sat  Dinner     4
170       50.81  10.00    Male    Yes   Sat  Dinner     3

[244 rows x 7 columns]


# Grouping and Aggregation

In [53]:
# Mean tip for each value of the 'sex' column.
mean_tip_by_sex = df.groupby('sex')['tip'].mean()
print(mean_tip_by_sex)

sex
Female    2.833448
Male      3.089618
Name: tip, dtype: float64


In [55]:
# Sum the total bill collected on each day.
bills_by_day = df.groupby('day')['total_bill']
total_bill_by_day = bills_by_day.sum()
print(total_bill_by_day)

day
Fri      325.88
Sat     1778.40
Sun     1627.16
Thur    1096.33
Name: total_bill, dtype: float64


# Checking for Missing Data

In [58]:
# Count the missing values in each column.
missing_values = df.isnull().sum()
# Print the label on its own line: with the default separator the label shares
# a line with the first row of the Series, which misaligns that row with the
# rest of the table.
print(" Missing Values ", missing_values, sep="\n")

 Missing Values  total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64


# Create New Columns

In [85]:
# Add a column giving the tip as a percentage of the total bill.
tip_share = df['tip'].div(df['total_bill'])
df['tip_percent'] = tip_share.mul(100)

# Preview the new column next to the two columns it is derived from.
preview_columns = ['total_bill', 'tip', 'tip_percent']
print(df[preview_columns].head())

                     

   total_bill   tip  tip_percent
0       16.99  1.01     5.944673
1       10.34  1.66    16.054159
2       21.01  3.50    16.658734
3       23.68  3.31    13.978041
4       24.59  3.61    14.680765


# Saving the Clean / Modified Data

In [90]:
# Write the DataFrame to a CSV file (relative path), omitting the row index.
output_path = "clean tips data.csv"
df.to_csv(output_path, index=False)