In [56]:
import pandas as pd

In [57]:
# Read the csv file
df = pd.read_csv('tips.csv')
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [58]:
# display first 5 rows of the data
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [59]:
# display last 5 rows of the data
df.tail()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.0,Female,Yes,Sat,Dinner,2
241,22.67,2.0,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2
243,18.78,3.0,Female,No,Thur,Dinner,2


In [60]:
# generate 3 random data from the table

df.sample(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
56,38.01,3.0,Male,Yes,Sat,Dinner,4
152,17.26,2.74,Male,No,Sun,Dinner,3
125,29.8,4.2,Female,No,Thur,Lunch,6


* Every time you run df.sample() you will get new samples; if we want to get one fixed sample we need seed
* In pandas seed means random_state
* We should set random_state >= 0

In [61]:
df.sample(3, random_state=2023)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
154,19.77,2.0,Male,No,Sun,Dinner,4
4,24.59,3.61,Female,No,Sun,Dinner,4
30,9.55,1.45,Male,No,Sat,Dinner,2


# Data Profiling and Inspection

* Data profiling

data profiling is about understanding the characteristics of your dataset—like what types of values are there, how many missing values, or the range of numbers.

* Inspection 

data inspection involves looking at individual rows, columns, or specific data points to see if everything is as expected.


In [62]:
# display information about the table
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB


* df.info() shows 244 not null in each column so, we don't have missing data.

In [63]:
# this is the datatypes of the df object
type(df)

pandas.core.frame.DataFrame

In [64]:

df.shape  # (no.of_rows,no.of_columns)

(244, 7)

In [65]:
# what are the column names
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [66]:
# datatypes of each column

df.dtypes


total_bill    float64
tip           float64
sex            object
smoker         object
day            object
time           object
size            int64
dtype: object

In [67]:
# how to verify nun or not?
df.isna().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [68]:
# Descriptive statistics of the data (mean,median,mode,Quartile,SD)
df.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


For total_bill column:

* count = 244 : There are 244 not null rows in total_bill column
* mean = 19.78 : The average total_bill is 19.78
* std = 8.90 : The standard deviation of total bill is 8.90
* min = 3.07 : The minimum total_bill is 3.07
* Q1 = 25% of the data points lies below 13.34 for tatal_bill column
* Q2 = 50% of the data points lies below 17.79 for tatal_bill column
* Q3= 75% of the data points lies below 24.12 for tatal_bill column
* The maximum total_bill is 50.81

In [69]:
# I want to describe objects type too. so we set the include ='object'
df.describe(include='object')

Unnamed: 0,sex,smoker,day,time
count,244,244,244,244
unique,2,2,4,2
top,Male,No,Sat,Dinner
freq,157,151,87,176


* there are 244 not null rows in sex column
* the sex columns contains two unique values
* the most occuring (top) values is male with 157 times/frequemcy

In [70]:
# disply the unique values
df['sex'].unique()

array(['Female', 'Male'], dtype=object)

In [71]:
# find the no.of unique values
df['sex'].nunique()

2

In [72]:
# count the how many male or female are there
df['sex'].value_counts()

sex
Male      157
Female     87
Name: count, dtype: int64

# find the how many people are present smoker or not?

In [73]:
# length of the smoker
len(df['smoker'])

244

In [74]:
# find the unique value
df['smoker'].unique()

array(['No', 'Yes'], dtype=object)

In [75]:
# count the how many smoker or not present
df['smoker'].value_counts()

smoker
No     151
Yes     93
Name: count, dtype: int64

* smoker = 93 are present
* Non_smoker = 151 are present

# find on average who tips more smoker or non smoker ?

In [76]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [77]:

df['smoker'] == 'Yes' # check the no.of smoker

0      False
1      False
2      False
3      False
4      False
       ...  
239    False
240     True
241     True
242    False
243    False
Name: smoker, Length: 244, dtype: bool

In [78]:
df['smoker']=='No' # check the no.of non smoker

0       True
1       True
2       True
3       True
4       True
       ...  
239     True
240    False
241    False
242     True
243     True
Name: smoker, Length: 244, dtype: bool

`df[df['smoker']== 'Yes'] vs df.loc[df['smoker']== 'Yes']`

* df[df['smoker']== 'Yes'] : If you only need to filter rows based on a condition.
* df.loc[df['smoker']== 'Yes'] :  If you want to be more explicit, especially when dealing with both row and column selections



In [79]:
df[df['smoker']== 'Yes'] # find the no.of smoker

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
56,38.01,3.00,Male,Yes,Sat,Dinner,4
58,11.24,1.76,Male,Yes,Sat,Dinner,2
60,20.29,3.21,Male,Yes,Sat,Dinner,2
61,13.81,2.00,Male,Yes,Sat,Dinner,2
62,11.02,1.98,Male,Yes,Sat,Dinner,2
...,...,...,...,...,...,...,...
234,15.53,3.00,Male,Yes,Sat,Dinner,2
236,12.60,1.00,Male,Yes,Sat,Dinner,2
237,32.83,1.17,Male,Yes,Sat,Dinner,2
240,27.18,2.00,Female,Yes,Sat,Dinner,2


In [80]:
df.loc[df['smoker'] == 'Yes']

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
56,38.01,3.00,Male,Yes,Sat,Dinner,4
58,11.24,1.76,Male,Yes,Sat,Dinner,2
60,20.29,3.21,Male,Yes,Sat,Dinner,2
61,13.81,2.00,Male,Yes,Sat,Dinner,2
62,11.02,1.98,Male,Yes,Sat,Dinner,2
...,...,...,...,...,...,...,...
234,15.53,3.00,Male,Yes,Sat,Dinner,2
236,12.60,1.00,Male,Yes,Sat,Dinner,2
237,32.83,1.17,Male,Yes,Sat,Dinner,2
240,27.18,2.00,Female,Yes,Sat,Dinner,2


In [81]:
# calculate the tip mean of smoker
df.loc[df['smoker']== 'Yes'].tip.mean()

3.008709677419355

In [82]:
# calculate the tip mean of non smoker
df.loc[df['smoker']== 'No'].tip.mean()

2.9918543046357615

* No.of smoker gives more tips 3.008
* No.of non smoker gives less tips 2.99 than smoker

# find on average in which day you will get more tips?

In [83]:
# first we check the unique values of the day
df['day'].unique()

array(['Sun', 'Sat', 'Thur', 'Fri'], dtype=object)

In [84]:
df.loc[df['day'] == 'Sun'].tip.mean()

3.2551315789473687

In [85]:
df.loc[df['day'] == 'Sat'].tip.mean()

2.993103448275862

In [86]:
df.loc[df['day'] == 'Sat'].tip.mean()

2.993103448275862

In [87]:
df.loc[df['day'] == 'Fri'].tip.mean()

2.7347368421052627

* In sunday give the more no.of tips 3.25 .

*  By using the loc funtion to find the every values mean is to complex so it is not possible.

# loc vs groupby

* loc :

  when you want to select specific rows and columns based on labels or conditions 
  


* groupby :

when you want to group data based on some criteria and perform operations within each group.
## Example:
grouped_data = df.groupby('column_name').mean()



In [88]:
# By using groupby we can find the tip mean for a day column in one line
df.groupby(['day']).tip.mean() 

day
Fri     2.734737
Sat     2.993103
Sun     3.255132
Thur    2.771452
Name: tip, dtype: float64

In [89]:
df.groupby(['time']).tip.mean()

time
Dinner    3.102670
Lunch     2.728088
Name: tip, dtype: float64

In [90]:
df.groupby(['size']).tip.mean()

size
1    1.437500
2    2.582308
3    3.393158
4    4.135405
5    4.028000
6    5.225000
Name: tip, dtype: float64

# find on average give more tip female smoker or male ?

In [91]:
# We can use to find the multiple column statistical data using groupby
df.groupby(['sex','smoker']).tip.mean()

sex     smoker
Female  No        2.773519
        Yes       2.931515
Male    No        3.113402
        Yes       3.051167
Name: tip, dtype: float64

* on average female smoker gives more tips
* on average male non smoker gives more tips

# loc vs iloc

In [92]:
df.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3


In [93]:
df[12:14, ['total_bill', 'tip']]

InvalidIndexError: (slice(12, 14, None), ['total_bill', 'tip'])

* loc 
 
 In loc we can indexing the row but we can't indexing column i.e we define the column_name
synax:
  
  df.loc[row_index, column_name]

In [94]:
df.loc[12:14, ['total_bill','tip']]

Unnamed: 0,total_bill,tip
12,15.42,1.57
13,18.43,3.0
14,14.83,3.02


* 14 row pani aayo? loc vaneko location ho index hoina; loc le last ko value linxa

In [95]:

df.loc[12:14, 0:2] 

TypeError: cannot do slice indexing on Index with these indexers [0] of type int

* loc ma column ko thau ma index dinu paudaina column name nai dinu parxa in list
* 0:2 klekhna paudaina loc expects column name estends not a index

# iloc
 
 In iloc we can indexing the row but as well as indexing column it is used when we don't known the column name

synax:

df.iloc[row_index, column_index]


In [99]:
# access 13 or 14 row and first 2 column
df.iloc[12:14, :2]

Unnamed: 0,total_bill,tip
12,15.42,1.57
13,18.43,3.0


In [100]:
# find last 3 column and first 5 rows
df.iloc[:5,-3:]

Unnamed: 0,day,time,size
0,Sun,Dinner,2
1,Sun,Dinner,3
2,Sun,Dinner,3
3,Sun,Dinner,2
4,Sun,Dinner,4
