# Lab task-3(AP21110010057)

In [12]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler


In [20]:
tips_df = pd.read_csv('tips.csv')
print("Original DataFrame:")
print(tips_df.head())

Original DataFrame:
   Unnamed: 0  total_bill   tip     sex smoker  day    time  size
0           0       16.99  1.01  Female     No  Sun  Dinner     2
1           1         NaN  1.66    Male     No  Sun  Dinner     3
2           2       21.01  3.50    Male     No  Sun  Dinner     3
3           3       23.68  3.31    Male     No  Sun  Dinner     2
4           4       24.59  3.61  Female     No  Sun  Dinner     4


1. Read the tips dataset from the provided source only. Handle the missing values with the appropriate techniques.

In [21]:
imputer = SimpleImputer(strategy='mean')
tips_df[['total_bill', 'tip', 'size']] = imputer.fit_transform(tips_df[['total_bill', 'tip', 'size']])
print("\nDataFrame after handling missing values:")
print(tips_df.head())



DataFrame after handling missing values:
   Unnamed: 0  total_bill   tip     sex smoker  day    time  size
0           0   16.990000  1.01  Female     No  Sun  Dinner   2.0
1           1   21.360195  1.66    Male     No  Sun  Dinner   3.0
2           2   21.010000  3.50    Male     No  Sun  Dinner   3.0
3           3   23.680000  3.31    Male     No  Sun  Dinner   2.0
4           4   24.590000  3.61  Female     No  Sun  Dinner   4.0


2. Handle the categorical data in the tips dataset with the relevant approaches such as label-encoding, one hot encoding, ordinal encoding.

In [22]:
label_encoder = LabelEncoder()
tips_df['sex_label'] = label_encoder.fit_transform(tips_df['sex'])

one_hot_encoder = pd.get_dummies(tips_df['day'], prefix='day')
tips_df = pd.concat([tips_df, one_hot_encoder], axis=1)

ordinal_encoder = OrdinalEncoder(categories=[['Thur', 'Fri', 'Sat', 'Sun']])
tips_df['day_ordinal'] = ordinal_encoder.fit_transform(tips_df[['day']])
print("\nDataFrame after handling categorical data:")
print(tips_df.head())




DataFrame after handling categorical data:
   Unnamed: 0  total_bill   tip     sex smoker  day    time  size  sex_label  \
0           0   16.990000  1.01  Female     No  Sun  Dinner   2.0          0   
1           1   21.360195  1.66    Male     No  Sun  Dinner   3.0          1   
2           2   21.010000  3.50    Male     No  Sun  Dinner   3.0          1   
3           3   23.680000  3.31    Male     No  Sun  Dinner   2.0          1   
4           4   24.590000  3.61  Female     No  Sun  Dinner   4.0          0   

   day_Fri  day_Sat  day_Sun  day_Thur  day_ordinal  
0    False    False     True     False          3.0  
1    False    False     True     False          3.0  
2    False    False     True     False          3.0  
3    False    False     True     False          3.0  
4    False    False     True     False          3.0  


3. Perform feature scaling techniques such as min-max normalization, standardization, z-score, on the tips dataset.

In [23]:
min_max_scaler = MinMaxScaler()
standard_scaler = StandardScaler()

tips_df[['total_bill_minmax', 'tip_minmax']] = min_max_scaler.fit_transform(tips_df[['total_bill', 'tip']])
tips_df[['total_bill_standard', 'tip_standard']] = standard_scaler.fit_transform(tips_df[['total_bill', 'tip']])
print("\nDataFrame after feature scaling:")
print(tips_df.head())


DataFrame after feature scaling:
   Unnamed: 0  total_bill   tip     sex smoker  day    time  size  sex_label  \
0           0   16.990000  1.01  Female     No  Sun  Dinner   2.0          0   
1           1   21.360195  1.66    Male     No  Sun  Dinner   3.0          1   
2           2   21.010000  3.50    Male     No  Sun  Dinner   3.0          1   
3           3   23.680000  3.31    Male     No  Sun  Dinner   2.0          1   
4           4   24.590000  3.61  Female     No  Sun  Dinner   4.0          0   

   day_Fri  day_Sat  day_Sun  day_Thur  day_ordinal  total_bill_minmax  \
0    False    False     True     False          3.0           0.291579   
1    False    False     True     False          3.0           0.383121   
2    False    False     True     False          3.0           0.375786   
3    False    False     True     False          3.0           0.431713   
4    False    False     True     False          3.0           0.450775   

   tip_minmax  total_bill_standard  tip_

4. Create a new feature representing the average tip percentage for each dining party size.

In [24]:
tips_df['tip_percentage'] = (tips_df['tip'] / tips_df['total_bill']) * 100
average_tip_percentage = tips_df.groupby('size')['tip_percentage'].mean().reset_index()
average_tip_percentage.columns = ['size', 'average_tip_percentage']
tips_df = pd.merge(tips_df, average_tip_percentage, on='size', how='left')
print("\nDataFrame with the new feature (average tip percentage for each dining party size):")
print(tips_df.head())



DataFrame with the new feature (average tip percentage for each dining party size):
   Unnamed: 0  total_bill   tip     sex smoker  day    time  size  sex_label  \
0           0   16.990000  1.01  Female     No  Sun  Dinner   2.0          0   
1           1   21.360195  1.66    Male     No  Sun  Dinner   3.0          1   
2           2   21.010000  3.50    Male     No  Sun  Dinner   3.0          1   
3           3   23.680000  3.31    Male     No  Sun  Dinner   2.0          1   
4           4   24.590000  3.61  Female     No  Sun  Dinner   4.0          0   

   day_Fri  day_Sat  day_Sun  day_Thur  day_ordinal  total_bill_minmax  \
0    False    False     True     False          3.0           0.291579   
1    False    False     True     False          3.0           0.383121   
2    False    False     True     False          3.0           0.375786   
3    False    False     True     False          3.0           0.431713   
4    False    False     True     False          3.0           0.

5. Create a new feature based on total bill and tips if total bill is greater than 10 dollar and tips is greater than 3 dollar mark as Highest-bills-with-tips otherwise Normal-bills

In [25]:
tips_df['bill_tip_category'] = 'Normal-bills'
tips_df.loc[(tips_df['total_bill'] > 10) & (tips_df['tip'] > 3), 'bill_tip_category'] = 'Highest-bills-with-tips'
print("\nDataFrame with the new feature based on total bill and tips:")
print(tips_df.head())


DataFrame with the new feature based on total bill and tips:
   Unnamed: 0  total_bill   tip     sex smoker  day    time  size  sex_label  \
0           0   16.990000  1.01  Female     No  Sun  Dinner   2.0          0   
1           1   21.360195  1.66    Male     No  Sun  Dinner   3.0          1   
2           2   21.010000  3.50    Male     No  Sun  Dinner   3.0          1   
3           3   23.680000  3.31    Male     No  Sun  Dinner   2.0          1   
4           4   24.590000  3.61  Female     No  Sun  Dinner   4.0          0   

   day_Fri  ...  day_Sun  day_Thur  day_ordinal  total_bill_minmax  \
0    False  ...     True     False          3.0           0.291579   
1    False  ...     True     False          3.0           0.383121   
2    False  ...     True     False          3.0           0.375786   
3    False  ...     True     False          3.0           0.431713   
4    False  ...     True     False          3.0           0.450775   

   tip_minmax  total_bill_standard  

In [26]:
tips_df.head()

Unnamed: 0.1,Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_label,day_Fri,...,day_Sun,day_Thur,day_ordinal,total_bill_minmax,tip_minmax,total_bill_standard,tip_standard,tip_percentage,average_tip_percentage,bill_tip_category
0,0,16.99,1.01,Female,No,Sun,Dinner,2.0,0,False,...,True,False,3.0,0.291579,0.001111,-0.538972,-1.439947,5.944673,14.824175,Normal-bills
1,1,21.360195,1.66,Male,No,Sun,Dinner,3.0,1,False,...,True,False,3.0,0.383121,0.073333,0.0,-0.969205,7.771465,14.77803,Normal-bills
2,2,21.01,3.5,Male,No,Sun,Dinner,3.0,1,False,...,True,False,3.0,0.375786,0.277778,-0.043189,0.363356,16.658734,14.77803,Highest-bills-with-tips
3,3,23.68,3.31,Male,No,Sun,Dinner,2.0,1,False,...,True,False,3.0,0.431713,0.256667,0.286099,0.225754,13.978041,14.824175,Highest-bills-with-tips
4,4,24.59,3.61,Female,No,Sun,Dinner,4.0,0,False,...,True,False,3.0,0.450775,0.29,0.398329,0.44302,14.680765,14.594901,Highest-bills-with-tips
