### Loading the standard libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Load the icecream data

In [3]:
icecream = pd.read_csv('Ice Cream Ratings.csv')
icecream.head()

Unnamed: 0,Date,Flavor Rating,Texture Rating,Overall Rating
0,1/1/2022,0.22309,0.04022,0.600129
1,1/2/2022,0.635886,0.938476,0.106264
2,1/3/2022,0.442323,0.044154,0.598112
3,1/4/2022,0.389128,0.549676,0.489353
4,1/5/2022,0.386887,0.519439,0.98828


In [4]:
icecream.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Date            7 non-null      object 
 1   Flavor Rating   7 non-null      float64
 2   Texture Rating  7 non-null      float64
 3   Overall Rating  7 non-null      float64
dtypes: float64(3), object(1)
memory usage: 352.0+ bytes


#### Note : The info() output displays Date column data type as object, but Date should be date data type

In [5]:
icecream = pd.read_csv('Ice Cream Ratings.csv', parse_dates = ['Date'])
icecream.head()

Unnamed: 0,Date,Flavor Rating,Texture Rating,Overall Rating
0,2022-01-01,0.22309,0.04022,0.600129
1,2022-01-02,0.635886,0.938476,0.106264
2,2022-01-03,0.442323,0.044154,0.598112
3,2022-01-04,0.389128,0.549676,0.489353
4,2022-01-05,0.386887,0.519439,0.98828


In [6]:
icecream.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Date            7 non-null      datetime64[ns]
 1   Flavor Rating   7 non-null      float64       
 2   Texture Rating  7 non-null      float64       
 3   Overall Rating  7 non-null      float64       
dtypes: datetime64[ns](1), float64(3)
memory usage: 352.0 bytes


#### parse_dates = ['name of the date column'] will convert any date related columns to datetime64 data type

#### Set the date column as index always when ever there is a date column present in the dataset

In [7]:
icecream.set_index('Date')

Unnamed: 0_level_0,Flavor Rating,Texture Rating,Overall Rating
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,0.22309,0.04022,0.600129
2022-01-02,0.635886,0.938476,0.106264
2022-01-03,0.442323,0.044154,0.598112
2022-01-04,0.389128,0.549676,0.489353
2022-01-05,0.386887,0.519439,0.98828
2022-01-06,0.877984,0.193588,0.832827
2022-01-07,0.140995,0.32511,0.105147


### Merging and Joining

In [8]:
data1 = pd.read_csv('LOTR.csv')
data1

Unnamed: 0,FellowshipID,FirstName,Skills
0,1001,Frodo,Hiding
1,1002,Samwise,Gardening
2,1003,Gandalf,Spells
3,1004,Pippin,Fireworks


In [9]:
data2 = pd.read_csv('LOTR 2.csv')
data2

Unnamed: 0,FellowshipID,FirstName,Age
0,1001,Frodo,50
1,1002,Samwise,39
2,1006,Legolas,2931
3,1007,Elrond,6520
4,1008,Barromir,51


### Merge data1 and data2 

In [11]:
pd.merge(data1, data2)

Unnamed: 0,FellowshipID,FirstName,Skills,Age
0,1001,Frodo,Hiding,50
1,1002,Samwise,Gardening,39


### Merging 2 datasets

- You need 3 things inorder to merge 2 dataset
1. 2 different datasets
2. Atleast One Column should be common
3. Type of Merge being performing   
    a. Only 4 types of Join available  
         i. Inner(default)  
         ii. Outer  
         iii. Right  
         iv. Left  

In [12]:
pd.merge(data1, data2, on = 'FellowshipID', how = 'inner')

Unnamed: 0,FellowshipID,FirstName_x,Skills,FirstName_y,Age
0,1001,Frodo,Hiding,Frodo,50
1,1002,Samwise,Gardening,Samwise,39


In [13]:
pd.merge(data1, data2, on = 'FellowshipID', how = 'outer')

Unnamed: 0,FellowshipID,FirstName_x,Skills,FirstName_y,Age
0,1001,Frodo,Hiding,Frodo,50.0
1,1002,Samwise,Gardening,Samwise,39.0
2,1003,Gandalf,Spells,,
3,1004,Pippin,Fireworks,,
4,1006,,,Legolas,2931.0
5,1007,,,Elrond,6520.0
6,1008,,,Barromir,51.0


In [14]:
data1

Unnamed: 0,FellowshipID,FirstName,Skills
0,1001,Frodo,Hiding
1,1002,Samwise,Gardening
2,1003,Gandalf,Spells
3,1004,Pippin,Fireworks


In [15]:
data2

Unnamed: 0,FellowshipID,FirstName,Age
0,1001,Frodo,50
1,1002,Samwise,39
2,1006,Legolas,2931
3,1007,Elrond,6520
4,1008,Barromir,51


In [16]:
pd.merge(data1, data2, on = 'FellowshipID', how = 'left')

Unnamed: 0,FellowshipID,FirstName_x,Skills,FirstName_y,Age
0,1001,Frodo,Hiding,Frodo,50.0
1,1002,Samwise,Gardening,Samwise,39.0
2,1003,Gandalf,Spells,,
3,1004,Pippin,Fireworks,,


### Left Join : Data1 is display as it is and common rows from data2 are displayed

In [17]:
data1

Unnamed: 0,FellowshipID,FirstName,Skills
0,1001,Frodo,Hiding
1,1002,Samwise,Gardening
2,1003,Gandalf,Spells
3,1004,Pippin,Fireworks


In [18]:
data2

Unnamed: 0,FellowshipID,FirstName,Age
0,1001,Frodo,50
1,1002,Samwise,39
2,1006,Legolas,2931
3,1007,Elrond,6520
4,1008,Barromir,51


In [19]:
pd.merge(data1, data2, on = 'FellowshipID', how = 'right')

Unnamed: 0,FellowshipID,FirstName_x,Skills,FirstName_y,Age
0,1001,Frodo,Hiding,Frodo,50
1,1002,Samwise,Gardening,Samwise,39
2,1006,,,Legolas,2931
3,1007,,,Elrond,6520
4,1008,,,Barromir,51


In [None]:
pd.merge(data1, data2, on = 'FellowshipID', how = 'left')   ### In this case data1 is the Left dataset
### merge(Left, right, on = '', how = '')

In [20]:
data1

Unnamed: 0,FellowshipID,FirstName,Skills
0,1001,Frodo,Hiding
1,1002,Samwise,Gardening
2,1003,Gandalf,Spells
3,1004,Pippin,Fireworks


In [21]:
data2

Unnamed: 0,FellowshipID,FirstName,Age
0,1001,Frodo,50
1,1002,Samwise,39
2,1006,Legolas,2931
3,1007,Elrond,6520
4,1008,Barromir,51


In [22]:
pd.merge(data1, data2, on = 'FellowshipID', how = 'left')

Unnamed: 0,FellowshipID,FirstName_x,Skills,FirstName_y,Age
0,1001,Frodo,Hiding,Frodo,50.0
1,1002,Samwise,Gardening,Samwise,39.0
2,1003,Gandalf,Spells,,
3,1004,Pippin,Fireworks,,


In [23]:
pd.merge(data2, data1, on = 'FellowshipID', how = 'left')

Unnamed: 0,FellowshipID,FirstName_x,Age,FirstName_y,Skills
0,1001,Frodo,50,Frodo,Hiding
1,1002,Samwise,39,Samwise,Gardening
2,1006,Legolas,2931,,
3,1007,Elrond,6520,,
4,1008,Barromir,51,,


### Sample data

In [30]:
dic = {
    'name' : ['Anil', 'Sam', 'Riya', 'Reema', 'Seema', 'Sunil', 'Alok', 'Sita'],
    'Age' : [23, 28, 35, 47, 56, 22, 30, 40],
    'Gender' : ['M', 'M', 'F', 'F', 'F', 'M','M', 'F'],
    'Salary' : [25000, 35000, 45000, 50000, 65000, 75000, 85000, 95000]
}
dic

{'name': ['Anil', 'Sam', 'Riya', 'Reema', 'Seema', 'Sunil', 'Alok', 'Sita'],
 'Age': [23, 28, 35, 47, 56, 22, 30, 40],
 'Gender': ['M', 'M', 'F', 'F', 'F', 'M', 'M', 'F'],
 'Salary': [25000, 35000, 45000, 50000, 65000, 75000, 85000, 95000]}

In [31]:
data = pd.DataFrame(dic)
data

Unnamed: 0,name,Age,Gender,Salary
0,Anil,23,M,25000
1,Sam,28,M,35000
2,Riya,35,F,45000
3,Reema,47,F,50000
4,Seema,56,F,65000
5,Sunil,22,M,75000
6,Alok,30,M,85000
7,Sita,40,F,95000


### Calculate 10% hiked salary for all employees and create a new column 'Updated Salary' in the dataset

In [32]:
data['Salary'] * 1.1

0     27500.0
1     38500.0
2     49500.0
3     55000.0
4     71500.0
5     82500.0
6     93500.0
7    104500.0
Name: Salary, dtype: float64

In [33]:
data['Hiked Salary'] = data['Salary'] * 1.1
data

Unnamed: 0,name,Age,Gender,Salary,Hiked Salary
0,Anil,23,M,25000,27500.0
1,Sam,28,M,35000,38500.0
2,Riya,35,F,45000,49500.0
3,Reema,47,F,50000,55000.0
4,Seema,56,F,65000,71500.0
5,Sunil,22,M,75000,82500.0
6,Alok,30,M,85000,93500.0
7,Sita,40,F,95000,104500.0


### Create Age bracket from the Age column where the conditions are 
1. 0 - 20 = Young
2. 20 - 40 = Adult
3. 40 - 60 = Old
4. '>' 60 = Retired

In [34]:
def age_bracket(x):
    if x < 20:
        return 'Young'
    elif x >= 20 and x < 40:
        return 'Adult'
    elif x >= 40 and x < 60:
        return 'Old'
    else:
        return 'Retired'

In [46]:
data['Age Bracket'] = data['Age'].apply(age_bracket)
data

Unnamed: 0,name,Age,Gender,Salary,Hiked Salary,Age Bracket
0,Anil,23,M,25000,27500.0,Adult
1,Sam,28,M,35000,38500.0,Adult
2,Riya,35,F,45000,49500.0,Adult
3,Reema,47,F,50000,55000.0,Old
4,Seema,56,F,65000,71500.0,Old
5,Sunil,22,M,75000,82500.0,Adult
6,Alok,30,M,85000,93500.0,Adult
7,Sita,40,F,95000,104500.0,Old


In [47]:
data = pd.DataFrame(dic)
data

Unnamed: 0,name,Age,Gender,Salary
0,Anil,23,M,25000
1,Sam,28,M,35000
2,Riya,35,F,45000
3,Reema,47,F,50000
4,Seema,56,F,65000
5,Sunil,22,M,75000
6,Alok,30,M,85000
7,Sita,40,F,95000


## Calculate the 10% Hiked Salary and Store the result in a new column 'Hiked Salary'

In [48]:
data['Hiked Salary'] = data['Salary'].apply(lambda x : x * 1.1)
data

Unnamed: 0,name,Age,Gender,Salary,Hiked Salary
0,Anil,23,M,25000,27500.0
1,Sam,28,M,35000,38500.0
2,Riya,35,F,45000,49500.0
3,Reema,47,F,50000,55000.0
4,Seema,56,F,65000,71500.0
5,Sunil,22,M,75000,82500.0
6,Alok,30,M,85000,93500.0
7,Sita,40,F,95000,104500.0


In [50]:
data['Age bracket'] = data['Age'].apply(age_bracket)
data

Unnamed: 0,name,Age,Gender,Salary,Hiked Salary,Age bracket
0,Anil,23,M,25000,27500.0,Adult
1,Sam,28,M,35000,38500.0,Adult
2,Riya,35,F,45000,49500.0,Adult
3,Reema,47,F,50000,55000.0,Old
4,Seema,56,F,65000,71500.0,Old
5,Sunil,22,M,75000,82500.0,Adult
6,Alok,30,M,85000,93500.0,Adult
7,Sita,40,F,95000,104500.0,Old


### Save the above Dataset in a csv file on your computer

In [59]:
data.to_csv('SalaryData.csv', index = False)

In [60]:
data.to_csv('SalaryData.tsv', sep = '\t', index = False)

In [61]:
data.to_excel('SalaryData.xlsx', index = False)

### Load the seaborn inbuilt tips data

In [62]:
data = sns.load_dataset('tips')
data.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


### What percent of Male Customers are smokers?

In [63]:
### sex = Male
### Smoker = Yes
## % 

In [64]:
data[(data['sex'] == 'Male') & (data['smoker'] == 'Yes')]['sex'].count() / len(data) * 100

24.59016393442623

In [66]:
data.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


### What percent of customers are visiting for Dinner?

In [67]:
data['time'].value_counts(normalize = True) * 100

Dinner    72.131148
Lunch     27.868852
Name: time, dtype: float64

- Observation 72.13 % of customer are visiting only for Dinner

### Percentage of for every Customer visiting both Lunch and Dinner

In [70]:
data.groupby(['sex', 'time'])['size'].agg('count') /len(data) * 100

sex     time  
Male    Lunch     13.524590
        Dinner    50.819672
Female  Lunch     14.344262
        Dinner    21.311475
Name: size, dtype: float64

### What % of Customer like to visit the restaurant on Weekdays and on Weekends

In [71]:
data.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [72]:
data.groupby(['sex', 'day'])['size'].agg('count') / len(data) * 100

sex     day 
Male    Thur    12.295082
        Fri      4.098361
        Sat     24.180328
        Sun     23.770492
Female  Thur    13.114754
        Fri      3.688525
        Sat     11.475410
        Sun      7.377049
Name: size, dtype: float64