In [1]:
import numpy as np
import pandas as pd

In [2]:
## Read the sample records from the CSV file in the working directory
sample_csv = "Sample data.csv"
data = pd.read_csv(sample_csv)

## Bare last expression -> rich display of the loaded frame
data

Unnamed: 0,Name,Age,Salary,Gender
0,Anil,22,25000,M
1,Sunil,24,35000,M
2,Ashok,26,45000,M
3,Seema,28,55000,F
4,Reema,30,65000,F


In [3]:
## Display the total rows and columns present in the data
## (shape is a (rows, columns) tuple)

data.shape

(5, 4)

In [4]:
## Display all the column names of the data (returned as a pandas Index)

data.columns

Index(['Name', 'Age', 'Salary', 'Gender'], dtype='object')

In [5]:
## Display the data type of each column of the data

data.dtypes

Name      object
Age        int64
Salary     int64
Gender    object
dtype: object

In [6]:
## Display the type of the `data` object itself (not of its columns)

type(data)

pandas.core.frame.DataFrame

In [7]:
## Get a concise summary of the data: columns, non-null counts, dtypes and memory usage

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    5 non-null      object
 1   Age     5 non-null      int64 
 2   Salary  5 non-null      int64 
 3   Gender  5 non-null      object
dtypes: int64(2), object(2)
memory usage: 288.0+ bytes


In [8]:
## Build the raw records column by column; np.nan marks a missing value
dic = dict(
    Name=["Anesha", "Arya", "Ajit", "Kumar", "Santosh"],
    Age=[34, 57, 28, np.nan, 40],                  ## Kumar's age is missing
    Gender=["F", "F", "M", "M", "M"],
    Salary=[56000, np.nan, 23000, 45000, np.nan],  ## Arya's and Santosh's salaries are missing
)
dic

{'Name': ['Anesha', 'Arya', 'Ajit', 'Kumar', 'Santosh'],
 'Age': [34, 57, 28, nan, 40],
 'Gender': ['F', 'F', 'M', 'M', 'M'],
 'Salary': [56000, nan, 23000, 45000, nan]}

In [9]:
## Turn the dictionary into a DataFrame: each key becomes a column
df = pd.DataFrame(data=dic)

## Show the resulting frame
df

Unnamed: 0,Name,Age,Gender,Salary
0,Anesha,34.0,F,56000.0
1,Arya,57.0,F,
2,Ajit,28.0,M,23000.0
3,Kumar,,M,45000.0
4,Santosh,40.0,M,


In [10]:
## Summarise df; the Non-Null Count column shows how many values are present in each column

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    5 non-null      object 
 1   Age     4 non-null      float64
 2   Gender  5 non-null      object 
 3   Salary  3 non-null      float64
dtypes: float64(2), object(2)
memory usage: 288.0+ bytes


# Note:
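- The Non-Null Count shows how many values are present in each column; a count lower than the number of rows means the column has missing values (here Age has 1 missing value and Salary has 2).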

## Arithmetic functions on the data

In [11]:
## Re-display the sample data before computing statistics on it
data

Unnamed: 0,Name,Age,Salary,Gender
0,Anil,22,25000,M
1,Sunil,24,35000,M
2,Ashok,26,45000,M
3,Seema,28,55000,F
4,Reema,30,65000,F


In [13]:
## Display the total (sum) of the salaries of all the employees

data["Salary"].sum()

225000

In [14]:
## Display the average (mean) salary of all the employees

data["Salary"].mean()

45000.0

In [15]:
## Display the minimum salary among all the employees

data["Salary"].min()

25000

In [17]:
## Display the maximum salary among all the employees

data["Salary"].max()

65000

In [18]:
## Display the median (middle value) of the employees' salaries

data["Salary"].median()

45000.0

In [19]:
## Display the total number of employees working in the organization.
## One row = one employee, so count the rows of the frame.
## (Series.count() skips NaN, so data["Salary"].count() would undercount
##  the employees whenever a salary is missing.)

len(data)

5

In [22]:
## Display summary statistics (count, mean, std, min, quartiles, max) of all the "numerical" columns of the data

data.describe()

Unnamed: 0,Age,Salary
count,5.0,5.0
mean,26.0,45000.0
std,3.162278,15811.388301
min,22.0,25000.0
25%,24.0,35000.0
50%,26.0,45000.0
75%,28.0,55000.0
max,30.0,65000.0


## How to change the order of columns in the data?

In [23]:
## Current column order, shown for comparison with the re-ordered frame
data

Unnamed: 0,Name,Age,Salary,Gender
0,Anil,22,25000,M
1,Sunil,24,35000,M
2,Ashok,26,45000,M
3,Seema,28,55000,F
4,Reema,30,65000,F


In [24]:
## Desired left-to-right arrangement of the columns
col_order = ["Salary", "Gender", "Name", "Age"]

## Keep every row, picking the columns in the new order
data = data.loc[:, col_order]
data

Unnamed: 0,Salary,Gender,Name,Age
0,25000,M,Anil,22
1,35000,M,Sunil,24
2,45000,M,Ashok,26
3,55000,F,Seema,28
4,65000,F,Reema,30


## value_counts()

In [25]:
## Re-display the (re-ordered) sample data used in the value_counts() examples
data

Unnamed: 0,Salary,Gender,Name,Age
0,25000,M,Anil,22
1,35000,M,Sunil,24
2,45000,M,Ashok,26
3,55000,F,Seema,28
4,65000,F,Reema,30


In [26]:
## Display the unique genders present in the data (returned as an array)

data["Gender"].unique()

array(['M', 'F'], dtype=object)

In [27]:
## Display the number of unique genders present in the data

data['Gender'].nunique()

2

In [28]:
## Display the count of each gender (how many rows hold each value)

data["Gender"].value_counts()

M    3
F    2
Name: Gender, dtype: int64

In [29]:
## Display the percentage share of each gender
## normalize=True returns fractions that sum to 1; scale them to percent

100 * data["Gender"].value_counts(normalize=True)

M    60.0
F    40.0
Name: Gender, dtype: float64

In [30]:
## Display the summary statistics (count, unique, top, freq) for object columns

data.describe(include = "O")   ## "O" selects the object-dtype columns

Unnamed: 0,Gender,Name
count,5,5
unique,2,5
top,M,Anil
freq,3,1


# Note:

- For the Name column the summary statistics are not meaningful: every name is unique, so they can be ignored.
- In general, a column in which every value is unique can be ignored when displaying summary statistics.

## For checking duplicate values

In [31]:
## Flag duplicated rows: True marks a row that repeats an earlier row in every column
data.duplicated()

0    False
1    False
2    False
3    False
4    False
dtype: bool

In [39]:
## Six records; the last one re-uses the name "Anesha" from the first record
## but has a different Age and Salary, so the rows themselves are not duplicates.
dic = {
    "Name" : ["Anesha", "Arya", "Ajit", "Kumar", "Santosh", "Anesha"],
    "Age" : [34, 57, 28, np.nan, 40, 49],    ## np.nan -> missing value
    "Gender" : ["F", "F", "M", "M", "M", "F"],
    "Salary" : [56000, np.nan, 23000, 45000, np.nan, 80000]   ## np.nan -> value missing for that person
}

## Only the last expression of a cell is displayed, so the stray bare `dic`
## that used to sit here had no effect and has been removed.
df = pd.DataFrame(dic)
df

Unnamed: 0,Name,Age,Gender,Salary
0,Anesha,34.0,F,56000.0
1,Arya,57.0,F,
2,Ajit,28.0,M,23000.0
3,Kumar,,M,45000.0
4,Santosh,40.0,M,
5,Anesha,49.0,F,80000.0


In [35]:
## Whole-row check: the two "Anesha" rows differ in Age and Salary, so no row is flagged
df.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
dtype: bool

In [36]:
## Summary of the object columns of df (count, unique, top, freq)
df.describe(include = "O")

Unnamed: 0,Name,Gender
count,6,6
unique,5,2
top,Anesha,F
freq,2,3


In [38]:
## Single-column check: only Name is compared, so the second "Anesha" is flagged
## even though the full row is not a duplicate
df['Name'].duplicated()

0    False
1    False
2    False
3    False
4    False
5     True
Name: Name, dtype: bool

# Note:

- When checking for duplicates, compare entire rows, i.e. use data.duplicated().
- Do not check each column separately for duplicates: a value repeated in a single column (such as the name "Anesha" above) does not make the whole row a duplicate.

- The original file on disk (Excel/CSV) is never modified by changes made in the Jupyter notebook.
- Changes made in the notebook affect only the data held in the notebook, unless they are explicitly written out to a file.


In [40]:
## The re-ordered sample data, as it currently exists in the notebook
data

Unnamed: 0,Salary,Gender,Name,Age
0,25000,M,Anil,22
1,35000,M,Sunil,24
2,45000,M,Ashok,26
3,55000,F,Seema,28
4,65000,F,Reema,30


In [41]:
## Save the re-ordered data to a new CSV file; the file it was loaded from is not touched.
## index=False keeps the 0..n-1 row labels out of the file; otherwise they are
## written as an extra unnamed column that comes back as "Unnamed: 0" on the next read_csv.
data.to_csv("Modified_data.csv", index=False)

## Applying the above concepts on titanic dataset

In [42]:
## NOTE: `data` is re-bound here: from this cell on it holds the Titanic dataset,
## not the sample data used in the cells above.
data = pd.read_csv("titanicdata.csv")
data

Unnamed: 0.1,Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


### Display the total rows and total columns of the data

In [43]:
## (rows, columns) of the Titanic data
data.shape

(891, 16)

## Note:

- Each row in the data represents one passenger's details.
- Since the data has 891 rows, it contains the details of 891 passengers.

In [45]:
## How many passengers survived and how many did not survive?

data["survived"].value_counts()

## 0 - Not survived
## 1 - Survived

0    549
1    342
Name: survived, dtype: int64

In [46]:
## Same breakdown as a percentage of all passengers (0 - Not survived, 1 - Survived)
100 * data["survived"].value_counts(normalize=True)

0    61.616162
1    38.383838
Name: survived, dtype: float64

In [47]:
## How many male passengers were travelling and how many female passengers?

data['sex'].value_counts()

male      577
female    314
Name: sex, dtype: int64

In [48]:
## Share of male and female passengers, in percent
100 * data['sex'].value_counts(normalize=True)

male      64.758698
female    35.241302
Name: sex, dtype: float64

In [50]:
## Display the summary statistics for the fare column

data["fare"].describe()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: fare, dtype: float64

# Note:
- min fare is 0
- max fare is 512.33
- avg fare is 32.2


In [51]:
## Display the number of passengers travelling in each class

data["class"].value_counts()

Third     491
First     216
Second    184
Name: class, dtype: int64

In [52]:
## Share of passengers in each class, in percent
100 * data["class"].value_counts(normalize=True)

Third     55.106622
First     24.242424
Second    20.650954
Name: class, dtype: float64

In [53]:
## Summary of the Titanic data; a Non-Null Count below 891 means the column has
## missing values (here: age, embarked, deck and embark_town)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   891 non-null    int64  
 1   survived     891 non-null    int64  
 2   pclass       891 non-null    int64  
 3   sex          891 non-null    object 
 4   age          714 non-null    float64
 5   sibsp        891 non-null    int64  
 6   parch        891 non-null    int64  
 7   fare         891 non-null    float64
 8   embarked     889 non-null    object 
 9   class        891 non-null    object 
 10  who          891 non-null    object 
 11  adult_male   891 non-null    bool   
 12  deck         203 non-null    object 
 13  embark_town  889 non-null    object 
 14  alive        891 non-null    object 
 15  alone        891 non-null    bool   
dtypes: bool(2), float64(2), int64(5), object(7)
memory usage: 99.3+ KB


## Observation:
- By inspecting the file in Excel, we can see that "Unnamed: 0" is just a column of sequential row numbers, which is of no use to our analysis. Hence, we delete this column from the data.

In [55]:
## Remove the redundant "Unnamed: 0" column (it only repeats the row numbers).
## errors="ignore" makes this cell safe to re-run: once the column is gone,
## a second drop would otherwise raise a KeyError.
data = data.drop(columns="Unnamed: 0", errors="ignore")
data

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [56]:
## Display the count of passengers boarding from different locations
## value_counts() leaves out missing values by default, so the 2 passengers with
## no embark_town are not counted (644 + 168 + 77 = 889 of the 891 rows)

data["embark_town"].value_counts()

Southampton    644
Cherbourg      168
Queenstown      77
Name: embark_town, dtype: int64

In [57]:
## Share of passengers per boarding town, in percent
## (computed over the 889 passengers whose embark_town is known)
100 * data["embark_town"].value_counts(normalize=True)

Southampton    72.440945
Cherbourg      18.897638
Queenstown      8.661417
Name: embark_town, dtype: float64