# Pandas

- Pandas is an open-source library designed to make working with relational or labeled data easy and intuitive.
- It provides various data structures and operations for manipulating numerical data and time series. 
- This library is built on top of the NumPy library. 
- Pandas is fast and it has high performance & productivity for users.
- Pandas provides two main data structures for manipulating data:
   - Series
   - DataFrame

In [1]:
#Importing necessary libraries

In [2]:
import pandas as pd

import warnings

# NOTE(review): with no category or module argument this silences every warning,
# including pandas deprecation notices; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Series
- A Series is a one-dimensional labeled array capable of holding data of any type (integer, string, float, Python objects, etc.).

In [3]:
#Series creation.

In [4]:
# Plain Python list holding the integers 1 through 5
lst = list(range(1, 6))
type(lst)

list

In [5]:
# Wrap the list in a Series; no index is given, so pandas assigns 0..n-1
s = pd.Series(data=lst)
type(s)

pandas.core.series.Series

In [6]:
s

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [7]:
# A string mixed with integers: the Series falls back to the generic object dtype
mixed_values = ['A', 10, 2500]
s = pd.Series(data=mixed_values)
s

0       A
1      10
2    2500
dtype: object

In [8]:
# Dictionary of integer keys mapped to single-letter strings
d = {
    1: 'A',
    2: 'B',
    3: 'C',
}
type(d)

dict

In [9]:
# Build a Series from the dict: its keys become the index labels
s = pd.Series(data=d)
s

1    A
2    B
3    C
dtype: object

In [10]:
# Series with explicit string labels and a name
s = pd.Series(
    data=[10, 20, 30],
    index=['a', 'b', 'c'],
    name='My_series',
)
s

a    10
b    20
c    30
Name: My_series, dtype: int64

In [11]:
#Accessing elements from a series by position and by index label.

In [12]:
# Positional lookup of the first element. The index of `s` holds string labels,
# so use .iloc instead of a bare integer key inside [] (that positional
# fallback is deprecated in pandas).
s.iloc[0]

10

In [13]:
# Label-based lookup of the element stored under 'c'
s.loc['c']

30

In [14]:
# Positional slice: the first three elements (stop position excluded)
s.iloc[:3]

a    10
b    20
c    30
Name: My_series, dtype: int64

In [15]:
# First element by position; .iloc avoids the deprecated integer-key fallback
# of [] on a Series whose index holds string labels.
s.iloc[0]

10

In [16]:
# Same element as the positional lookup, fetched through its label 'a'
s.loc['a']

10

# DataFrame
- A Pandas DataFrame is a two-dimensional data structure, like a two-dimensional array or a table with rows and columns.

**Ways to create pandas DataFrame.**
1. By passing a nested list of rows together with a list of column names.
2. By passing a dictionary that maps each column name to its list of values.

In [17]:
# Column names first, then one inner list per row in the same column order
col = ['Subject', 'Marks', 'City']

lst = [
    ['Python', 90, 'Pune'],
    ['ML', 100, 'Mumbai'],
    ['DL', 95, 'HYD'],
]

In [18]:
# Approach 1: each inner list of `lst` becomes a row, labelled by the names in `col`
df = pd.DataFrame(lst, columns=col)

In [19]:
df

Unnamed: 0,Subject,Marks,City
0,Python,90,Pune
1,ML,100,Mumbai
2,DL,95,HYD


In [20]:
#Approach 2:

In [21]:
# Build the frame from a dict that maps each column name to its values
columns_by_name = {
    'Subject': ['Python', 'ML', 'DL'],
    'Marks': [90, 70, 100],
    'City Name': ['Pune', 'Mumbai', 'Hyd'],
}
df = pd.DataFrame(data=columns_by_name)

In [22]:
df

Unnamed: 0,Subject,Marks,City Name
0,Python,90,Pune
1,ML,70,Mumbai
2,DL,100,Hyd


In [23]:
#Check number of rows and columns in dataframe

In [24]:
df.shape

(3, 3)

In [25]:
#Dimension of the dataframe

In [26]:
df.ndim

2

In [27]:
#Data type of each column

In [28]:
df.dtypes

Subject      object
Marks         int64
City Name    object
dtype: object

In [29]:
# Attribute-style access returns the 'Subject' column as a Series
df.Subject

0    Python
1        ML
2        DL
Name: Subject, dtype: object

In [30]:
# Double brackets pass a list of column names, so the result stays a DataFrame
df[['Subject']]

Unnamed: 0,Subject
0,Python
1,ML
2,DL


In [31]:
df

Unnamed: 0,Subject,Marks,City Name
0,Python,90,Pune
1,ML,70,Mumbai
2,DL,100,Hyd


In [32]:
df['City Name']

0      Pune
1    Mumbai
2       Hyd
Name: City Name, dtype: object

In [33]:
#Descriptive stats for df

In [34]:
df.describe()

Unnamed: 0,Marks
count,3.0
mean,86.666667
std,15.275252
min,70.0
25%,80.0
50%,90.0
75%,95.0
max,100.0


In [35]:
# Summarise only the object-dtype columns (count, unique, top, freq)
df.describe(include='object')

Unnamed: 0,Subject,City Name
count,3,3
unique,3,3
top,Python,Pune
freq,1,1


In [36]:
df.describe(include='all')

Unnamed: 0,Subject,Marks,City Name
count,3,3.0,3
unique,3,,3
top,Python,,Pune
freq,1,,1
mean,,86.666667,
std,,15.275252,
min,,70.0,
25%,,80.0,
50%,,90.0,
75%,,95.0,


In [37]:
#Getting information about the dataframe.

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Subject    3 non-null      object
 1   Marks      3 non-null      int64 
 2   City Name  3 non-null      object
dtypes: int64(1), object(2)
memory usage: 200.0+ bytes


In [39]:
#Checking null values

In [40]:
# Count the missing values in each column
df.isna().sum()

Subject      0
Marks        0
City Name    0
dtype: int64

# Importing External Dataset 

In [41]:
# Load the CSV file. The path is relative, so 'Salaries.csv' must be present in
# the notebook's working directory. This rebinds `df`, replacing the small
# frame built earlier in the notebook.
df = pd.read_csv('Salaries.csv')

In [42]:
df

Unnamed: 0,rank,discipline,phd,service,sex,salary
0,Prof,B,56,49,Male,186960
1,Prof,A,12,6,Male,93000
2,Prof,A,23,20,Male,110515
3,Prof,A,40,31,Male,131205
4,Prof,B,20,18,Male,104800
...,...,...,...,...,...,...
73,Prof,B,18,10,Female,105450
74,AssocProf,B,19,6,Female,104542
75,Prof,B,17,17,Female,124312
76,Prof,A,28,14,Female,109954


In [43]:
#Reading first 5 records

In [44]:
df.head()

Unnamed: 0,rank,discipline,phd,service,sex,salary
0,Prof,B,56,49,Male,186960
1,Prof,A,12,6,Male,93000
2,Prof,A,23,20,Male,110515
3,Prof,A,40,31,Male,131205
4,Prof,B,20,18,Male,104800


In [45]:
#Reading last 5 records

In [46]:
df.tail()

Unnamed: 0,rank,discipline,phd,service,sex,salary
73,Prof,B,18,10,Female,105450
74,AssocProf,B,19,6,Female,104542
75,Prof,B,17,17,Female,124312
76,Prof,A,28,14,Female,109954
77,Prof,A,23,15,Female,109646


In [47]:
df.head(10)

Unnamed: 0,rank,discipline,phd,service,sex,salary
0,Prof,B,56,49,Male,186960
1,Prof,A,12,6,Male,93000
2,Prof,A,23,20,Male,110515
3,Prof,A,40,31,Male,131205
4,Prof,B,20,18,Male,104800
5,Prof,A,20,20,Male,122400
6,AssocProf,A,20,17,Male,81285
7,Prof,A,18,18,Male,126300
8,Prof,A,29,19,Male,94350
9,Prof,A,51,51,Male,57800


In [48]:
#Reading all the records.

In [49]:
# Remove the row cap so complete frames are rendered. The option is global and
# applies to every output displayed after this cell.
pd.options.display.max_rows = None

In [50]:
df

Unnamed: 0,rank,discipline,phd,service,sex,salary
0,Prof,B,56,49,Male,186960
1,Prof,A,12,6,Male,93000
2,Prof,A,23,20,Male,110515
3,Prof,A,40,31,Male,131205
4,Prof,B,20,18,Male,104800
5,Prof,A,20,20,Male,122400
6,AssocProf,A,20,17,Male,81285
7,Prof,A,18,18,Male,126300
8,Prof,A,29,19,Male,94350
9,Prof,A,51,51,Male,57800


In [51]:
#Describe the data

In [52]:
df.describe()

Unnamed: 0,phd,service,salary
count,78.0,78.0,78.0
mean,19.705128,15.051282,108023.782051
std,12.498425,12.139768,28293.661022
min,1.0,0.0,57800.0
25%,10.25,5.25,88612.5
50%,18.5,14.5,104671.0
75%,27.75,20.75,126774.75
max,56.0,51.0,186960.0


In [53]:
# Summary of every column, transposed so each original column becomes a row
df.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
rank,78.0,3.0,Prof,46.0,,,,,,,
discipline,78.0,2.0,B,42.0,,,,,,,
phd,78.0,,,,19.705128,12.498425,1.0,10.25,18.5,27.75,56.0
service,78.0,,,,15.051282,12.139768,0.0,5.25,14.5,20.75,51.0
sex,78.0,2.0,Male,39.0,,,,,,,
salary,78.0,,,,108023.782051,28293.661022,57800.0,88612.5,104671.0,126774.75,186960.0


In [54]:
#Extracting only column names

In [55]:
df.columns

Index(['rank', 'discipline', 'phd', 'service', 'sex', 'salary'], dtype='object')

In [56]:
#Selecting single and multiple columns

In [57]:
df.phd

0     56
1     12
2     23
3     40
4     20
5     20
6     20
7     18
8     29
9     51
10    39
11    23
12     1
13    35
14    25
15    17
16     8
17     4
18    19
19    29
20     4
21    33
22     4
23     2
24    30
25    35
26    38
27    45
28     7
29    21
30     9
31    22
32    27
33    18
34    12
35    28
36    45
37    20
38     4
39    18
40    39
41    13
42     4
43     5
44    23
45    25
46    11
47    11
48    17
49    17
50    10
51    20
52    12
53     5
54    25
55     2
56    10
57     3
58    36
59    12
60     3
61    13
62    14
63    29
64    26
65    36
66     7
67    17
68     4
69    28
70     8
71    12
72    24
73    18
74    19
75    17
76    28
77    23
Name: phd, dtype: int64

In [58]:
# Passing a list of names selects several columns at once and returns a DataFrame
selected_columns = ['phd', 'salary']
df[selected_columns]

Unnamed: 0,phd,salary
0,56,186960
1,12,93000
2,23,110515
3,40,131205
4,20,104800
5,20,122400
6,20,81285
7,18,126300
8,29,94350
9,51,57800


In [59]:
#Selecting rows and columns with loc and iloc

In [60]:
#loc indexer (label-based selection)

In [61]:
df.head(2)

Unnamed: 0,rank,discipline,phd,service,sex,salary
0,Prof,B,56,49,Male,186960
1,Prof,A,12,6,Male,93000


In [62]:
# .loc slices by label and includes both endpoints: rows 10 through 15 of 'phd'
df.loc[10:15,'phd']

10    39
11    23
12     1
13    35
14    25
15    17
Name: phd, dtype: int64

In [63]:
# A list picks individual row labels (10 and 15 only); the column label slice
# includes both 'phd' and 'salary' and everything between them.
df.loc[[10,15], 'phd':'salary']

Unnamed: 0,phd,service,sex,salary
10,39,33,Male,128250
15,17,3,Male,150480


In [64]:
df.head(2)

Unnamed: 0,rank,discipline,phd,service,sex,salary
0,Prof,B,56,49,Male,186960
1,Prof,A,12,6,Male,93000


In [65]:
# .iloc is positional and excludes the stop position: rows 50-55 of the column
# at position 1 ('discipline').
df.iloc[50:56,1]

50    B
51    B
52    A
53    A
54    A
55    A
Name: discipline, dtype: object

In [66]:
# Positional slices on both axes: rows 50-55 and the first three columns
# (stop positions are excluded).
df.iloc[50:56,0:3]

Unnamed: 0,rank,discipline,phd
50,AsstProf,B,10
51,Prof,B,20
52,Prof,A,12
53,AsstProf,A,5
54,AssocProf,A,25
55,AsstProf,A,2


In [67]:
df.head(10)

Unnamed: 0,rank,discipline,phd,service,sex,salary
0,Prof,B,56,49,Male,186960
1,Prof,A,12,6,Male,93000
2,Prof,A,23,20,Male,110515
3,Prof,A,40,31,Male,131205
4,Prof,B,20,18,Male,104800
5,Prof,A,20,20,Male,122400
6,AssocProf,A,20,17,Male,81285
7,Prof,A,18,18,Male,126300
8,Prof,A,29,19,Male,94350
9,Prof,A,51,51,Male,57800


In [68]:
#Value counts

In [69]:
# Frequency of each distinct value in the 'rank' column. Bracket access is
# required here: the attribute df.rank refers to the DataFrame.rank method.
df['rank'].value_counts()

Prof         46
AsstProf     19
AssocProf    13
Name: rank, dtype: int64

In [70]:
df['discipline'].value_counts()

B    42
A    36
Name: discipline, dtype: int64

In [71]:
df['sex'].value_counts()

Male      39
Female    39
Name: sex, dtype: int64

In [72]:
#Renaming column names

In [73]:
# rename() returns a new frame. The result is only displayed here and not
# assigned, so `df` itself keeps the column name 'rank'.
df.rename(columns = {'rank':'Rank'})

Unnamed: 0,Rank,discipline,phd,service,sex,salary
0,Prof,B,56,49,Male,186960
1,Prof,A,12,6,Male,93000
2,Prof,A,23,20,Male,110515
3,Prof,A,40,31,Male,131205
4,Prof,B,20,18,Male,104800
5,Prof,A,20,20,Male,122400
6,AssocProf,A,20,17,Male,81285
7,Prof,A,18,18,Male,126300
8,Prof,A,29,19,Male,94350
9,Prof,A,51,51,Male,57800


In [74]:
# Rename several columns at once. rename() returns a new frame, so rebind `df`
# instead of mutating it with inplace=True. Re-running the cell is harmless
# because labels missing from the frame are ignored by default.
df = df.rename(columns={'discipline': 'Discipline', 'phd': 'PHD'})

In [75]:
df

Unnamed: 0,rank,Discipline,PHD,service,sex,salary
0,Prof,B,56,49,Male,186960
1,Prof,A,12,6,Male,93000
2,Prof,A,23,20,Male,110515
3,Prof,A,40,31,Male,131205
4,Prof,B,20,18,Male,104800
5,Prof,A,20,20,Male,122400
6,AssocProf,A,20,17,Male,81285
7,Prof,A,18,18,Male,126300
8,Prof,A,29,19,Male,94350
9,Prof,A,51,51,Male,57800
