# Pandas Tutorial

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Show the installed pandas version; the outputs in this notebook were produced with it.
pd.__version__

'2.1.1'

In [3]:
# Integers 1..6 used as sample data for the first Series.
# NOTE(review): the name `list` shadows the built-in type for the rest of the
# kernel session; it is kept because the next cell reads it under this name.
list = [*range(1, 7)]

In [4]:
# Wrap the Python list from the previous cell in a Series; with no index given,
# pandas assigns the default labels 0..n-1.
series = pd.Series(data=list)
# One print call, newline-separated: the Series repr followed by its type.
print(series, type(series), sep="\n")

0    1
1    2
2    3
3    4
4    5
5    6
dtype: int64
<class 'pandas.core.series.Series'>


In [5]:
# An empty Series has no values to infer a dtype from, so state it explicitly.
# The implicit default differed between pandas major versions (float64 with a
# deprecation warning before 2.0, object from 2.0 on); dtype=object pins the
# result shown below on every version.
empty = pd.Series([], dtype=object)
print(empty)

Series([], dtype: object)


In [6]:
# Custom index: the labels 10..14 replace the default 0..n-1 positions, and the
# Series itself is given a name that appears in its repr.
a = pd.Series(data=[*"pqrst"], index=[*range(10, 15)], name="Alphabets")
a

10    p
11    q
12    r
13    s
14    t
Name: Alphabets, dtype: object

In [7]:
# A Series built from a single scalar: one element under the default label 0.
scaler_series = pd.Series(data=0.5)
scaler_series

0    0.5
dtype: float64

In [8]:
# With an explicit index the scalar is broadcast: every label 1, 2, 3 gets 0.4.
scaler_series = pd.Series(data=0.4, index=[*range(1, 4)])
scaler_series

1    0.4
2    0.4
3    0.4
dtype: float64

**Pandas Series with Python Dictionary**

In [9]:
# Series from a dictionary: the keys 'p'..'t' become the index labels and the
# values 1..5 become the data.
d_series = pd.Series(dict(zip("pqrst", range(1, 6))))
d_series

p    1
q    2
r    3
s    4
t    5
dtype: int64

In [10]:
# First element by position. On this string-labelled Series a plain
# d_series[0] makes pandas treat the integer as a position, which pandas 2.1
# deprecates with a FutureWarning; .iloc is the explicit positional accessor
# and returns the same value without the warning.
d_series.iloc[0]

  d_series[0]


1

In [11]:
# First three elements by position (end of the slice is exclusive).
d_series.iloc[:3]

p    1
q    2
r    3
dtype: int64

In [12]:
# Largest value in the Series. Series.max() is the vectorized pandas reduction;
# the built-in max() would instead iterate over the elements one by one in Python.
d_series.max()

5

In [13]:
# Each dictionary value is itself a list, so every element of the Series holds
# a whole Python list and the dtype is object.
d_s2 = pd.Series(
    {
        'p': [1, 5, 6],
        'q': [4, 1, 3],
        'r': [8, 2, 4],
        's': [4, 4, 5],
        't': [5, 1, 2],
    }
)
d_s2

p    [1, 5, 6]
q    [4, 1, 3]
r    [8, 2, 4]
s    [4, 4, 5]
t    [5, 1, 2]
dtype: object

# Pandas DataFrame

In [14]:
# A DataFrame created without data has no rows and no columns.
df = pd.DataFrame(data=None)
print(df)

Empty DataFrame
Columns: []
Index: []


# DataFrame Using List

In [15]:
# A flat list becomes a single column (labelled 0) with one row per element.
lst = [*range(1, 6)]
df = pd.DataFrame(data=lst)
print(df)

   0
0  1
1  2
2  3
3  4
4  5


In [16]:
# DataFrame from a list of lists: every inner list becomes one row, and the
# columns receive the default labels 0..4.
lst = [
    [*range(1, 6)],
    [*range(11, 16)],
]
df = pd.DataFrame(data=lst)
print(df)

    0   1   2   3   4
0   1   2   3   4   5
1  11  12  13  14  15


In [17]:
# DataFrame from a list of dictionaries: each dictionary is one row and its
# keys become the column names. The second row has no 'b' key, so that cell is
# filled with NaN and column 'b' is shown as float.
a = [
    {'a': 10, 'b': 20, 'c': 30},
    {'a': 100, 'c': 300},
]

df = pd.DataFrame(data=a)
print(df)

     a     b    c
0   10  20.0   30
1  100   NaN  300


In [18]:
# DataFrame from a dictionary of Series: every key becomes a column name and
# the Series (all on the default index 0..4) supply the column values.
b = dict(
    RollNo=pd.Series([*range(1, 6)]),
    DSA=pd.Series([9.0, 9.3, 9.1, 8.7, 8.5]),
    COA=pd.Series([8.6, 9.1, 8.9, 9.2, 7.8]),
)
df = pd.DataFrame(data=b)
print(df)

   RollNo  DSA  COA
0       1  9.0  8.6
1       2  9.3  9.1
2       3  9.1  8.9
3       4  8.7  9.2
4       5  8.5  7.8


# Reading CSV as DataFrames

In [19]:
import pandas as pd

In [20]:

# Load the employee details CSV into a DataFrame and display it.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook only runs where that exact folder exists; consider a path relative
# to the notebook or a single configurable data directory.
df = pd.read_csv(r'D:\00_WorkSpace\pythonCodes\NUMPY\emp_details.csv')
df

Unnamed: 0,EmpID,EmpName,Salary,Bonus
0,E001,A,70176,7017.6
1,E002,B,93978,9397.8
2,E003,C,66099,6609.9
3,E004,D,81013,8101.3
4,E005,E,89413,8941.3
5,E006,F,68737,6873.7
6,E007,G,98467,9846.7
7,E008,H,82061,8206.1
8,E009,I,74138,7413.8
9,E010,J,71910,7191.0


In [21]:
# read_csv returns a pandas DataFrame, as the printed type confirms.
print(type(df))

<class 'pandas.core.frame.DataFrame'>


# DataFrame Functions

In [22]:
# Re-reads the same employee CSV that was already loaded into `df` earlier in
# the notebook, so the cells in this section start from a fresh copy.
# NOTE(review): absolute, machine-specific Windows path — not portable.
df = pd.read_csv(r'D:\00_WorkSpace\pythonCodes\NUMPY\emp_details.csv')

In [23]:
# Column labels of the DataFrame, returned as a pandas Index.
df.columns

Index(['EmpID', 'EmpName', 'Salary', 'Bonus'], dtype='object')

In [24]:
# Dimensions of the DataFrame as a (rows, columns) tuple.
df.shape

(20, 4)

In [25]:
# Total number of cells, i.e. rows x columns (20 x 4 = 80 for this file).
df.size

80

In [26]:
df.head()  # first 5 rows (the default when no count is passed)

Unnamed: 0,EmpID,EmpName,Salary,Bonus
0,E001,A,70176,7017.6
1,E002,B,93978,9397.8
2,E003,C,66099,6609.9
3,E004,D,81013,8101.3
4,E005,E,89413,8941.3


In [27]:
df.head(4)  # first 4 rows: the argument sets how many rows are returned

Unnamed: 0,EmpID,EmpName,Salary,Bonus
0,E001,A,70176,7017.6
1,E002,B,93978,9397.8
2,E003,C,66099,6609.9
3,E004,D,81013,8101.3


In [28]:
df.tail()  # last 5 rows (the default when no count is passed)

Unnamed: 0,EmpID,EmpName,Salary,Bonus
15,E016,P,60574,6057.4
16,E017,Q,65780,6578.0
17,E018,R,88676,8867.6
18,E019,S,90968,9096.8
19,E020,T,65374,6537.4


In [29]:
df.tail(3)  # last 3 rows

Unnamed: 0,EmpID,EmpName,Salary,Bonus
17,E018,R,88676,8867.6
18,E019,S,90968,9096.8
19,E020,T,65374,6537.4


In [30]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns only; here that is Salary and Bonus.
df.describe()

Unnamed: 0,Salary,Bonus
count,20.0,20.0
mean,76963.9,7696.39
std,11124.257245,1112.425724
min,60574.0,6057.4
25%,66856.5,6685.65
50%,75817.5,7581.75
75%,85037.0,8503.7
max,98467.0,9846.7


In [31]:
# Prints a concise overview of the DataFrame: index range, each column's
# non-null count and dtype, and the memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   EmpID    20 non-null     object 
 1   EmpName  20 non-null     object 
 2   Salary   20 non-null     int64  
 3   Bonus    20 non-null     float64
dtypes: float64(1), int64(1), object(2)
memory usage: 768.0+ bytes


In [33]:
# Load the students' performance data set; it contains missing values and is
# used for the null-handling examples that follow.
# NOTE(review): absolute, machine-specific Windows path — not portable.
df2 = pd.read_csv(r"D:\00_WorkSpace\pythonCodes\NUMPY\StudentsPerformance.csv")
df2.head()

Unnamed: 0,gender,math score,reading score,writing score
0,female,72.0,72.0,74.0
1,female,69.0,90.0,88.0
2,female,90.0,95.0,93.0
3,male,47.0,57.0,44.0
4,male,76.0,78.0,75.0


In [35]:
# Boolean mask with the same shape as df2: True wherever a value is missing.
df2.isnull()

Unnamed: 0,gender,math score,reading score,writing score
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
5,False,False,False,False
6,False,False,True,False
7,False,False,False,False
8,False,True,False,False
9,False,False,False,True


In [37]:
# Number of missing values per column (each True in the mask counts as 1).
df2.isnull().sum()

gender           0
math score       7
reading score    4
writing score    7
dtype: int64

In [39]:
# Total number of missing values in the whole DataFrame: the per-column counts
# summed once more.
df2.isnull().sum().sum()

18

**Dropping Rows with NaN values**

In [42]:
# Shape before dropping anything, for comparison with the cleaned frame below.
df2.shape

(55, 4)

In [43]:
# Keep only the rows without any missing value. axis=0 (rows) is the default
# and is spelled out here; df2 itself is left unchanged.
df3 = df2.dropna(axis=0)

In [44]:
# Shape after dropping: fewer rows than df2, same number of columns.
df3.shape

(39, 4)

**Dropping Columns with NaN values**

``` df3_1 = df2.dropna(axis= 1) ``` <br>
*This drops every column that contains at least one NULL (NaN) value.*

In [45]:
df2.dropna(how="any")  # drop a row if at least one of its values is null

Unnamed: 0,gender,math score,reading score,writing score
0,female,72.0,72.0,74.0
1,female,69.0,90.0,88.0
2,female,90.0,95.0,93.0
3,male,47.0,57.0,44.0
4,male,76.0,78.0,75.0
5,female,71.0,83.0,78.0
7,male,40.0,43.0,39.0
10,male,58.0,54.0,52.0
11,male,40.0,52.0,43.0
12,female,65.0,81.0,73.0


In [46]:
df2.dropna(how = "all")  # drop a row only if every value in it is null; partially filled rows are kept

Unnamed: 0,gender,math score,reading score,writing score
0,female,72.0,72.0,74.0
1,female,69.0,90.0,88.0
2,female,90.0,95.0,93.0
3,male,47.0,57.0,44.0
4,male,76.0,78.0,75.0
5,female,71.0,83.0,78.0
6,female,88.0,,92.0
7,male,40.0,43.0,39.0
8,male,,64.0,67.0
9,female,38.0,60.0,


*The same options apply to columns:* <br>
```df2.dropna(axis = 1, how = "any") ``` <br>
```df2.dropna(axis = 1, how = "all") ``` <br>