### Pandas

- Library to work on tabular data.
- DataFrame - a special object that stores data in tabular form (rows and columns).

In [1]:
!pip install pandas



In [2]:
import numpy as np
import pandas as pd

In [7]:
# Raw input for a DataFrame: one array of 5 random integer marks per subject.
# np.random.randint draws from NumPy's legacy global generator, so seed it first;
# otherwise every Restart & Run All produces different marks.
SEED = 42
np.random.seed(SEED)
user_data = {
    "MarksA": np.random.randint(1, 100, 5),   # 5 integers in [1, 100)
    "MarksB": np.random.randint(50, 100, 5),  # 5 integers in [50, 100)
    "MarksC": np.random.randint(1, 100, 5),   # 5 integers in [1, 100)
}

In [8]:
# randint(low, high, size): returns `size` integers drawn from [low, high) --
# the lower bound is included, the upper bound itself is never returned.
np.random.randint(1,100,10)

array([71, 98, 74, 64, 54, 17, 78, 21, 25, 23])

In [9]:
# Inspect the dict: one NumPy array of marks per key.
user_data

{'MarksA': array([44, 67, 68, 36, 65]),
 'MarksB': array([65, 96, 71, 57, 87]),
 'MarksC': array([10, 99, 65, 29, 40])}

In [10]:
# Wrap the dict of arrays in a DataFrame (each key becomes a column) and print its text form.
df = pd.DataFrame(data=user_data)
print(df)

   MarksA  MarksB  MarksC
0      44      65      10
1      67      96      99
2      68      71      65
3      36      57      29
4      65      87      40


In [12]:
# Show only the first 3 rows.
df.head(3)

Unnamed: 0,MarksA,MarksB,MarksC
0,44,65,10
1,67,96,99
2,68,71,65


In [14]:
# List the column labels (headers) of the DataFrame.
df.columns

Index(['MarksA', 'MarksB', 'MarksC'], dtype='object')

In [15]:
# Rebuild the frame from the same dict, storing every column as 32-bit floats.
df = pd.DataFrame(data=user_data, dtype=np.float32)
print(df)

   MarksA  MarksB  MarksC
0    44.0    65.0    10.0
1    67.0    96.0    99.0
2    68.0    71.0    65.0
3    36.0    57.0    29.0
4    65.0    87.0    40.0


In [16]:
# First 3 rows of the float32 frame.
df.head(3)

Unnamed: 0,MarksA,MarksB,MarksC
0,44.0,65.0,10.0
1,67.0,96.0,99.0
2,68.0,71.0,65.0


In [17]:
# Creating a CSV from Data Frame

In [18]:
# Write the frame to marks.csv. With the defaults the row index is written too, as an
# unnamed first column; it comes back as 'Unnamed: 0' when the file is read.
df.to_csv('marks.csv')

In [19]:
# Read the CSV back. The saved index column is loaded as an ordinary column named 'Unnamed: 0'.
my_data = pd.read_csv('marks.csv')

In [20]:
# Display the frame as read from marks.csv.
my_data

Unnamed: 0.1,Unnamed: 0,MarksA,MarksB,MarksC
0,0,44.0,65.0,10.0
1,1,67.0,96.0,99.0
2,2,68.0,71.0,65.0
3,3,36.0,57.0,29.0
4,4,65.0,87.0,40.0


In [21]:
# NOTE: index_col=False does not remove the saved index column -- the output below still
# contains 'Unnamed: 0'. It only tells pandas not to use the first column as the index.
my_data = pd.read_csv('marks.csv', index_col = False)

In [23]:
# Display the result: the 'Unnamed: 0' column is still present.
my_data

Unnamed: 0.1,Unnamed: 0,MarksA,MarksB,MarksC
0,0,44.0,65.0,10.0
1,1,67.0,96.0,99.0
2,2,68.0,71.0,65.0
3,3,36.0,57.0,29.0
4,4,65.0,87.0,40.0


In [24]:
# Drop the saved index column that read_csv loaded as 'Unnamed: 0'.
# errors='ignore' keeps this cell re-runnable: a second run finds no such column
# and is a no-op instead of raising KeyError.
my_data = my_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [25]:
# Display the frame after dropping 'Unnamed: 0'; only the three marks columns remain.
my_data

Unnamed: 0,MarksA,MarksB,MarksC
0,44.0,65.0,10.0
1,67.0,96.0,99.0
2,68.0,71.0,65.0
3,36.0,57.0,29.0
4,65.0,87.0,40.0


In [26]:
# How to create, read and write a data frame.

### Pandas Basics - 2

In [27]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
my_data.describe()

Unnamed: 0,MarksA,MarksB,MarksC
count,5.0,5.0,5.0
mean,56.0,75.2,48.6
std,14.916434,16.006249,34.486229
min,36.0,57.0,10.0
25%,44.0,65.0,29.0
50%,65.0,71.0,40.0
75%,67.0,87.0,65.0
max,68.0,96.0,99.0


In [31]:
# First 2 rows.
my_data.head(2)

Unnamed: 0,MarksA,MarksB,MarksC
0,44.0,65.0,10.0
1,67.0,96.0,99.0


In [32]:
# Last 2 rows.
my_data.tail(2)

Unnamed: 0,MarksA,MarksB,MarksC
3,36.0,57.0,29.0
4,65.0,87.0,40.0


In [33]:
# Accessing a particular element: the row at position 3 (zero-based, i.e. the fourth row) and the MarksB column

In [34]:
# Row access
# iloc selects by integer position (zero-based), so iloc[3] is the fourth row.
df.iloc[3]

MarksA    36.0
MarksB    57.0
MarksC    29.0
Name: 3, dtype: float32

In [35]:
# Row & column: the single value at row position 3, column position 1 (MarksB).
df.iloc[3,1]

57.0

In [38]:
# Look up the integer positions of MarksB and MarksC so they can be used with iloc.
idx = [df.columns.get_loc(label) for label in ('MarksB', 'MarksC')]
print(idx)  # column positions 1 and 2

[1, 2]


In [39]:
# Row at position 3 (the fourth row), restricted to the column positions in idx (1 and 2).
df.iloc[3,idx]

MarksB    57.0
MarksC    29.0
Name: 3, dtype: float32

In [40]:
# First 3 rows (positions 0-2), restricted to the column positions stored in idx.
df.iloc[0:3, idx]

Unnamed: 0,MarksB,MarksC
0,65.0,10.0
1,96.0,99.0
2,71.0,65.0


In [41]:
# Same selection with the column positions written out literally:
# first 3 rows, columns at positions 1 and 2.
# (The earlier `df.iloc[:3,idx]` line in this cell was removed: its result was
# discarded, since only the last expression of a cell is displayed.)
df.iloc[:3, [1, 2]]

Unnamed: 0,MarksB,MarksC
0,65.0,10.0
1,96.0,99.0
2,71.0,65.0


In [42]:
## Sort your dataframe

In [43]:
# Display the unsorted frame.
my_data

Unnamed: 0,MarksA,MarksB,MarksC
0,44.0,65.0,10.0
1,67.0,96.0,99.0
2,68.0,71.0,65.0
3,36.0,57.0,29.0
4,65.0,87.0,40.0


In [44]:
# Rows ordered by MarksA, smallest first (ascending is the default). Returns a new frame.
my_data.sort_values("MarksA")

Unnamed: 0,MarksA,MarksB,MarksC
3,36.0,57.0,29.0
0,44.0,65.0,10.0
4,65.0,87.0,40.0
1,67.0,96.0,99.0
2,68.0,71.0,65.0


In [45]:
# Rows ordered by MarksA, largest first.
my_data.sort_values("MarksA", ascending=False)

Unnamed: 0,MarksA,MarksB,MarksC
2,68.0,71.0,65.0
1,67.0,96.0,99.0
4,65.0,87.0,40.0
0,44.0,65.0,10.0
3,36.0,57.0,29.0


In [46]:
# Descending sort on MarksC; MarksA (also descending) breaks ties between equal MarksC values.
my_data.sort_values(["MarksC", "MarksA"], ascending=[False, False])

Unnamed: 0,MarksA,MarksB,MarksC
1,67.0,96.0,99.0
2,68.0,71.0,65.0
4,65.0,87.0,40.0
3,36.0,57.0,29.0
0,44.0,65.0,10.0


In [47]:
# Convert the DataFrame to a NumPy array. to_numpy() is the accessor the pandas
# documentation recommends over the older .values attribute.
data_array = my_data.to_numpy()

In [49]:
# Confirm the converted object is a NumPy ndarray.
print(type(data_array))

<class 'numpy.ndarray'>


In [50]:
# Print the array: the same values as the frame, without index or column labels.
print(data_array)

[[44. 65. 10.]
 [67. 96. 99.]
 [68. 71. 65.]
 [36. 57. 29.]
 [65. 87. 40.]]


In [51]:
# Shape of the DataFrame as (rows, columns).
print(my_data.shape)

(5, 3)


In [53]:
# Shape of the NumPy array -- the same (rows, columns) as the DataFrame it came from.
data_array.shape

(5, 3)

In [54]:
# Element at row position 2, column position 2, using a single (row, column) index.
data_array[2, 2]

65.0

In [55]:
# Numpy arrays back into dataframe

In [56]:
# Build a DataFrame from the NumPy array, naming the three columns and storing values as int32.
new_df = pd.DataFrame(
    data=data_array,
    columns=["Physics", "Chem", "Maths"],
    dtype='int32',
)

In [57]:
# Display the rebuilt frame with the new column names and integer values.
new_df

Unnamed: 0,Physics,Chem,Maths
0,44,65,10
1,67,96,99
2,68,71,65
3,36,57,29
4,65,87,40


In [58]:
# Write to PCM.csv with to_csv's defaults, which include the row index as the first column.
new_df.to_csv("PCM.csv")

In [60]:
# Write PCM.csv again (overwriting the existing file), this time without the index column.
new_df.to_csv("PCM.csv", index=False)

In [61]:
# Read PCM.csv back. The file was last written without an index, so no 'Unnamed: 0' column appears.
pcm = pd.read_csv('PCM.csv')

In [62]:
# Rich (table) display of the frame read from PCM.csv.
pcm

Unnamed: 0,Physics,Chem,Maths
0,44,65,10
1,67,96,99
2,68,71,65
3,36,57,29
4,65,87,40


In [63]:
# Plain-text rendering of the same frame via print().
print(pcm)

   Physics  Chem  Maths
0       44    65     10
1       67    96     99
2       68    71     65
3       36    57     29
4       65    87     40
