# Pandas
"pandas is a software library written for the Python programming language for data manipulation and analysis. In particular, it offers data structures and operations for manipulating numerical tables and time series." Wikipedia

In [15]:
#we import numpy and pandas
import numpy as np
import pandas as pd

# Series
A pandas Series can be seen as a one-dimensional array whose values carry labels (the index).

In [16]:
# Raw material for the Series examples below: the values to store and the
# custom labels we will attach to them later on.
data = np.array(["abdessamad", "youssef", "nabil"])
indexes = list(range(1, 4))

In [17]:
# Build a pandas Series from the numpy array. No index is passed, so pandas
# labels the values 0, 1, 2 by default.
serie_1 = pd.Series(data)
serie_1

0    abdessamad
1       youssef
2         nabil
dtype: object

In [18]:
# Look up each value through its default integer label (0, 1, 2) and print
# them on one line.
print(*(serie_1[label] for label in range(3)))

abdessamad youssef nabil


In [19]:
# Build the Series from the same numpy array, this time supplying our own
# labels through the `index` parameter.
serie_2 = pd.Series(data,index=indexes)
serie_2

1    abdessamad
2       youssef
3         nabil
dtype: object

In [20]:
# A Series can also be built from a dictionary: the keys become the index
# labels and the values become the data.
dico = dict(One="abdessamad", Two="youssef", Three="nabil")
serie_3 = pd.Series(data=dico)
serie_3

One      abdessamad
Two         youssef
Three         nabil
dtype: object

In [21]:
# Retrieve the values through the labels we chose (the dictionary keys).
tuple(serie_3[label] for label in ("One", "Two", "Three"))

('abdessamad', 'youssef', 'nabil')

In [22]:
# A Series can hold any type of data; here it is built from a plain list of
# integers.
pd.Series(data=indexes)

0    1
1    2
2    3
dtype: int64

In [23]:
# We can even store built-in functions in a Series (the dtype is then object).
pd.Series(data=[min,max])

0    <built-in function min>
1    <built-in function max>
dtype: object

In [24]:
# Marks for the first part of the exam. Both arguments are passed by keyword,
# so their order does not matter (index is deliberately given before data).
marks_first_part_exam = pd.Series(
    index=["abdessamad", "youssef", "nabil"],
    data=[9, 7, 6],
)
marks_first_part_exam

abdessamad    9
youssef       7
nabil         6
dtype: int64

In [25]:
# Marks for the second part of the exam. The arguments are passed
# positionally this time, so the order matters: data first, then index.
marks_second_part_exam = pd.Series(
    [9, 5, 9],
    ["abdessamad", "youssef", "driss"],
)
marks_second_part_exam

abdessamad    9
youssef       5
driss         9
dtype: int64

In [27]:
# Element-wise sum aligned on the index: labels present in both Series have
# their values added, labels present in only one of them yield NaN.
marks_first_part_exam.add(marks_second_part_exam)

abdessamad    18.0
driss          NaN
nabil          NaN
youssef       12.0
dtype: float64

# DataFrame
A DataFrame can be seen as a two-dimensional labeled array: the rows are labeled by the index and the columns by their names.

In [31]:
# Seed numpy's global random generator so that "Restart & Run All" rebuilds
# exactly the same marks every time; without a seed every run yields a
# different table and the outputs shown in the notebook cannot be reproduced.
np.random.seed(42)

# 4x4 table of random integer marks in [12, 18): one row per student and one
# column per subject.
data_frame = pd.DataFrame(
    data=np.random.randint(12,18,(4,4)),
    columns=["advanced Liner algebra","statistics","probability","Data analysis"],
    index=["abdessamad","youssef","taha","nabil"],
)
# Show the table followed by its Python type.
print(data_frame,"\n\n"+str(type(data_frame)))

            advanced Liner algebra  statistics  probability  Data analysis
abdessamad                      13          12           17             17
youssef                         13          14           12             13
taha                            13          17           16             13
nabil                           16          15           16             17 

<class 'pandas.core.frame.DataFrame'>


In [106]:
# Each column of the DataFrame is a pandas Series: select one column by its
# label and print it followed by its Python type.
serie1 = data_frame["advanced Liner algebra"]
print(serie1,"\n\n"+str(type(serie1)))

abdessamad    15
youssef       12
taha          14
nabil         14
Name: advanced Liner algebra, dtype: int64 

<class 'pandas.core.series.Series'>


In [108]:
# Select several columns by passing a list of column labels. As soon as more
# than one column is requested the result is a DataFrame, not a Series.
two_subjects = data_frame[["advanced Liner algebra", "statistics"]]
print(two_subjects, "\n\n" + str(type(two_subjects)))



            advanced Liner algebra  statistics
abdessamad                      15          16
youssef                         12          13
taha                            14          17
nabil                           14          14 

<class 'pandas.core.frame.DataFrame'>


In [109]:
# Add a new column to the DataFrame by assigning to a column label that does
# not exist yet (one random mark per row).
data_frame["ML"] = np.random.randint(10,18,4)
print(data_frame)

            advanced Liner algebra  statistics  probability  Data analysis  ML
abdessamad                      15          16           14             17  11
youssef                         12          13           14             12  17
taha                            14          17           13             16  11
nabil                           14          14           12             16  17


In [110]:
# Remove a column: axis=1 tells pandas that the label "ML" must be looked up
# among the columns, and inplace=True modifies data_frame itself instead of
# returning a new DataFrame.
data_frame.drop("ML",axis=1,inplace=True)
print(data_frame)

            advanced Liner algebra  statistics  probability  Data analysis
abdessamad                      15          16           14             17
youssef                         12          13           14             12
taha                            14          17           13             16
nabil                           14          14           12             16


In [111]:
# Select a row from the DataFrame: .loc retrieves rows (and columns) by label.
data_frame.loc["abdessamad"]

advanced Liner algebra    15
statistics                16
probability               14
Data analysis             17
Name: abdessamad, dtype: int64

In [132]:
# Select a row with .iloc, which is based on the integer position of the row
# rather than on its label; a single row comes back as a Series.
print(data_frame.iloc[0],"\n\n"+str(type(data_frame.iloc[0])))

advanced Liner algebra    15
statistics                16
probability               14
Data analysis             17
Name: abdessamad, dtype: int64 

<class 'pandas.core.series.Series'>


In [117]:
# Select several rows at once by passing their labels in a list.
print(data_frame.loc[["abdessamad","nabil"]])

            advanced Liner algebra  statistics  probability  Data analysis
abdessamad                      15          16           14             17
nabil                           14          14           12             16


In [118]:
# Select several rows at once by passing a list of integer positions.
print(data_frame.iloc[[0,1]])

            advanced Liner algebra  statistics  probability  Data analysis
abdessamad                      15          16           14             17
youssef                         12          13           14             12


In [125]:
# Add a new row by assigning to a row label that does not exist yet (one
# random mark per column).
data_frame.loc["saad"] = np.random.randint(10,18,4)
print(data_frame)

            advanced Liner algebra  statistics  probability  Data analysis
abdessamad                      15          16           14             17
youssef                         12          13           14             12
taha                            14          17           13             16
nabil                           14          14           12             16
saad                            14          17           14             13


In [126]:
# Drop a row: axis=0 tells pandas that "saad" is a row label, and inplace=True
# deletes the row from data_frame itself instead of returning a new DataFrame.
data_frame.drop("saad",axis=0,inplace=True)
print(data_frame)

            advanced Liner algebra  statistics  probability  Data analysis
abdessamad                      15          16           14             17
youssef                         12          13           14             12
taha                            14          17           13             16
nabil                           14          14           12             16


In [144]:
# Subsetting rows and columns at the same time.
# First method, label-based: .loc[row_labels, column_labels].
print(data_frame.loc[["nabil","taha"] ,["statistics","probability"]])

       statistics  probability
nabil          14           12
taha           17           13


In [148]:
# Second method, position-based: .iloc[row_positions, column_positions].
print(data_frame.iloc[[2,3],[1,3]])

       statistics  Data analysis
taha           17             16
nabil          14             16


In [149]:
# Now we will look at conditional selection.

In [None]:
# Work on an independent copy. A plain assignment (marks = data_frame) would
# only create a second name for the same object, so adding the "ML" column and
# the "saad" row below would silently modify data_frame as well.
marks = data_frame.copy()
# One random "ML" mark per student currently in the table.
marks["ML"] = np.random.randint(10,20,len(marks))
# New student "saad": one random mark per column (the new "ML" column included).
marks.loc["saad"] = np.random.randint(10,18,marks.shape[1])

In [157]:
# Show the marks table that the conditional selections are applied to.
print(marks)

            advanced Liner algebra  statistics  probability  Data analysis  ML
abdessamad                      15          16           14             17  13
youssef                         12          13           14             12  15
taha                            14          17           13             16  16
nabil                           14          14           12             16  18
saad                            10          11           14             10  13


In [161]:
# Compare every mark with 15: the result is a boolean DataFrame holding True
# wherever the mark is strictly greater than 15.
marks>15

Unnamed: 0,advanced Liner algebra,statistics,probability,Data analysis,ML
abdessamad,False,True,False,True,False
youssef,False,False,False,False,False
taha,False,True,False,True,True
nabil,False,False,False,True,True
saad,False,False,False,False,False


In [162]:
# Use the boolean DataFrame as a mask: marks greater than 15 are kept and
# every other cell becomes NaN.
marks[marks>15]

Unnamed: 0,advanced Liner algebra,statistics,probability,Data analysis,ML
abdessamad,,16.0,,17.0,
youssef,,,,,
taha,,17.0,,16.0,16.0
nabil,,,,16.0,18.0
saad,,,,,


In [164]:
# Apply the condition to the "statistics" column only: the result is a boolean
# Series with one entry per student.
marks["statistics"]>15

abdessamad     True
youssef       False
taha           True
nabil         False
saad          False
Name: statistics, dtype: bool

In [167]:
# Filter rows with a boolean Series: only the students whose statistics mark
# is greater than 15 are kept, with all of their columns.
marks[marks["statistics"]>15]

Unnamed: 0,advanced Liner algebra,statistics,probability,Data analysis,ML
abdessamad,15,16,14,17,13
taha,14,17,13,16,16


In [168]:
# Statistics marks of the students who scored more than 15 in statistics: the
# row filter and the column selection are done in a single .loc call.
marks.loc[marks["statistics"] > 15, "statistics"]

abdessamad    16
taha          17
Name: statistics, dtype: int64

In [170]:
# Statistics and probability marks of the students who scored more than 15 in
# statistics: row filter and column list are passed together to .loc.
marks.loc[marks["statistics"] > 15, ["statistics", "probability"]]

Unnamed: 0,statistics,probability
abdessamad,16,14
taha,17,13


In [172]:
# Show the full marks table again for reference.
print(marks)

            advanced Liner algebra  statistics  probability  Data analysis  ML
abdessamad                      15          16           14             17  13
youssef                         12          13           14             12  15
taha                            14          17           13             16  16
nabil                           14          14           12             16  18
saad                            10          11           14             10  13


In [174]:
# Select the students who do well in both statistics and probability:
# & combines the two conditions (element-wise AND), and each condition is
# wrapped in its own parentheses. Note that the threshold used here is 13.
marks[(marks["statistics"]>13) & (marks["probability"]>13)]

Unnamed: 0,advanced Liner algebra,statistics,probability,Data analysis,ML
abdessamad,15,16,14,17,13


In [176]:
# Select the students who do well in statistics or in probability (mark above
# 15 in at least one of them): | combines the two conditions (element-wise OR).
marks[(marks["statistics"]>15) | (marks["probability"]>15)]

Unnamed: 0,advanced Liner algebra,statistics,probability,Data analysis,ML
abdessamad,15,16,14,17,13
taha,14,17,13,16,16


In [191]:
# Turn the index into a regular column: reset_index() returns a new DataFrame
# with a default integer index and the old labels in a column named "index".
students_marks = marks.reset_index()
students_marks

Unnamed: 0,index,advanced Liner algebra,statistics,probability,Data analysis,ML
0,abdessamad,15,16,14,17,13
1,youssef,12,13,14,12,15
2,taha,14,17,13,16,16
3,nabil,14,14,12,16,18
4,saad,10,11,14,10,13


In [192]:
# Rename the "index" column to "students". Without inplace=True the renamed
# DataFrame is only returned (and displayed); students_marks is unchanged.
students_marks.rename(columns={"index":"students"})

Unnamed: 0,students,advanced Liner algebra,statistics,probability,Data analysis,ML
0,abdessamad,15,16,14,17,13
1,youssef,12,13,14,12,15
2,taha,14,17,13,16,16
3,nabil,14,14,12,16,18
4,saad,10,11,14,10,13


In [193]:
# Row labels can be renamed as well, through the index parameter; again the
# result is only returned and students_marks itself is left unchanged.
students_marks.rename(index={0:"A",1:"B",2:"C",3:"D",4:"E"})

Unnamed: 0,index,advanced Liner algebra,statistics,probability,Data analysis,ML
A,abdessamad,15,16,14,17,13
B,youssef,12,13,14,12,15
C,taha,14,17,13,16,16
D,nabil,14,14,12,16,18
E,saad,10,11,14,10,13


In [194]:
# Rename both the column and the row labels in a single call; inplace=True
# makes the change persist in students_marks.
students_marks.rename(columns={"index":"students"},index={0:"A",1:"B",2:"C",3:"D",4:"E"},inplace=True)
students_marks

Unnamed: 0,students,advanced Liner algebra,statistics,probability,Data analysis,ML
A,abdessamad,15,16,14,17,13
B,youssef,12,13,14,12,15
C,taha,14,17,13,16,16
D,nabil,14,14,12,16,18
E,saad,10,11,14,10,13


In [195]:
# Make a column the index: "students" is the column best suited for that.
# inplace=True applies the change to students_marks itself.
students_marks.set_index("students",inplace=True)
students_marks

Unnamed: 0_level_0,advanced Liner algebra,statistics,probability,Data analysis,ML
students,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
abdessamad,15,16,14,17,13
youssef,12,13,14,12,15
taha,14,17,13,16,16
nabil,14,14,12,16,18
saad,10,11,14,10,13
