# Pandas
"pandas is a software library written for the Python programming language for data manipulation and analysis. In particular, it offers data structures and operations for manipulating numerical tables and time series." Wikipedia

In [301]:
#we import numpy and pandas
import numpy as np
import pandas as pd

# Series
we can see a pandas serie as a sort of One dimensional array with labels

In [302]:
indexes = [1,2,3]
data = np.array(["abdessamad","youssef","nabil"])

In [303]:
#let's create a pandas serie based on our numpy array
serie_1 = pd.Series(data)
serie_1

0    abdessamad
1       youssef
2         nabil
dtype: object

In [304]:
#let's retrieve values from the serie based on indexes
print(serie_1[0],serie_1[1],serie_1[2])

abdessamad youssef nabil


In [305]:
#let's create a pandas serie based on our numpy array and by adding the parameter indexes 
serie_2 = pd.Series(data,index=indexes)
serie_2

1    abdessamad
2       youssef
3         nabil
dtype: object

In [306]:
#we can also create a pandas serie based on a dictionnary and we get the same result
dico = {"One":"abdessamad","Two":"youssef","Three":"nabil"}
serie_3 = pd.Series(dico)
serie_3

One      abdessamad
Two         youssef
Three         nabil
dtype: object

In [307]:
#let's retrieve values from the serie based on the indexes we chose
serie_3["One"],serie_3["Two"],serie_3["Three"]

('abdessamad', 'youssef', 'nabil')

In [308]:
# a series can hold any type of data 
pd.Series(data=indexes)

0    1
1    2
2    3
dtype: int64

In [309]:
#we can even pass in built in function 
pd.Series(data=[min,max])

0    <built-in function min>
1    <built-in function max>
dtype: object

In [310]:
#we create a pandas serie containning marks of the first part of an exam,as we we can see in the parameters the order does not matter if we specify the name of the parameter
marks_first_part_exam = pd.Series(index=["abdessamad","youssef","nabil"],data=[9,7,6])
marks_first_part_exam

abdessamad    9
youssef       7
nabil         6
dtype: int64

In [311]:
#we create a pandas serie containning marks of the second part of an exam,we dont have to name the parameters but the order will matter if we do so
marks_second_part_exam = pd.Series([9,5,9],["abdessamad","youssef","driss"])
marks_second_part_exam

abdessamad    9
youssef       5
driss         9
dtype: int64

In [312]:
#we can see when an index is present in both pandas series we just add the value corresponding to that index,but if an index is present in one serie and not in the other serie we get a nan value 
marks_first_part_exam + marks_second_part_exam

abdessamad    18.0
driss          NaN
nabil          NaN
youssef       12.0
dtype: float64

# DataFrame
we can see dataframes as two dimensional labaled arrays 

In [385]:
#let's create a dataframe of the mathematician that won the fiels medal in 2006 and 2010 based on a dictionary 
Fiels_Medal_2006_2010 = {"Country":["Russia","Russia","Australia","France","Israel","France","Russia","France"],
                         "Mathematician":["Andrei Okounkov ","Grigori Perelman ","Terence Tao ","Wendelin Werner ","Elon Lindenstrauss ","Ngô Bảo Châu ","Stanislav Smirnov "," Cédric Villani"],
                          "population in millions":[144.5,144.5,66.99,24.99,8.884,66.99,144.5,66.99]   
                        }
pd.DataFrame(Fiels_Medal_2006_2010)

Unnamed: 0,Country,Mathematician,population in millions
0,Russia,Andrei Okounkov,144.5
1,Russia,Grigori Perelman,144.5
2,Australia,Terence Tao,66.99
3,France,Wendelin Werner,24.99
4,Israel,Elon Lindenstrauss,8.884
5,France,Ngô Bảo Châu,66.99
6,Russia,Stanislav Smirnov,144.5
7,France,Cédric Villani,66.99


In [313]:
#this time we create a dataframe based on a numpy array
data_frame = pd.DataFrame(data=np.random.randint(12,18,(4,4)),columns=["advanced Liner algebra","statistics","probability","Data analysis"],index=["abdessamad","youssef","taha","nabil"])
print(data_frame,"\n\n"+str(type(data_frame)))

            advanced Liner algebra  statistics  probability  Data analysis
abdessamad                      16          15           14             16
youssef                         15          16           17             17
taha                            16          12           12             13
nabil                           15          15           13             12 

<class 'pandas.core.frame.DataFrame'>


In [314]:
#each column of the dataframe is a pandas serie
serie1 = data_frame["advanced Liner algebra"]
print(serie1,"\n\n"+str(type(serie1)))

abdessamad    16
youssef       15
taha          16
nabil         15
Name: advanced Liner algebra, dtype: int64 

<class 'pandas.core.series.Series'>


In [315]:
#let's get multiple columns by passing on a list of the labels of colums,once we retrieve more than one column we will get a pandas dataframe and not a serie 
print(data_frame[["advanced Liner algebra","statistics"]],"\n\n"+str(type(data_frame[["advanced Liner algebra","statistics"]])))



            advanced Liner algebra  statistics
abdessamad                      16          15
youssef                         15          16
taha                            16          12
nabil                           15          15 

<class 'pandas.core.frame.DataFrame'>


In [316]:
#let's add some columns to our dataframe
data_frame["ML"] = np.random.randint(10,18,4)
print(data_frame)

            advanced Liner algebra  statistics  probability  Data analysis  ML
abdessamad                      16          15           14             16  14
youssef                         15          16           17             17  15
taha                            16          12           12             13  14
nabil                           15          15           13             12  14


In [317]:
#now let's remove a column,axis=1 allows us to tell pandas that "saad" is present on the columns,inplace=True allows us to modify the dataframe permanently
data_frame.drop("ML",axis=1,inplace=True)
print(data_frame)

            advanced Liner algebra  statistics  probability  Data analysis
abdessamad                      16          15           14             16
youssef                         15          16           17             17
taha                            16          12           12             13
nabil                           15          15           13             12


In [318]:
#let's select rows from our dataframe,the loc method allows us to retrieve rows and columns by label
data_frame.loc["abdessamad"]

advanced Liner algebra    16
statistics                15
probability               14
Data analysis             16
Name: abdessamad, dtype: int64

In [319]:
#now we select rows using the iloc method,wich is based on the location and not a label
print(data_frame.iloc[0],"\n\n"+str(type(data_frame.iloc[0])))

advanced Liner algebra    16
statistics                15
probability               14
Data analysis             16
Name: abdessamad, dtype: int64 

<class 'pandas.core.series.Series'>


In [320]:
#we select multiple rows by passing the labels in a list
print(data_frame.loc[["abdessamad","nabil"]])

            advanced Liner algebra  statistics  probability  Data analysis
abdessamad                      16          15           14             16
nabil                           15          15           13             12


In [321]:
#we select multiple rows by passing a list integers corresponding to the location of the row 
print(data_frame.iloc[[0,1]])

            advanced Liner algebra  statistics  probability  Data analysis
abdessamad                      16          15           14             16
youssef                         15          16           17             17


In [322]:
#let's add a new row to our dataframe
data_frame.loc["saad"] = np.random.randint(10,18,4)
print(data_frame)

            advanced Liner algebra  statistics  probability  Data analysis
abdessamad                      16          15           14             16
youssef                         15          16           17             17
taha                            16          12           12             13
nabil                           15          15           13             12
saad                            14          17           13             14


In [323]:
#now let's drop a row from our pandas dataframe,axis=0 allows us to specify that we want to delete a row and inplace allows us to make a permanent delete in our dataframe
data_frame.drop("saad",axis=0,inplace=True)
print(data_frame)

            advanced Liner algebra  statistics  probability  Data analysis
abdessamad                      16          15           14             16
youssef                         15          16           17             17
taha                            16          12           12             13
nabil                           15          15           13             12


In [324]:
#let's look on how we can subset rows and columns
#first method ,label based
print(data_frame.loc[["nabil","taha"] ,["statistics","probability"]])

       statistics  probability
nabil          15           13
taha           12           12


In [325]:
#second methid ,index based
print(data_frame.iloc[[2,3],[1,3]])

       statistics  Data analysis
taha           12             13
nabil          15             12


In [326]:
#new we will look at conditional selection

In [327]:
marks = data_frame
marks["ML"] = np.random.randint(10,20,4)
marks.loc["saad"] = np.random.randint(10,18,5)

In [328]:
print(marks)

            advanced Liner algebra  statistics  probability  Data analysis  ML
abdessamad                      16          15           14             16  13
youssef                         15          16           17             17  19
taha                            16          12           12             13  10
nabil                           15          15           13             12  15
saad                            14          15           15             14  11


In [329]:
#let's look at that marks that are superior to 15,where we have true the mark is superior to 15
marks>15

Unnamed: 0,advanced Liner algebra,statistics,probability,Data analysis,ML
abdessamad,True,False,False,True,False
youssef,False,True,True,True,True
taha,True,False,False,False,False
nabil,False,False,False,False,False
saad,False,False,False,False,False


In [330]:
#we selected the marks that are superior to 15 
marks[marks>15]

Unnamed: 0,advanced Liner algebra,statistics,probability,Data analysis,ML
abdessamad,16.0,,,16.0,
youssef,,16.0,17.0,17.0,19.0
taha,16.0,,,,
nabil,,,,,
saad,,,,,


In [331]:
#let's see the marks that are superior to 15 in the subject statistics ,we will only be filtring the subject of statistic
marks["statistics"]>15

abdessamad    False
youssef        True
taha          False
nabil         False
saad          False
Name: statistics, dtype: bool

In [332]:
#we only get the students that got a mark superior to 15 in statistics
marks[marks["statistics"]>15]

Unnamed: 0,advanced Liner algebra,statistics,probability,Data analysis,ML
youssef,15,16,17,17,19


In [333]:
marks[marks["statistics"]>15]["statistics"]

youssef    16
Name: statistics, dtype: int64

In [334]:
marks[marks["statistics"]>15][["statistics","probability"]]

Unnamed: 0,statistics,probability
youssef,16,17


In [335]:
print(marks)

            advanced Liner algebra  statistics  probability  Data analysis  ML
abdessamad                      16          15           14             16  13
youssef                         15          16           17             17  19
taha                            16          12           12             13  10
nabil                           15          15           13             12  15
saad                            14          15           15             14  11


In [336]:
#let's try to select student that are brilliant in both of statistics and probability 
marks[(marks["statistics"]>13) & (marks["probability"]>13)]

Unnamed: 0,advanced Liner algebra,statistics,probability,Data analysis,ML
abdessamad,16,15,14,16,13
youssef,15,16,17,17,19
saad,14,15,15,14,11


In [337]:
#let's try to select student that are brilliant in statistics or probability 
marks[(marks["statistics"]>15) | (marks["probability"]>15)]

Unnamed: 0,advanced Liner algebra,statistics,probability,Data analysis,ML
youssef,15,16,17,17,19


In [338]:
#let's look at how we could change our our indexes into a column
students_marks = marks.reset_index()
students_marks

Unnamed: 0,index,advanced Liner algebra,statistics,probability,Data analysis,ML
0,abdessamad,16,15,14,16,13
1,youssef,15,16,17,17,19
2,taha,16,12,12,13,10
3,nabil,15,15,13,12,15
4,saad,14,15,15,14,11


In [339]:
#let's try to rename that column
students_marks.rename(columns={"index":"students"})

Unnamed: 0,students,advanced Liner algebra,statistics,probability,Data analysis,ML
0,abdessamad,16,15,14,16,13
1,youssef,15,16,17,17,19
2,taha,16,12,12,13,10
3,nabil,15,15,13,12,15
4,saad,14,15,15,14,11


In [340]:
#we can also rename indexes
students_marks.rename(index={0:"A",1:"B",2:"C",3:"D",4:"E"})

Unnamed: 0,index,advanced Liner algebra,statistics,probability,Data analysis,ML
A,abdessamad,16,15,14,16,13
B,youssef,15,16,17,17,19
C,taha,16,12,12,13,10
D,nabil,15,15,13,12,15
E,saad,14,15,15,14,11


In [341]:
#let's try now to rename both rows and columns in the very same line and put inplace=True so that we can keep the changement
students_marks.rename(columns={"index":"students"},index={0:"A",1:"B",2:"C",3:"D",4:"E"},inplace=True)
students_marks

Unnamed: 0,students,advanced Liner algebra,statistics,probability,Data analysis,ML
A,abdessamad,16,15,14,16,13
B,youssef,15,16,17,17,19
C,taha,16,12,12,13,10
D,nabil,15,15,13,12,15
E,saad,14,15,15,14,11


In [342]:
#let's see how we could make a column as an index ,the best column in our table suited for indexing is students
students_marks.set_index("students",inplace=True)
students_marks

Unnamed: 0_level_0,advanced Liner algebra,statistics,probability,Data analysis,ML
students,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
abdessamad,16,15,14,16,13
youssef,15,16,17,17,19
taha,16,12,12,13,10
nabil,15,15,13,12,15
saad,14,15,15,14,11


In [343]:
#Multi-indexing

In [344]:
#let's look at how we could create a multi-index dataframe
first_layer = ["team_A","team_A","team_B","team_B","team_C","team_C"]
second_layer = ["Abdessamad","Noah","Benjamin","Logan","Ava","Adam"]
index = list(zip(first_layer,second_layer))
index = pd.MultiIndex.from_tuples(index)

In [345]:
index.reindex(index)

(MultiIndex([('team_A', 'Abdessamad'),
             ('team_A',       'Noah'),
             ('team_B',   'Benjamin'),
             ('team_B',      'Logan'),
             ('team_C',        'Ava'),
             ('team_C',       'Adam')],
            ),
 None)

In [346]:
project_mark = pd.DataFrame(np.random.randint(10,15,(6,2)),index,["Presentation","Report"])
project_mark

Unnamed: 0,Unnamed: 1,Presentation,Report
team_A,Abdessamad,10,10
team_A,Noah,12,14
team_B,Benjamin,10,14
team_B,Logan,13,10
team_C,Ava,11,11
team_C,Adam,11,10


In [347]:
#let's retrieve team_A
project_mark.loc["team_A"]

Unnamed: 0,Presentation,Report
Abdessamad,10,10
Noah,12,14


In [348]:
#let's retrieve an element of team A
project_mark.loc["team_A"].loc["Abdessamad"]

Presentation    10
Report          10
Name: Abdessamad, dtype: int64

In [349]:
#let's now try to retrieve the mark of the student abdessamad in Presentation part
project_mark.loc["team_A"].loc["Abdessamad"]["Presentation"]

10

In [350]:
#let's name our indexes
project_mark.index.names = ["Team","Student"]
project_mark

Unnamed: 0_level_0,Unnamed: 1_level_0,Presentation,Report
Team,Student,Unnamed: 2_level_1,Unnamed: 3_level_1
team_A,Abdessamad,10,10
team_A,Noah,12,14
team_B,Benjamin,10,14
team_B,Logan,13,10
team_C,Ava,11,11
team_C,Adam,11,10


In [351]:
project_mark.xs("team_A",level="Team")

Unnamed: 0_level_0,Presentation,Report
Student,Unnamed: 1_level_1,Unnamed: 2_level_1
Abdessamad,10,10
Noah,12,14


In [352]:
project_mark.xs("Abdessamad",level="Student")

Unnamed: 0_level_0,Presentation,Report
Team,Unnamed: 1_level_1,Unnamed: 2_level_1
team_A,10,10


In [353]:
print(project_mark.xs("Abdessamad",level="Student")["Presentation"],project_mark.xs("Abdessamad",level="Student")["Report"])


Team
team_A    10
Name: Presentation, dtype: int64 Team
team_A    10
Name: Report, dtype: int64


In [354]:
#missing data

In [355]:
marks_arr = [np.random.randint(10,18,5),
             np.append(np.nan,np.random.randint(10,18,4)),
             np.random.randint(10,18,4),             
             np.append(np.nan,np.random.randint(10,18,3))
            ]

In [356]:
homework = pd.DataFrame(marks_arr)
homework

Unnamed: 0,0,1,2,3,4
0,15.0,16.0,17.0,14.0,17.0
1,,15.0,16.0,16.0,17.0
2,17.0,15.0,16.0,11.0,
3,,13.0,13.0,14.0,


In [357]:
homework.columns = ["assignment1","assignment2","assignment3","assignment4","assignment5"]
homework.index = ["Abdessamad","Noah","Benjamin","Logan"]
homework.index.name = "Student"

In [358]:
#this data table shows us marks of homeworks assignment that student had to work on during the semester 
#students are allowed are only obliged to work on three assignments during the semester ,a nan value means a studnet chose to not work on that assignment
homework

Unnamed: 0_level_0,assignment1,assignment2,assignment3,assignment4,assignment5
Student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Abdessamad,15.0,16.0,17.0,14.0,17.0
Noah,,15.0,16.0,16.0,17.0
Benjamin,17.0,15.0,16.0,11.0,
Logan,,13.0,13.0,14.0,


In [359]:
#let's drop any row that has a nan value,we can see that only one student went the extra mile and chose to work on all assignment
homework.dropna(axis=0)

Unnamed: 0_level_0,assignment1,assignment2,assignment3,assignment4,assignment5
Student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Abdessamad,15.0,16.0,17.0,14.0,17.0


In [360]:
#let's drop any column that has a nan value,we can see that all student chose to work on assignment 2,3,4
homework.dropna(axis=1)

Unnamed: 0_level_0,assignment2,assignment3,assignment4
Student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Abdessamad,16.0,17.0,14.0
Noah,15.0,16.0,16.0
Benjamin,15.0,16.0,11.0
Logan,13.0,13.0,14.0


In [361]:
#we add a new students to our data table
n = [np.nan,np.nan,np.nan]
homework.loc["Adam"] = np.append(n,np.random.randint(10,16,2))
homework.loc["Anya"] = np.append(n,np.random.randint(10,16,2))
homework.loc["Zita"] = np.append(n,np.random.randint(10,16,2))
homework.loc["Mehdi"] = np.append(np.random.randint(10,16,4),np.nan)
homework.loc["Mike"] = np.append(np.random.randint(10,16,4),np.nan)
homework.loc["David"] = np.append(np.nan,np.random.randint(10,16,4))
homework.loc["jonathan"] = np.append(np.nan,np.random.randint(10,16,4))
homework

Unnamed: 0_level_0,assignment1,assignment2,assignment3,assignment4,assignment5
Student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Abdessamad,15.0,16.0,17.0,14.0,17.0
Noah,,15.0,16.0,16.0,17.0
Benjamin,17.0,15.0,16.0,11.0,
Logan,,13.0,13.0,14.0,
Adam,,,,15.0,12.0
Anya,,,,11.0,11.0
Zita,,,,15.0,11.0
Mehdi,13.0,10.0,10.0,11.0,
Mike,10.0,12.0,12.0,12.0,
David,,15.0,12.0,15.0,11.0


In [362]:
#during a semester student are obliged to work on at least three assignment if not they will fail the subject
#by observing our data table we can see that some students worked on only two projects so they failed the subject
#let's drop all students that failed the subject from our data table
#thresh=3 means each individual require at least 3(assignment) non nan value to net get droped from out data table
homework.dropna(axis=0,thresh=3,inplace=True)
homework

Unnamed: 0_level_0,assignment1,assignment2,assignment3,assignment4,assignment5
Student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Abdessamad,15.0,16.0,17.0,14.0,17.0
Noah,,15.0,16.0,16.0,17.0
Benjamin,17.0,15.0,16.0,11.0,
Logan,,13.0,13.0,14.0,
Mehdi,13.0,10.0,10.0,11.0,
Mike,10.0,12.0,12.0,12.0,
David,,15.0,12.0,15.0,11.0
jonathan,,14.0,12.0,15.0,11.0


In [363]:
#The professors thought that some assignments maybe too hard ,they decided to get rid of those assignments so that 
#next year students wont have to struggle with them
homework.dropna(axis=1,thresh=5)

Unnamed: 0_level_0,assignment2,assignment3,assignment4
Student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Abdessamad,16.0,17.0,14.0
Noah,15.0,16.0,16.0
Benjamin,15.0,16.0,11.0
Logan,13.0,13.0,14.0
Mehdi,10.0,10.0,11.0
Mike,12.0,12.0,12.0
David,15.0,12.0,15.0
jonathan,14.0,12.0,15.0


In [364]:
#let's go back to our data table
homework

Unnamed: 0_level_0,assignment1,assignment2,assignment3,assignment4,assignment5
Student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Abdessamad,15.0,16.0,17.0,14.0,17.0
Noah,,15.0,16.0,16.0,17.0
Benjamin,17.0,15.0,16.0,11.0,
Logan,,13.0,13.0,14.0,
Mehdi,13.0,10.0,10.0,11.0,
Mike,10.0,12.0,12.0,12.0,
David,,15.0,12.0,15.0,11.0
jonathan,,14.0,12.0,15.0,11.0


In [365]:
#now we will calculate the final mark of each student 
#by observing our dataframe we can see that we have some missing values of some assignment that werent done by students
#the assignment that students chose not work on will just get the mark 10
homework.fillna(value=10)
homework

Unnamed: 0_level_0,assignment1,assignment2,assignment3,assignment4,assignment5
Student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Abdessamad,15.0,16.0,17.0,14.0,17.0
Noah,,15.0,16.0,16.0,17.0
Benjamin,17.0,15.0,16.0,11.0,
Logan,,13.0,13.0,14.0,
Mehdi,13.0,10.0,10.0,11.0,
Mike,10.0,12.0,12.0,12.0,
David,,15.0,12.0,15.0,11.0
jonathan,,14.0,12.0,15.0,11.0


In [366]:
homework_copy = homework.copy()

In [367]:
#let's fill in the missing values by the mean of the marks they got on the other assignments
for student in homework.index:
    homework.loc[student].fillna(value=np.mean(homework.loc[student]),inplace=True)
homework    

Unnamed: 0_level_0,assignment1,assignment2,assignment3,assignment4,assignment5
Student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Abdessamad,15.0,16.0,17.0,14.0,17.0
Noah,16.0,15.0,16.0,16.0,17.0
Benjamin,17.0,15.0,16.0,11.0,14.75
Logan,13.333333,13.0,13.0,14.0,13.333333
Mehdi,13.0,10.0,10.0,11.0,11.0
Mike,10.0,12.0,12.0,12.0,11.5
David,13.25,15.0,12.0,15.0,11.0
jonathan,13.0,14.0,12.0,15.0,11.0


In [368]:
#let's fill in the missing values by the mean of the marks they got on the other assignments,this code is equivalent
#to the loop we used 
homework_copy.loc["Noah"].fillna(value=np.mean(homework_copy.loc["Noah"]),inplace=True)
homework_copy.loc["Benjamin"].fillna(value=np.mean(homework_copy.loc["Benjamin"]),inplace=True)
homework_copy.loc["Logan"].fillna(value=np.mean(homework_copy.loc["Logan"]),inplace=True)
homework_copy.loc["Mehdi"].fillna(value=np.mean(homework_copy.loc["Mehdi"]),inplace=True)
homework_copy.loc["David"].fillna(value=np.mean(homework_copy.loc["David"]),inplace=True)
homework_copy.loc["Mike"].fillna(value=np.mean(homework_copy.loc["Mike"]),inplace=True)
homework_copy.loc["jonathan"].fillna(value=np.mean(homework_copy.loc["jonathan"]),inplace=True)

In [369]:
print(homework_copy == homework)

            assignment1  assignment2  assignment3  assignment4  assignment5
Student                                                                    
Abdessamad         True         True         True         True         True
Noah               True         True         True         True         True
Benjamin           True         True         True         True         True
Logan              True         True         True         True         True
Mehdi              True         True         True         True         True
Mike               True         True         True         True         True
David              True         True         True         True         True
jonathan           True         True         True         True         True


In [370]:
#Aggregation

In [408]:
#Schools = ["School_"+str(i) for i in range(1,5) ]
TOEIC_Score = pd.DataFrame({"School":np.append(["School_"+str(i) for i in range(1,5)],["School_"+str(i) for i in range(4,0,-1)]),
                          "major":["english","engineering","business","english","english","engineering","business","english"],
                          "Student":["Logan","David","jonathan","Noah","abdessamad","nabil","driss","youssef"],
                           "Score":np.random.randint(700,990,8)})
TOEIC_Score                        

Unnamed: 0,School,major,Student,Score
0,School_1,english,Logan,707
1,School_2,engineering,David,702
2,School_3,business,jonathan,865
3,School_4,english,Noah,971
4,School_4,english,abdessamad,962
5,School_3,engineering,nabil,702
6,School_2,business,driss,720
7,School_1,english,youssef,740


In [410]:
#let's grouby the column shcool ,of each school two students took the TOEIC test
TOEIC_Score_by_School = TOEIC_Score.groupby("School")

In [411]:
#the mean of the TOEIC test of each school ,we can clearly see that the columns major and student were ignored 
#the fact that they are not numerical features
TOEIC_Score_by_School.mean()

Unnamed: 0_level_0,Score
School,Unnamed: 1_level_1
School_1,723.5
School_2,711.0
School_3,783.5
School_4,966.5


In [412]:
TOEIC_Score_by_School.sum()

Unnamed: 0_level_0,Score
School,Unnamed: 1_level_1
School_1,1447
School_2,1422
School_3,1567
School_4,1933


In [414]:
TOEIC_Score_by_School.sum().loc["School_2"]

Score    1422
Name: School_2, dtype: int64

In [416]:
TOEIC_Score_by_School.count()

Unnamed: 0_level_0,major,Student,Score
School,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
School_1,2,2,2
School_2,2,2,2
School_3,2,2,2
School_4,2,2,2


In [413]:
TOEIC_Score_by_School.std()

Unnamed: 0_level_0,Score
School,Unnamed: 1_level_1
School_1,23.334524
School_2,12.727922
School_3,115.258405
School_4,6.363961


In [417]:
TOEIC_Score_by_School.max()

Unnamed: 0_level_0,major,Student,Score
School,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
School_1,english,youssef,740
School_2,engineering,driss,720
School_3,engineering,nabil,865
School_4,english,abdessamad,971


In [421]:
TOEIC_Score_by_School.min()

Unnamed: 0_level_0,major,Student,Score
School,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
School_1,english,Logan,707
School_2,business,David,702
School_3,business,jonathan,702
School_4,english,Noah,962


In [419]:
TOEIC_Score_by_School["Score"].max()

School
School_1    740
School_2    720
School_3    865
School_4    971
Name: Score, dtype: int64

In [420]:
TOEIC_Score_by_School["Score"].min()

School
School_1    707
School_2    702
School_3    702
School_4    962
Name: Score, dtype: int64