# Pandas
"pandas is a software library written for the Python programming language for data manipulation and analysis. In particular, it offers data structures and operations for manipulating numerical tables and time series." Wikipedia

In [737]:
#we import numpy and pandas
import numpy as np
import pandas as pd

# Series
We can think of a pandas Series as a kind of one-dimensional array with labels.

In [738]:
# Labels that will later serve as a custom index, and the names stored in a NumPy array
indexes = list(range(1, 4))
data = np.array(["abdessamad", "youssef", "nabil"])

In [739]:
# Create a pandas Series from our NumPy array; no index is given here
serie_1 = pd.Series(data)
serie_1

0    abdessamad
1       youssef
2         nabil
dtype: object

In [740]:
# Retrieve values by index label; serie_1 was built without an explicit index, so its labels are 0, 1 and 2
print(serie_1[0],serie_1[1],serie_1[2])

abdessamad youssef nabil


In [741]:
# Create a Series from the same NumPy array, this time passing our own labels through the index parameter
serie_2 = pd.Series(data,index=indexes)
serie_2

1    abdessamad
2       youssef
3         nabil
dtype: object

In [742]:
# A Series can also be built from a dictionary: the keys become the index labels
dico = dict(One="abdessamad", Two="youssef", Three="nabil")
serie_3 = pd.Series(data=dico)
serie_3

One      abdessamad
Two         youssef
Three         nabil
dtype: object

In [743]:
# Retrieve values from the Series using the labels we chose as index
serie_3["One"],serie_3["Two"],serie_3["Three"]

('abdessamad', 'youssef', 'nabil')

In [744]:
# A Series can hold any type of data, for example the integers of our indexes list
pd.Series(data=indexes)

0    1
1    2
2    3
dtype: int64

In [745]:
# We can even store built-in functions in a Series
pd.Series(data=[min,max])

0    <built-in function min>
1    <built-in function max>
dtype: object

In [746]:
# Create a Series containing the marks of the first part of an exam.
# Both arguments are passed by keyword (index=, data=), so the order in which they are written does not matter.
marks_first_part_exam = pd.Series(index=["abdessamad","youssef","nabil"],data=[9,7,6])
marks_first_part_exam

abdessamad    9
youssef       7
nabil         6
dtype: int64

In [747]:
# Create a Series containing the marks of the second part of an exam.
# The arguments can also be passed positionally, but then the order matters: the data first, then the index.
marks_second_part_exam = pd.Series([9,5,9],["abdessamad","youssef","driss"])
marks_second_part_exam

abdessamad    9
youssef       5
driss         9
dtype: int64

In [748]:
# Adding two Series aligns them on their index labels: when a label is present in both Series the two values
# are added, and when a label is present in only one of them the result for that label is NaN
marks_first_part_exam + marks_second_part_exam

abdessamad    18.0
driss          NaN
nabil          NaN
youssef       12.0
dtype: float64

# DataFrame
We can think of DataFrames as two-dimensional labeled arrays.

In [749]:
# Create a DataFrame of the mathematicians who won the Fields Medal in 2006 and 2010 from a dictionary.
# Each key becomes a column and each list (all of length 8) provides the values of that column.
Fiels_Medal_2006_2010 = {
    "Country": ["Russia", "Russia", "Australia", "France", "Israel", "France", "Russia", "France"],
    # Names are written without stray leading/trailing spaces so that exact-match lookups on this column work
    "Mathematician": [
        "Andrei Okounkov",
        "Grigori Perelman",
        "Terence Tao",
        "Wendelin Werner",
        "Elon Lindenstrauss",
        "Ngô Bảo Châu",
        "Stanislav Smirnov",
        "Cédric Villani",
    ],
    "population_in_millions": [144.5, 144.5, 24.99, 66.99, 8.884, 66.99, 144.5, 66.99],
}
df = pd.DataFrame(Fiels_Medal_2006_2010)
df

Unnamed: 0,Country,Mathematician,population_in_millions
0,Russia,Andrei Okounkov,144.5
1,Russia,Grigori Perelman,144.5
2,Australia,Terence Tao,24.99
3,France,Wendelin Werner,66.99
4,Israel,Elon Lindenstrauss,8.884
5,France,Ngô Bảo Châu,66.99
6,Russia,Stanislav Smirnov,144.5
7,France,Cédric Villani,66.99


In [750]:
# The shape attribute gives the number of rows and columns: (8, 3) means 8 rows and 3 columns
df.shape

(8, 3)

In [751]:
# The columns attribute holds the column labels of the DataFrame
df.columns

Index(['Country', 'Mathematician', 'population_in_millions'], dtype='object')

In [752]:
# Look at the data type of each column of our DataFrame
df.dtypes

Country                    object
Mathematician              object
population_in_millions    float64
dtype: object

In [753]:
# Look at the unique values of one column of our DataFrame
df["Country"].unique()

array(['Russia', 'Australia', 'France', 'Israel'], dtype=object)

In [754]:
# Count the number of rows for each (Country, population_in_millions) pair
df.groupby(['Country','population_in_millions']).size()

Country    population_in_millions
Australia  24.990                    1
France     66.990                    3
Israel     8.884                     1
Russia     144.500                   3
dtype: int64

In [755]:
# This time the DataFrame is built from a NumPy array of random integer marks drawn from [12, 18)
mark_columns = ["advanced Liner algebra","statistics","probability","Data analysis"]
student_names = ["abdessamad","youssef","taha","nabil"]
random_marks = np.random.randint(12,18,(4,4))
data_frame = pd.DataFrame(data=random_marks,index=student_names,columns=mark_columns)
print(data_frame,"\n\n"+str(type(data_frame)))

            advanced Liner algebra  statistics  probability  Data analysis
abdessamad                      14          12           15             15
youssef                         13          14           15             15
taha                            13          17           15             12
nabil                           12          13           15             12 

<class 'pandas.core.frame.DataFrame'>


In [756]:
# Each column of the DataFrame is a pandas Series; the printed type confirms it
serie1 = data_frame["advanced Liner algebra"]
print(serie1,"\n\n"+str(type(serie1)))

abdessamad    14
youssef       13
taha          13
nabil         12
Name: advanced Liner algebra, dtype: int64 

<class 'pandas.core.series.Series'>


In [757]:
# Passing a list of column labels selects several columns at once; the result is a DataFrame, not a Series
two_subjects = data_frame[["advanced Liner algebra","statistics"]]
print(two_subjects,"\n\n"+str(type(two_subjects)))



            advanced Liner algebra  statistics
abdessamad                      14          12
youssef                         13          14
taha                            13          17
nabil                           12          13 

<class 'pandas.core.frame.DataFrame'>


In [758]:
# Add a column to our DataFrame by assigning values to a new column label
data_frame["ML"] = np.random.randint(10,18,4)
print(data_frame)

            advanced Liner algebra  statistics  probability  Data analysis  ML
abdessamad                      14          12           15             15  16
youssef                         13          14           15             15  10
taha                            13          17           15             12  14
nabil                           12          13           15             12  14


In [759]:
# Remove a column: axis=1 tells pandas that the label "ML" is a column label,
# and inplace=True modifies data_frame itself instead of returning a new DataFrame
data_frame.drop("ML",axis=1,inplace=True)
print(data_frame)

            advanced Liner algebra  statistics  probability  Data analysis
abdessamad                      14          12           15             15
youssef                         13          14           15             15
taha                            13          17           15             12
nabil                           12          13           15             12


In [760]:
# Select a row of our DataFrame with the loc indexer, which retrieves rows and columns by label
data_frame.loc["abdessamad"]

advanced Liner algebra    14
statistics                12
probability               15
Data analysis             15
Name: abdessamad, dtype: int64

In [761]:
# Select a row with the iloc indexer, which is based on the integer position and not on the label;
# a single row comes back as a Series
print(data_frame.iloc[0],"\n\n"+str(type(data_frame.iloc[0])))

advanced Liner algebra    14
statistics                12
probability               15
Data analysis             15
Name: abdessamad, dtype: int64 

<class 'pandas.core.series.Series'>


In [762]:
# Select multiple rows by passing their labels in a list
print(data_frame.loc[["abdessamad","nabil"]])

            advanced Liner algebra  statistics  probability  Data analysis
abdessamad                      14          12           15             15
nabil                           12          13           15             12


In [763]:
# Select multiple rows by passing a list of integers giving the positions of the rows
print(data_frame.iloc[[0,1]])

            advanced Liner algebra  statistics  probability  Data analysis
abdessamad                      14          12           15             15
youssef                         13          14           15             15


In [764]:
# Add a new row to our DataFrame by assigning values to a row label that does not exist yet
data_frame.loc["saad"] = np.random.randint(10,18,4)
print(data_frame)

            advanced Liner algebra  statistics  probability  Data analysis
abdessamad                      14          12           15             15
youssef                         13          14           15             15
taha                            13          17           15             12
nabil                           12          13           15             12
saad                            11          11           12             10


In [765]:
# Drop a row from our DataFrame: axis=0 says that "saad" is a row label,
# and inplace=True removes the row from data_frame itself
data_frame.drop("saad",axis=0,inplace=True)
print(data_frame)

            advanced Liner algebra  statistics  probability  Data analysis
abdessamad                      14          12           15             15
youssef                         13          14           15             15
taha                            13          17           15             12
nabil                           12          13           15             12


In [766]:
# Selecting a subset of rows and columns at the same time
# First method, label based: a list of row labels, then a list of column labels
print(data_frame.loc[["nabil","taha"] ,["statistics","probability"]])

       statistics  probability
nabil          13           15
taha           17           15


In [767]:
# Second method, position based: rows at positions 2 and 3, columns at positions 1 and 3
print(data_frame.iloc[[2,3],[1,3]])

       statistics  Data analysis
taha           17             12
nabil          13             12


In [768]:
#now we will look at conditional selection

In [769]:
# Work on an independent copy: a plain assignment would only create a second name for the same object,
# so adding the "ML" column and the "saad" row below would also modify data_frame
marks = data_frame.copy()
# one random ML mark per existing student (4 rows)
marks["ML"] = np.random.randint(10,20,4)
# the new row needs 5 values because the table now has 5 columns
marks.loc["saad"] = np.random.randint(10,18,5)

In [770]:
# Display the marks table we will filter in the next cells
print(marks)

            advanced Liner algebra  statistics  probability  Data analysis  ML
abdessamad                      14          12           15             15  17
youssef                         13          14           15             15  15
taha                            13          17           15             12  14
nabil                           12          13           15             12  12
saad                            13          12           14             13  14


In [771]:
# Compare every mark to 15: True means that the mark is strictly greater than 15
marks>15

Unnamed: 0,advanced Liner algebra,statistics,probability,Data analysis,ML
abdessamad,False,False,False,False,True
youssef,False,False,False,False,False
taha,False,True,False,False,False
nabil,False,False,False,False,False
saad,False,False,False,False,False


In [772]:
# Indexing with the boolean DataFrame keeps the marks greater than 15 and shows NaN everywhere else
marks[marks>15]

Unnamed: 0,advanced Liner algebra,statistics,probability,Data analysis,ML
abdessamad,,,,,17.0
youssef,,,,,
taha,,17.0,,,
nabil,,,,,
saad,,,,,


In [773]:
# Apply the condition to the statistics column only: the result is a boolean Series with one value per student
marks["statistics"]>15

abdessamad    False
youssef       False
taha           True
nabil         False
saad          False
Name: statistics, dtype: bool

In [774]:
# Use that boolean Series as a row filter: only the students with a statistics mark greater than 15 are kept
marks[marks["statistics"]>15]

Unnamed: 0,advanced Liner algebra,statistics,probability,Data analysis,ML
taha,13,17,15,12,14


In [775]:
# Filter the rows and pick one column in a single loc call: statistics marks greater than 15
marks.loc[marks["statistics"]>15,"statistics"]

taha    17
Name: statistics, dtype: int64

In [776]:
# Same row filter, this time keeping two columns (the result is a DataFrame)
marks.loc[marks["statistics"]>15,["statistics","probability"]]

Unnamed: 0,statistics,probability
taha,17,15


In [777]:
# Display the full marks table again before combining several conditions
print(marks)

            advanced Liner algebra  statistics  probability  Data analysis  ML
abdessamad                      14          12           15             15  17
youssef                         13          14           15             15  15
taha                            13          17           15             12  14
nabil                           12          13           15             12  12
saad                            13          12           14             13  14


In [778]:
# Select the students who do well in both statistics and probability (both marks greater than 13);
# & combines the two conditions element-wise, and each condition is wrapped in its own parentheses
marks[(marks["statistics"]>13) & (marks["probability"]>13)]

Unnamed: 0,advanced Liner algebra,statistics,probability,Data analysis,ML
youssef,13,14,15,15,15
taha,13,17,15,12,14


In [779]:
# Select the students who are brilliant in statistics or in probability (a mark greater than 15 in at least
# one of the two); | is the element-wise "or"
marks[(marks["statistics"]>15) | (marks["probability"]>15)]

Unnamed: 0,advanced Liner algebra,statistics,probability,Data analysis,ML
taha,13,17,15,12,14


In [780]:
# Turn the index into a regular column: reset_index returns a new DataFrame in which the old index
# becomes a column named "index"
students_marks = marks.reset_index()
students_marks

Unnamed: 0,index,advanced Liner algebra,statistics,probability,Data analysis,ML
0,abdessamad,14,12,15,15,17
1,youssef,13,14,15,15,15
2,taha,13,17,15,12,14
3,nabil,12,13,15,12,12
4,saad,13,12,14,13,14


In [781]:
# Rename that column; the renamed table is only displayed here, students_marks itself is not modified
students_marks.rename(columns={"index":"students"})

Unnamed: 0,students,advanced Liner algebra,statistics,probability,Data analysis,ML
0,abdessamad,14,12,15,15,17
1,youssef,13,14,15,15,15
2,taha,13,17,15,12,14
3,nabil,12,13,15,12,12
4,saad,13,12,14,13,14


In [782]:
# Index labels can be renamed too (again, students_marks itself is not modified)
students_marks.rename(index={0:"A",1:"B",2:"C",3:"D",4:"E"})

Unnamed: 0,index,advanced Liner algebra,statistics,probability,Data analysis,ML
A,abdessamad,14,12,15,15,17
B,youssef,13,14,15,15,15
C,taha,13,17,15,12,14
D,nabil,12,13,15,12,12
E,saad,13,12,14,13,14


In [783]:
# Rename both the column and the index labels in a single call; inplace=True keeps the change in students_marks
students_marks.rename(columns={"index":"students"},index={0:"A",1:"B",2:"C",3:"D",4:"E"},inplace=True)
students_marks

Unnamed: 0,students,advanced Liner algebra,statistics,probability,Data analysis,ML
A,abdessamad,14,12,15,15,17
B,youssef,13,14,15,15,15
C,taha,13,17,15,12,14
D,nabil,12,13,15,12,12
E,saad,13,12,14,13,14


In [784]:
# Turn a column into the index; the column best suited for indexing this table is students
students_marks.set_index("students",inplace=True)
students_marks

Unnamed: 0_level_0,advanced Liner algebra,statistics,probability,Data analysis,ML
students,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
abdessamad,14,12,15,15,17
youssef,13,14,15,15,15
taha,13,17,15,12,14
nabil,12,13,15,12,12
saad,13,12,14,13,14


In [785]:
#Multi-indexing

In [786]:
# Build a two-level index: the outer level is the team, the inner level is the student
first_layer = ["team_A","team_A","team_B","team_B","team_C","team_C"]
second_layer = ["Abdessamad","Noah","Benjamin","Logan","Ava","Adam"]
# pair every team with its student, then turn the (team, student) tuples into a MultiIndex
team_student_pairs = [(team, student_name) for team, student_name in zip(first_layer,second_layer)]
index = pd.MultiIndex.from_tuples(team_student_pairs)

In [787]:
# Inspect the MultiIndex: reindex returns a pair (index, indexer), and the output below shows
# None as the indexer because the target is the index itself
index.reindex(index)

(MultiIndex([('team_A', 'Abdessamad'),
             ('team_A',       'Noah'),
             ('team_B',   'Benjamin'),
             ('team_B',      'Logan'),
             ('team_C',        'Ava'),
             ('team_C',       'Adam')],
            ),
 None)

In [788]:
# Build a DataFrame of random marks drawn from [10, 15) that uses the MultiIndex as its row index;
# the arguments are passed positionally: data, index, columns
project_mark = pd.DataFrame(np.random.randint(10,15,(6,2)),index,["Presentation","Report"])
project_mark

Unnamed: 0,Unnamed: 1,Presentation,Report
team_A,Abdessamad,13,10
team_A,Noah,12,10
team_B,Benjamin,13,14
team_B,Logan,14,11
team_C,Ava,14,10
team_C,Adam,13,14


In [789]:
# Retrieve all the rows of team_A by selecting on the outer level of the index
project_mark.loc["team_A"]

Unnamed: 0,Presentation,Report
Abdessamad,13,10
Noah,12,10


In [790]:
# Retrieve one member of team_A by chaining a second loc on the inner level
project_mark.loc["team_A"].loc["Abdessamad"]

Presentation    13
Report          10
Name: Abdessamad, dtype: int64

In [791]:
# Retrieve the mark of the student Abdessamad for the Presentation part
project_mark.loc["team_A"].loc["Abdessamad"]["Presentation"]

13

In [792]:
# Give a name to each level of the index
project_mark.index.names = ["Team","Student"]
project_mark

Unnamed: 0_level_0,Unnamed: 1_level_0,Presentation,Report
Team,Student,Unnamed: 2_level_1,Unnamed: 3_level_1
team_A,Abdessamad,13,10
team_A,Noah,12,10
team_B,Benjamin,13,14
team_B,Logan,14,11
team_C,Ava,14,10
team_C,Adam,13,14


In [793]:
# xs (cross-section) selects the rows whose "Team" level equals team_A, using the level name
project_mark.xs("team_A",level="Team")

Unnamed: 0_level_0,Presentation,Report
Student,Unnamed: 1_level_1,Unnamed: 2_level_1
Abdessamad,13,10
Noah,12,10


In [794]:
# The same works on the inner level: select the rows whose "Student" level equals Abdessamad
project_mark.xs("Abdessamad",level="Student")

Unnamed: 0_level_0,Presentation,Report
Team,Unnamed: 1_level_1,Unnamed: 2_level_1
team_A,13,10


In [795]:
# Take the cross-section for Abdessamad once, then print each of his two marks
abdessamad_marks = project_mark.xs("Abdessamad",level="Student")
print(abdessamad_marks["Presentation"],abdessamad_marks["Report"])


Team
team_A    13
Name: Presentation, dtype: int64 Team
team_A    10
Name: Report, dtype: int64


In [796]:
#missing data

In [797]:
# Raw marks for four students. Some rows start with NaN and some rows hold fewer than 5 values,
# so the table built from this list contains missing entries.
marks_arr = [np.random.randint(10,18,5),
             np.append(np.nan,np.random.randint(10,18,4)),
             np.random.randint(10,18,4),             
             np.append(np.nan,np.random.randint(10,18,3))
            ]

In [798]:
# Build the DataFrame from the list of rows; as the output shows, rows with fewer than 5 values end with NaN
homework = pd.DataFrame(marks_arr)
homework

Unnamed: 0,0,1,2,3,4
0,16.0,11.0,12.0,17.0,11.0
1,,16.0,16.0,15.0,13.0
2,15.0,11.0,12.0,13.0,
3,,13.0,14.0,15.0,


In [799]:
# Label the five columns assignment1 ... assignment5
homework.columns = ["assignment"+str(number) for number in range(1,6)]
# Label the rows with the student names and name the index "Student" in one step
homework.index = pd.Index(["Abdessamad","Noah","Benjamin","Logan"],name="Student")

In [800]:
# This table shows the marks of the homework assignments that students had to work on during the semester.
# Students are only obliged to work on three assignments during the semester;
# a NaN value means that the student chose not to work on that assignment.
homework

Unnamed: 0_level_0,assignment1,assignment2,assignment3,assignment4,assignment5
Student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Abdessamad,16.0,11.0,12.0,17.0,11.0
Noah,,16.0,16.0,15.0,13.0
Benjamin,15.0,11.0,12.0,13.0,
Logan,,13.0,14.0,15.0,


In [801]:
# Number of students who did not work on assignment1: every missing mark counts as 1 in the sum
print(homework["assignment1"].isna().sum())

2


In [802]:
# Number of students who worked on assignment4: every non-missing mark counts as 1 in the sum
print(homework["assignment4"].notna().sum())

4


In [803]:
# Drop every row that has a NaN value (axis=0): only one student went the extra mile
# and chose to work on all the assignments
homework.dropna(axis=0)

Unnamed: 0_level_0,assignment1,assignment2,assignment3,assignment4,assignment5
Student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Abdessamad,16.0,11.0,12.0,17.0,11.0


In [804]:
# Drop every column that has a NaN value (axis=1): all students chose to work on assignments 2, 3 and 4
homework.dropna(axis=1)

Unnamed: 0_level_0,assignment2,assignment3,assignment4
Student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Abdessamad,11.0,12.0,17.0
Noah,16.0,16.0,15.0
Benjamin,11.0,12.0,13.0
Logan,13.0,14.0,15.0


In [805]:
# Add new students to our table; NaN stands for an assignment the student did not work on
n = [np.nan,np.nan,np.nan]
# three students who skipped the first three assignments
for newcomer in ["Adam","Anya","Zita"]:
    homework.loc[newcomer] = np.append(n,np.random.randint(10,16,2))
# two students who skipped only the last assignment
for newcomer in ["Mehdi","Mike"]:
    homework.loc[newcomer] = np.append(np.random.randint(10,16,4),np.nan)
# two students who skipped only the first assignment
for newcomer in ["David","jonathan"]:
    homework.loc[newcomer] = np.append(np.nan,np.random.randint(10,16,4))
homework

Unnamed: 0_level_0,assignment1,assignment2,assignment3,assignment4,assignment5
Student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Abdessamad,16.0,11.0,12.0,17.0,11.0
Noah,,16.0,16.0,15.0,13.0
Benjamin,15.0,11.0,12.0,13.0,
Logan,,13.0,14.0,15.0,
Adam,,,,12.0,15.0
Anya,,,,15.0,11.0
Zita,,,,11.0,11.0
Mehdi,11.0,15.0,11.0,10.0,
Mike,14.0,14.0,14.0,13.0,
David,,14.0,11.0,13.0,13.0


In [806]:
# During a semester students are obliged to work on at least three assignments, otherwise they fail the subject.
# Looking at our table, some students worked on only two assignments, so they failed the subject.
# Let's drop all the students who failed the subject from our table.
# thresh=3 means that a row needs at least 3 non-NaN values (assignments) in order not to be dropped.
homework.dropna(axis=0,thresh=3,inplace=True)
homework

Unnamed: 0_level_0,assignment1,assignment2,assignment3,assignment4,assignment5
Student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Abdessamad,16.0,11.0,12.0,17.0,11.0
Noah,,16.0,16.0,15.0,13.0
Benjamin,15.0,11.0,12.0,13.0,
Logan,,13.0,14.0,15.0,
Mehdi,11.0,15.0,11.0,10.0,
Mike,14.0,14.0,14.0,13.0,
David,,14.0,11.0,13.0,13.0
jonathan,,15.0,11.0,13.0,10.0


In [807]:
# The professors thought that some assignments may be too hard; they decided to get rid of those assignments
# so that next year's students will not have to struggle with them.
# thresh=5 with axis=1 asks for at least 5 non-NaN marks per column; the result is only displayed,
# homework itself is left unchanged.
homework.dropna(axis=1,thresh=5)

Unnamed: 0_level_0,assignment2,assignment3,assignment4
Student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Abdessamad,11.0,12.0,17.0
Noah,16.0,16.0,15.0
Benjamin,11.0,12.0,13.0
Logan,13.0,14.0,15.0
Mehdi,15.0,11.0,10.0
Mike,14.0,14.0,13.0
David,14.0,11.0,13.0
jonathan,15.0,11.0,13.0


In [808]:
# Back to our full table, which still contains every assignment column
homework

Unnamed: 0_level_0,assignment1,assignment2,assignment3,assignment4,assignment5
Student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Abdessamad,16.0,11.0,12.0,17.0,11.0
Noah,,16.0,16.0,15.0,13.0
Benjamin,15.0,11.0,12.0,13.0,
Logan,,13.0,14.0,15.0,
Mehdi,11.0,15.0,11.0,10.0,
Mike,14.0,14.0,14.0,13.0,
David,,14.0,11.0,13.0,13.0
jonathan,,15.0,11.0,13.0,10.0


In [809]:
# Now we prepare the computation of the final mark of each student.
# Our DataFrame has missing values for the assignments that students chose not to work on;
# one option is to give those assignments the mark 10.
# fillna returns a new DataFrame, so the filled table is stored under its own name.
homework_filled_with_10 = homework.fillna(value=10)
# homework itself is left untouched on purpose: the next cells still need its NaN values
homework

Unnamed: 0_level_0,assignment1,assignment2,assignment3,assignment4,assignment5
Student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Abdessamad,16.0,11.0,12.0,17.0,11.0
Noah,,16.0,16.0,15.0,13.0
Benjamin,15.0,11.0,12.0,13.0,
Logan,,13.0,14.0,15.0,
Mehdi,11.0,15.0,11.0,10.0,
Mike,14.0,14.0,14.0,13.0,
David,,14.0,11.0,13.0,13.0
jonathan,,15.0,11.0,13.0,10.0


In [810]:
# Keep an independent copy of the table before the missing values are filled in below
homework_copy = homework.copy()

In [811]:
# Fill in the missing values of each student with the mean of the marks they got on the other assignments.
for student in homework.index:
    student_marks = homework.loc[student]
    # Series.mean ignores NaN, so only the assignments that were handed in count.
    # The filled row is written back with .loc: calling fillna(inplace=True) on homework.loc[student]
    # would only change a temporary row object and is not guaranteed to update homework.
    homework.loc[student] = student_marks.fillna(value=student_marks.mean())
homework

Unnamed: 0_level_0,assignment1,assignment2,assignment3,assignment4,assignment5
Student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Abdessamad,16.0,11.0,12.0,17.0,11.0
Noah,15.0,16.0,16.0,15.0,13.0
Benjamin,15.0,11.0,12.0,13.0,12.75
Logan,14.0,13.0,14.0,15.0,14.0
Mehdi,11.0,15.0,11.0,10.0,11.75
Mike,14.0,14.0,14.0,13.0,13.75
David,12.75,14.0,11.0,13.0,13.0
jonathan,12.25,15.0,11.0,13.0,10.0


In [812]:
# Fill in the missing values with the mean of the marks each student got on the other assignments.
# This code is equivalent to the loop we used, written out student by student.
# Each filled row is assigned back with .loc, because fillna(inplace=True) on homework_copy.loc[...]
# would only change a temporary row object and is not guaranteed to update homework_copy.
homework_copy.loc["Noah"] = homework_copy.loc["Noah"].fillna(value=homework_copy.loc["Noah"].mean())
homework_copy.loc["Benjamin"] = homework_copy.loc["Benjamin"].fillna(value=homework_copy.loc["Benjamin"].mean())
homework_copy.loc["Logan"] = homework_copy.loc["Logan"].fillna(value=homework_copy.loc["Logan"].mean())
homework_copy.loc["Mehdi"] = homework_copy.loc["Mehdi"].fillna(value=homework_copy.loc["Mehdi"].mean())
homework_copy.loc["David"] = homework_copy.loc["David"].fillna(value=homework_copy.loc["David"].mean())
homework_copy.loc["Mike"] = homework_copy.loc["Mike"].fillna(value=homework_copy.loc["Mike"].mean())
homework_copy.loc["jonathan"] = homework_copy.loc["jonathan"].fillna(value=homework_copy.loc["jonathan"].mean())

In [813]:
# Compare the two tables element by element: True everywhere means both approaches gave the same result
print(homework_copy == homework)

            assignment1  assignment2  assignment3  assignment4  assignment5
Student                                                                    
Abdessamad         True         True         True         True         True
Noah               True         True         True         True         True
Benjamin           True         True         True         True         True
Logan              True         True         True         True         True
Mehdi              True         True         True         True         True
Mike               True         True         True         True         True
David              True         True         True         True         True
jonathan           True         True         True         True         True


In [814]:
#Aggregation

In [815]:
# TOEIC scores of eight students: the four schools are listed in ascending order, then again in
# descending order, so that each school appears twice; the scores are random integers in [700, 990)
school_names = ["School_"+str(i) for i in range(1,5)]
TOEIC_Score = pd.DataFrame({
    "School": np.append(school_names,school_names[::-1]),
    "major": ["english","engineering","business","english","english","engineering","business","english"],
    "Student": ["Logan","David","jonathan","Noah","abdessamad","nabil","driss","youssef"],
    "Score": np.random.randint(700,990,8),
})
TOEIC_Score

Unnamed: 0,School,major,Student,Score
0,School_1,english,Logan,820
1,School_2,engineering,David,898
2,School_3,business,jonathan,756
3,School_4,english,Noah,878
4,School_4,english,abdessamad,729
5,School_3,engineering,nabil,870
6,School_2,business,driss,934
7,School_1,english,youssef,890


In [816]:
# Group the rows by the School column; two students from each school took the TOEIC test
TOEIC_Score_by_School = TOEIC_Score.groupby("School")

In [817]:
# Mean TOEIC score of each school. The columns major and Student are left out because they are not
# numerical; numeric_only=True requests this explicitly, since recent pandas versions raise an error
# instead of silently ignoring non-numeric columns.
TOEIC_Score_by_School.mean(numeric_only=True)

Unnamed: 0_level_0,Score
School,Unnamed: 1_level_1
School_1,855.0
School_2,916.0
School_3,813.0
School_4,803.5


In [818]:
# Total score of each school; numeric_only=True keeps only the numerical Score column
# (otherwise recent pandas versions also "sum" the text columns by concatenating the strings)
TOEIC_Score_by_School.sum(numeric_only=True)

Unnamed: 0_level_0,Score
School,Unnamed: 1_level_1
School_1,1710
School_2,1832
School_3,1626
School_4,1607


In [819]:
# Total score of one school: sum the numerical column per school, then pick the row of School_2 by label
TOEIC_Score_by_School.sum(numeric_only=True).loc["School_2"]

Score    1832
Name: School_2, dtype: int64

In [820]:
# count reports, for each school, how many values each column holds (two students per school here)
TOEIC_Score_by_School.count()

Unnamed: 0_level_0,major,Student,Score
School,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
School_1,2,2,2
School_2,2,2,2
School_3,2,2,2
School_4,2,2,2


In [821]:
# Standard deviation of the scores of each school. The numerical Score column is selected explicitly,
# because a standard deviation cannot be computed on the text columns major and Student.
TOEIC_Score_by_School[["Score"]].std()

Unnamed: 0_level_0,Score
School,Unnamed: 1_level_1
School_1,49.497475
School_2,25.455844
School_3,80.610173
School_4,105.35891


In [822]:
# max is computed column by column, so one displayed row can mix values from different students:
# in the output below, School_2 shows the major of David, the name driss and the score of driss
TOEIC_Score_by_School.max()

Unnamed: 0_level_0,major,Student,Score
School,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
School_1,english,youssef,890
School_2,engineering,driss,934
School_3,engineering,nabil,870
School_4,english,abdessamad,878


In [823]:
# min is also computed column by column, so a displayed row is not necessarily the row of one student
TOEIC_Score_by_School.min()

Unnamed: 0_level_0,major,Student,Score
School,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
School_1,english,Logan,820
School_2,business,David,898
School_3,business,jonathan,756
School_4,english,Noah,729


In [824]:
# Highest score of each school: selecting the Score column first gives a Series
TOEIC_Score_by_School["Score"].max()

School
School_1    890
School_2    934
School_3    870
School_4    878
Name: Score, dtype: int64

In [825]:
# Lowest score of each school
TOEIC_Score_by_School["Score"].min()

School
School_1    820
School_2    898
School_3    756
School_4    729
Name: Score, dtype: int64

In [826]:
###

In [827]:
# Four schools with the same subjects but different, non-overlapping row labels (0-3, 4-7, 8-11, 12-14);
# the marks are random integers in [10, 20) and the last school has only three students
school_subjects = ["maths","physics","physchology","chemistry"]
School_1 = pd.DataFrame(data=np.random.randint(10,20,(4,4)),index=list(range(0,4)),columns=school_subjects)
School_2 = pd.DataFrame(data=np.random.randint(10,20,(4,4)),index=list(range(4,8)),columns=school_subjects)
School_3 = pd.DataFrame(data=np.random.randint(10,20,(4,4)),index=list(range(8,12)),columns=school_subjects)
School_4 = pd.DataFrame(data=np.random.randint(10,20,(3,4)),index=list(range(12,15)),columns=school_subjects)
print(str(School_1)+"\n",str(School_2)+"\n",str(School_3)+"\n",School_4)

   maths  physics  physchology  chemistry
0     16       19           18         12
1     12       18           19         11
2     19       11           15         15
3     10       17           11         19
    maths  physics  physchology  chemistry
4     17       17           17         18
5     11       19           19         12
6     19       10           18         15
7     13       18           16         19
     maths  physics  physchology  chemistry
8      11       12           13         13
9      19       11           14         14
10     14       15           14         16
11     18       15           11         12
     maths  physics  physchology  chemistry
12     15       10           12         18
13     14       13           12         11
14     18       16           11         15


In [828]:
#Concatenating dataframes is nothing other than putting dataframes together, i.e. combining dataframes that have the same dimensions along the axis we will concatenate on

In [829]:
# Concatenate row-wise (axis=0): the four DataFrames are stacked one below the other
pd.concat([School_1,School_2,School_3,School_4],axis=0)

Unnamed: 0,maths,physics,physchology,chemistry
0,16,19,18,12
1,12,18,19,11
2,19,11,15,15
3,10,17,11,19
4,17,17,17,18
5,11,19,19,12
6,19,10,18,15
7,13,18,16,19
8,11,12,13,13
9,19,11,14,14


In [830]:
# Concatenate column-wise (axis=1): the DataFrames are placed side by side and the rows are matched on their index labels
pd.concat([School_1,School_2,School_3,School_4],axis=1)

Unnamed: 0,maths,physics,physchology,chemistry,maths.1,physics.1,physchology.1,chemistry.1,maths.2,physics.2,physchology.2,chemistry.2,maths.3,physics.3,physchology.3,chemistry.3
0,16.0,19.0,18.0,12.0,,,,,,,,,,,,
1,12.0,18.0,19.0,11.0,,,,,,,,,,,,
2,19.0,11.0,15.0,15.0,,,,,,,,,,,,
3,10.0,17.0,11.0,19.0,,,,,,,,,,,,
4,,,,,17.0,17.0,17.0,18.0,,,,,,,,
5,,,,,11.0,19.0,19.0,12.0,,,,,,,,
6,,,,,19.0,10.0,18.0,15.0,,,,,,,,
7,,,,,13.0,18.0,16.0,19.0,,,,,,,,
8,,,,,,,,,11.0,12.0,13.0,13.0,,,,
9,,,,,,,,,19.0,11.0,14.0,14.0,,,,


We observe a lot of null values in the resulting DataFrame. To understand why, take the first DataFrame as an example: it has values for the index labels 0, 1, 2 and 3, but it has no values for the index labels 4 to 14, and that is why we get null values there.

In [831]:
# Marks of two departments: mathematics covers Student1..Student3, computer science covers Student1..Student4
maths_students = ["Student"+str(i) for i in range(1,4)]
cs_students = ["Student"+str(i) for i in range(1,5)]
mathematics = pd.DataFrame(
    data=np.random.randint(10,20,(3,3)),
    index=maths_students,
    columns=["Linear_Algebra","Probability","Statistics"],
)
computer_science = pd.DataFrame(
    data=np.random.randint(10,20,(4,3)),
    index=cs_students,
    columns=["Data_Structures","Java","WEB_DEV"],
)
print(str(mathematics)+"\n",computer_science)

          Linear_Algebra  Probability  Statistics
Student1              16           12          18
Student2              18           15          15
Student3              12           15          13
           Data_Structures  Java  WEB_DEV
Student1               13    19       12
Student2               18    13       11
Student3               12    14       15
Student4               14    18       13


In [832]:
# When we concatenate column-wise, each student gets the whole picture of their marks, except Student4,
# who has no row in the mathematics DataFrame and therefore gets NaN in the mathematics columns
pd.concat([mathematics,computer_science],axis=1)

Unnamed: 0,Linear_Algebra,Probability,Statistics,Data_Structures,Java,WEB_DEV
Student1,16.0,12.0,18.0,13,19,12
Student2,18.0,15.0,15.0,18,13,11
Student3,12.0,15.0,13.0,12,14,15
Student4,,,,14,18,13


In [833]:
#Merging

In [834]:
# Turn the student labels (currently the index) into a regular "ID" column,
# which is the key pd.merge will join on.
# The frame is reassigned instead of mutated with inplace=True, and the guard
# keeps the cell safe to re-run: resetting the index a second time would insert
# another column that then also gets renamed to "ID".
if "ID" not in mathematics.columns:
    mathematics = mathematics.reset_index().rename(columns={"index": "ID"})
mathematics

Unnamed: 0,ID,Linear_Algebra,Probability,Statistics
0,Student1,16,12,18
1,Student2,18,15,15
2,Student3,12,15,13


In [835]:
# Turn the student labels (currently the index) into a regular "ID" column,
# which is the key pd.merge will join on.
# The frame is reassigned instead of mutated with inplace=True, and the guard
# keeps the cell safe to re-run: resetting the index a second time would insert
# another column that then also gets renamed to "ID".
if "ID" not in computer_science.columns:
    computer_science = computer_science.reset_index().rename(columns={"index": "ID"})
computer_science

Unnamed: 0,ID,Data_Structures,Java,WEB_DEV
0,Student1,13,19,12
1,Student2,18,13,11
2,Student3,12,14,15
3,Student4,14,18,13


In [836]:
#we perform a database-style join on the key ID; pd.merge defaults to an inner join, which only returns records that have matching values in both tables, so Student4 is dropped because he is not present in the mathematics dataframe
pd.merge(computer_science,mathematics,on="ID")


Unnamed: 0,ID,Data_Structures,Java,WEB_DEV,Linear_Algebra,Probability,Statistics
0,Student1,13,19,12,16,12,18
1,Student2,18,13,11,18,15,15
2,Student3,12,14,15,12,15,13


In [837]:
#to get Student4 we must perform a left join (computer_science is the left table) or a full outer join
pd.merge(computer_science,mathematics,how="left",on="ID")

Unnamed: 0,ID,Data_Structures,Java,WEB_DEV,Linear_Algebra,Probability,Statistics
0,Student1,13,19,12,16.0,12.0,18.0
1,Student2,18,13,11,18.0,15.0,15.0
2,Student3,12,14,15,12.0,15.0,13.0
3,Student4,14,18,13,,,


In [838]:
#joining works the same way as merging, but this time the key is the index instead of a column; let's prepare our dataframes

In [839]:
# Move the "ID" column into the index so the frame can be joined on it.
# The frame is reassigned instead of mutated with inplace=True, and the guard
# keeps the cell safe to re-run: once "ID" is the index it is no longer a
# column, so a second set_index("ID") would raise a KeyError.
if "ID" in computer_science.columns:
    computer_science = computer_science.set_index("ID")
computer_science

Unnamed: 0_level_0,Data_Structures,Java,WEB_DEV
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Student1,13,19,12
Student2,18,13,11
Student3,12,14,15
Student4,14,18,13


In [840]:
# Move the "ID" column into the index so the frame can be joined on it.
# The frame is reassigned instead of mutated with inplace=True, and the guard
# keeps the cell safe to re-run: once "ID" is the index it is no longer a
# column, so a second set_index("ID") would raise a KeyError.
if "ID" in mathematics.columns:
    mathematics = mathematics.set_index("ID")
mathematics

Unnamed: 0_level_0,Linear_Algebra,Probability,Statistics
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Student1,16,12,18
Student2,18,15,15
Student3,12,15,13


In [841]:
#DataFrame.join defaults to a left join on the index: only the rows of the calling dataframe (mathematics) are kept, so Student4, who is missing from mathematics, does not appear in the result
mathematics.join(computer_science)

Unnamed: 0_level_0,Linear_Algebra,Probability,Statistics,Data_Structures,Java,WEB_DEV
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Student1,16,12,18,13,19,12
Student2,18,15,15,18,13,11
Student3,12,15,13,12,14,15


In [842]:
#we perform a right join on the index: every row of computer_science is kept, so Student4 is not ignored this time (his mathematics marks are NaN)
mathematics.join(computer_science,how="right")

Unnamed: 0_level_0,Linear_Algebra,Probability,Statistics,Data_Structures,Java,WEB_DEV
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Student1,16.0,12.0,18.0,13,19,12
Student2,18.0,15.0,15.0,18,13,11
Student3,12.0,15.0,13.0,12,14,15
Student4,,,,14,18,13


In [843]:
###

In [844]:
# 200 students, four subjects, random integer marks in [5, 20).
marks = pd.DataFrame(
    np.random.randint(5, 20, size=(200, 4)),
    index=[f"student{n}" for n in range(1, 201)],
    columns=["maths_mark", "physics_mark", "chemistry_mark", "french_mark"],
)
marks

Unnamed: 0,maths_mark,physics_mark,chemistry_mark,french_mark
student1,13,17,17,16
student2,6,16,5,7
student3,18,15,7,8
student4,7,17,6,13
student5,12,19,11,9
...,...,...,...,...
student196,5,6,17,5
student197,17,17,18,17
student198,12,17,12,5
student199,5,10,10,15


In [845]:
#to get the unique values of a column in pandas
marks["maths_mark"].unique()

array([13,  6, 18,  7, 12,  8,  9, 15, 17, 16, 10, 19, 14,  5, 11])

In [846]:
#to get the number of unique values of a column in pandas
marks["maths_mark"].nunique()

15

In [847]:
#to see how many times each unique value occurs, i.e. its frequency
marks["maths_mark"].value_counts()

5     18
14    17
8     16
13    15
15    14
9     14
6     14
17    13
12    13
16    12
11    12
10    12
19    10
18    10
7     10
Name: maths_mark, dtype: int64

In [848]:
#let's build a function that adds 1 point if a student has a mark above 15, or 2 points otherwise
def func_bonus(d):
    """Return the mark ``d`` with its bonus applied.

    A mark of 15 or less gets 2 extra points; a mark above 15 gets 1.
    """
    bonus = 2 if d <= 15 else 1
    return d + bonus

In [849]:
marks["maths_mark"].apply(func_bonus)

student1      15
student2       8
student3      19
student4       9
student5      14
              ..
student196     7
student197    18
student198    14
student199     7
student200     8
Name: maths_mark, Length: 200, dtype: int64

In [850]:
#we convert the function to a lambda function to reduce the lines of code
marks["maths_mark"].apply(lambda d: d+2 if d<=15 else d+1)

student1      15
student2       8
student3      19
student4       9
student5      14
              ..
student196     7
student197    18
student198    14
student199     7
student200     8
Name: maths_mark, Length: 200, dtype: int64

In [851]:
marks

Unnamed: 0,maths_mark,physics_mark,chemistry_mark,french_mark
student1,13,17,17,16
student2,6,16,5,7
student3,18,15,7,8
student4,7,17,6,13
student5,12,19,11,9
...,...,...,...,...
student196,5,6,17,5
student197,17,17,18,17
student198,12,17,12,5
student199,5,10,10,15


In [852]:
marks.index

Index(['student1', 'student2', 'student3', 'student4', 'student5', 'student6',
       'student7', 'student8', 'student9', 'student10',
       ...
       'student191', 'student192', 'student193', 'student194', 'student195',
       'student196', 'student197', 'student198', 'student199', 'student200'],
      dtype='object', length=200)

In [853]:
marks.columns

Index(['maths_mark', 'physics_mark', 'chemistry_mark', 'french_mark'], dtype='object')

In [854]:
#let's sort the students by their marks in mathematics
marks.sort_values(by="maths_mark")

Unnamed: 0,maths_mark,physics_mark,chemistry_mark,french_mark
student90,5,17,17,7
student199,5,10,10,15
student132,5,16,12,18
student121,5,7,16,17
student35,5,10,6,19
...,...,...,...,...
student28,19,15,7,11
student107,19,7,10,7
student166,19,16,17,15
student19,19,11,11,14
