## Author - Adhiraj Saha

### Importing the pandas package

In [2]:
# Importing pandas package
import pandas as pd

In [3]:
# Some prerequisites

import numpy as np
grades = [50,50,47,97,49,3,53,42,26,74,82,62,37,15,70,27,36,35,48,52,63,64]
study_hours = [10.0,11.5,9.0, 16.0, 9.25,1.0,11.5,9.0, 8.5, 14.5, 15.5, 13.75,9.0,8.0, 15.5,8.0,9.0, 6.0, 10.0,12.0, 12.5, 12.0]

student_data = np.array([study_hours, grades])
student_data

array([[10.  , 11.5 ,  9.  , 16.  ,  9.25,  1.  , 11.5 ,  9.  ,  8.5 ,
        14.5 , 15.5 , 13.75,  9.  ,  8.  , 15.5 ,  8.  ,  9.  ,  6.  ,
        10.  , 12.  , 12.5 , 12.  ],
       [50.  , 50.  , 47.  , 97.  , 49.  ,  3.  , 53.  , 42.  , 26.  ,
        74.  , 82.  , 62.  , 37.  , 15.  , 70.  , 27.  , 36.  , 35.  ,
        48.  , 52.  , 63.  , 64.  ]])

### Using DataFrame to represent data

In [4]:
df_students = pd.DataFrame({'Name': ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic', 'Jimmie',
                                     'Rhonda', 'Giovanni', 'Francesca', 'Rajab', 'Naiyana', 'Kian', 'Jenny',
                                     'Jakeem','Helena','Ismat','Anila','Skye','Daniel','Aisha'],
                            'StudyHours':student_data[0],
                            'Grade':student_data[1]})

df_students

Unnamed: 0,Name,StudyHours,Grade
0,Dan,10.0,50.0
1,Joann,11.5,50.0
2,Pedro,9.0,47.0
3,Rosie,16.0,97.0
4,Ethan,9.25,49.0
5,Vicky,1.0,3.0
6,Frederic,11.5,53.0
7,Jimmie,9.0,42.0
8,Rhonda,8.5,26.0
9,Giovanni,14.5,74.0


#### loc[ ] method

In [5]:
# Retrive data for a specific index value
df_students.loc[5]

Name          Vicky
StudyHours      1.0
Grade           3.0
Name: 5, dtype: object

In [6]:
# Data of a range of index values
df_students.loc[0:5]

Unnamed: 0,Name,StudyHours,Grade
0,Dan,10.0,50.0
1,Joann,11.5,50.0
2,Pedro,9.0,47.0
3,Rosie,16.0,97.0
4,Ethan,9.25,49.0
5,Vicky,1.0,3.0


In [7]:
# Data of the first five rows only
df_students.iloc[0:5]

Unnamed: 0,Name,StudyHours,Grade
0,Dan,10.0,50.0
1,Joann,11.5,50.0
2,Pedro,9.0,47.0
3,Rosie,16.0,97.0
4,Ethan,9.25,49.0


In [8]:
# grade column data of 2nd index
df_students.loc[2,'Grade']

47.0

In [12]:
# Data based on a filtering expression that references named columns
print(df_students.loc[df_students["Name"]=="Aisha"])
print()

# Filtering expression can be used without
print(df_students[df_students["Name"]=="Aisha"])

     Name  StudyHours  Grade
21  Aisha        12.0   64.0

     Name  StudyHours  Grade
21  Aisha        12.0   64.0


#### query() method

In [14]:
# Same filtered result but using query method
df_students.query('Name=="Aisha"')

Unnamed: 0,Name,StudyHours,Grade
21,Aisha,12.0,64.0


#### Column name as a property of DataFrame

In [17]:
df_students[df_students.Name == "Aisha"]

Unnamed: 0,Name,StudyHours,Grade
21,Aisha,12.0,64.0


### Loading a DataFrame from a file

In [41]:
!wget https://raw.githubusercontent.com/MicrosoftDocs/mslearn-introduction-to-machine-learning/main/Data/ml-basics/grades.csv

# read_csv() method loads the data from the file to a variable
# delimeter is with what the data will be separated
# header is which row contains the column headings
df_students = pd.read_csv('grades.csv',delimiter=',',header='infer')

# Showing the data of 1st 5 rows as the data is too large
# head() method shows only the 1st 5 rows
df_students.head()

--2023-07-06 18:27:42--  https://raw.githubusercontent.com/MicrosoftDocs/mslearn-introduction-to-machine-learning/main/Data/ml-basics/grades.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 322 [text/plain]
Saving to: ‘grades.csv.6’


2023-07-06 18:27:42 (12.4 MB/s) - ‘grades.csv.6’ saved [322/322]



Unnamed: 0,Name,StudyHours,Grade
0,Dan,10.0,50.0
1,Joann,11.5,50.0
2,Pedro,9.0,47.0
3,Rosie,16.0,97.0
4,Ethan,9.25,49.0


### Handling missing Values

#### isnull() method

In [38]:
# True denotes null values and False for not null values
df_students.isnull()

Unnamed: 0,Name,StudyHours,Grade
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
5,False,False,False
6,False,False,False
7,False,False,False
8,False,False,False
9,False,False,False


In [39]:
# Sum of all the data that have null values for each column
df_students.isnull().sum()

Name          0
StudyHours    1
Grade         2
dtype: int64

In [32]:
# Filtering the DataFrame to view only the rows having null values

# axis=0 denotes the rows and axis=1 denotes columns
df_students[df_students.isnull().any(axis=1)]

Unnamed: 0,Name,StudyHours,Grade
22,Bill,8.0,
23,Ted,10.413043,


#### filna() method

In [40]:
# Replacing the null values with average of that particular column data
df_students.StudyHours = df_students.StudyHours.fillna(df_students.StudyHours.mean())
df_students

Unnamed: 0,Name,StudyHours,Grade
0,Dan,10.0,50.0
1,Joann,11.5,50.0
2,Pedro,9.0,47.0
3,Rosie,16.0,97.0
4,Ethan,9.25,49.0
5,Vicky,1.0,3.0
6,Frederic,11.5,53.0
7,Jimmie,9.0,42.0
8,Rhonda,8.5,26.0
9,Giovanni,14.5,74.0


#### dropna() method

In [42]:
# Dropping the rows containing null values
df_students = df_students.dropna(axis=0, how='any')
df_students

Unnamed: 0,Name,StudyHours,Grade
0,Dan,10.0,50.0
1,Joann,11.5,50.0
2,Pedro,9.0,47.0
3,Rosie,16.0,97.0
4,Ethan,9.25,49.0
5,Vicky,1.0,3.0
6,Frederic,11.5,53.0
7,Jimmie,9.0,42.0
8,Rhonda,8.5,26.0
9,Giovanni,14.5,74.0


### Explorer data in the DataFrame

In [55]:
# Getting the mean StudyHours using column as an index
mean_study = df_students['StudyHours'].mean()

# Getting the mean Grade using the column name as a property
mean_grade = df_students.Grade.mean()

print(f"Average weekly study hours: {mean_study:.4} \nAverage grade: {mean_grade:.4}")

Average weekly study hours: 10.52 
Average grade: 49.18


In [58]:
# Data of students who studied for equal to or more than mean hours
df_students[df_students.StudyHours > mean_study]

Unnamed: 0,Name,StudyHours,Grade
1,Joann,11.5,50.0
3,Rosie,16.0,97.0
6,Frederic,11.5,53.0
9,Giovanni,14.5,74.0
10,Francesca,15.5,82.0
11,Rajab,13.75,62.0
14,Jenny,15.5,70.0
19,Skye,12.0,52.0
20,Daniel,12.5,63.0
21,Aisha,12.0,64.0


In [61]:
# Filtered data itself is a DataFrame
# So mean grade of those students who
# studied for equal to or more than mean hours
df_students[df_students.StudyHours > mean_study].Grade.mean(0)

66.7