# Pandas

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("biostats.csv")

In [3]:
df.head()

Unnamed: 0,Name,"""Sex""","""Age""","""Height (in)""","""Weight (lbs)"""
0,Alex,"""M""",41,74,170
1,Bert,"""M""",42,68,166
2,Carl,"""M""",32,70,155
3,Dave,"""M""",39,72,167
4,Elly,"""F""",30,66,124


### Why Use Pandas?

In [5]:
results = {
    "Courses":["CMT 408","CMT 403","CMT 432"],
    "Marks":[67,70,75],
    "Grade":["B","A","A"]
}

resultsDF = pd.DataFrame(results)

print(resultsDF)

   Courses  Marks Grade
0  CMT 408     67     B
1  CMT 403     70     A
2  CMT 432     75     A


### Pandas Series

In [8]:
a = [1, 7, 2]
myvar = pd.Series(a)
print(myvar)

0    1
1    7
2    2
dtype: int64


In [9]:
a = [1, 7, 2]
myvar = pd.Series(a, index = ["x", "y", "z"])
print(myvar)

x    1
y    7
z    2
dtype: int64


### Pandas DataFrames

In [10]:
data = {
  "calories": [420, 380, 390],
  "duration": [50, 40, 45]
}

#load data into a DataFrame object:
df = pd.DataFrame(data)

print(df) 

   calories  duration
0       420        50
1       380        40
2       390        45


In [11]:
print(df.loc[0])

calories    420
duration     50
Name: 0, dtype: int64


In [12]:
# Named Indexes


data = {
  "calories": [420, 380, 390],
  "duration": [50, 40, 45]
}

df = pd.DataFrame(data, index = ["day1", "day2", "day3"])

print(df) 

      calories  duration
day1       420        50
day2       380        40
day3       390        45


## Pandas Read JSON

In [15]:
js = pd.read_json('example.json')

In [17]:
print(js.to_string())

                                                                                                              users
0     {'id': 1, 'name': 'John Doe', 'email': 'johndoe@example.com', 'isActive': True, 'roles': ['admin', 'editor']}
1         {'id': 2, 'name': 'Jane Smith', 'email': 'janesmith@example.com', 'isActive': False, 'roles': ['viewer']}
2  {'id': 3, 'name': 'Sam Brown', 'email': 'sambrown@example.com', 'isActive': True, 'roles': ['editor', 'viewer']}


### Pandas - Analyzing DataFrames

In [18]:
bioDF = pd.read_csv("biostats.csv")

In [19]:
print(bioDF.to_string())

    Name       "Sex"   "Age"   "Height (in)"   "Weight (lbs)"
0   Alex         "M"      41              74              170
1   Bert         "M"      42              68              166
2   Carl         "M"      32              70              155
3   Dave         "M"      39              72              167
4   Elly         "F"      30              66              124
5   Fran         "F"      33              66              115
6   Gwen         "F"      26              64              121
7   Hank         "M"      30              71              158
8   Ivan         "M"      53              72              175
9   Jake         "M"      32              69              143
10  Kate         "F"      47              69              139
11  Luke         "M"      34              72              163
12  Myra         "F"      23              62               98
13  Neil         "M"      36              75              160
14  Omar         "M"      38              70              145
15  Page

In [20]:
bioDF.head(10)

Unnamed: 0,Name,"""Sex""","""Age""","""Height (in)""","""Weight (lbs)"""
0,Alex,"""M""",41,74,170
1,Bert,"""M""",42,68,166
2,Carl,"""M""",32,70,155
3,Dave,"""M""",39,72,167
4,Elly,"""F""",30,66,124
5,Fran,"""F""",33,66,115
6,Gwen,"""F""",26,64,121
7,Hank,"""M""",30,71,158
8,Ivan,"""M""",53,72,175
9,Jake,"""M""",32,69,143


In [21]:
bioDF.tail(3)

Unnamed: 0,Name,"""Sex""","""Age""","""Height (in)""","""Weight (lbs)"""
15,Page,"""F""",31,67,135
16,Quin,"""M""",29,71,176
17,Ruth,"""F""",28,65,131


In [22]:
bioDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Name             18 non-null     object
 1        "Sex"       18 non-null     object
 2    "Age"           18 non-null     int64 
 3    "Height (in)"   18 non-null     int64 
 4    "Weight (lbs)"  18 non-null     int64 
dtypes: int64(3), object(2)
memory usage: 848.0+ bytes


# Data Cleaning

In [45]:
dt = pd.read_csv("data.csv")

In [46]:
dt

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
5,60,'2020/12/06',102,127,300.0
6,60,'2020/12/07',110,136,374.0
7,450,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,60,'2020/12/10',98,124,269.0


### Empty Cells

In [47]:
tdDF = dt.dropna()

In [48]:
tdDF

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
5,60,'2020/12/06',102,127,300.0
6,60,'2020/12/07',110,136,374.0
7,450,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,60,'2020/12/10',98,124,269.0


In [37]:
#dt.dropna(inplace=True)

In [38]:
#dt

In [49]:
dt["Date"] = dt["Date"].fillna('2024/12/1')

In [50]:
dt

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
5,60,'2020/12/06',102,127,300.0
6,60,'2020/12/07',110,136,374.0
7,450,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,60,'2020/12/10',98,124,269.0
