### Pandas Introduction

In [11]:
import pandas as pd

# Create a simple Pandas Series
mydataset = {
  'cars': ["BMW", "Volvo", "Ford"],
  'passings': [3, 7, 2]
}

myvar = pd.DataFrame(mydataset)

myvar

Unnamed: 0,cars,passings
0,BMW,3
1,Volvo,7
2,Ford,2


### Pandas Series

In [12]:

import pandas as pd

# Create a simple Pandas Series from a list
a = [1, 7, 2]
myvar = pd.Series(a)
myvar

# Create labels
print("\nCreate labels:")
myvar = pd.Series(a, index = ["x", "y", "z"])
print(myvar)

# Accessing an item
print("\nAccessing an item:")
print(myvar["y"])

# Key/Value Objects as Series
print("\nKey/Value Objects as Series")
calories = {"day1": 420, "day2": 380, "day3": 390}
myvar = pd.Series(calories)
print(myvar)

# Create a Series using only data from "day1" and "day2"
print("\nCreate a Series using only data from 'day1' and 'day2'")
myvar = pd.Series(calories, index = ["day1", "day2"])
myvar


Create labels:
x    1
y    7
z    2
dtype: int64

Accessing an item:
7

Key/Value Objects as Series
day1    420
day2    380
day3    390
dtype: int64

Create a Series using only data from 'day1' and 'day2'


Unnamed: 0,0
day1,420
day2,380


### Pandas Dataframe

In [10]:
import pandas as pd


# Create a simple Pandas DataFrame:
data = {
  "calories": [420, 380, 390],
  "duration": [50, 40, 45]
}

#load data into a DataFrame object:
df = pd.DataFrame(data)
df

#refer to the row index:
print(df.loc[0])

#use a list of indexes:
print(df.loc[[0, 1]])

import pandas as pd

data = {
  "calories": [420, 380, 390],
  "duration": [50, 40, 45]
}

df = pd.DataFrame(data, index = ["day1", "day2", "day3"])


#refer to the named index:
print(df.loc["day2"])

calories    420
duration     50
Name: 0, dtype: int64
   calories  duration
0       420        50
1       380        40
calories    380
duration     40
Name: day2, dtype: int64


### Pandas Read CSV

In [None]:
import pandas as pd

# ### Pandas Introduction
#This part is already in the provided code.


# ### Pandas Series
#This part is already in the provided code.


# ### Pandas Dataframe
#This part is already in the provided code.

# Load a CSV file into a Pandas DataFrame
df = pd.read_csv('data.csv') #Replace 'data.csv' with your file path

print(df.to_string()) #Print the entire DataFrame

#Print the first 5 rows of the DataFrame
print(df.head())

#Print the last 5 rows of the DataFrame
print(df.tail())

#Print information about the DataFrame
print(df.info())


#Read a specific number of rows
df = pd.read_csv('data.csv', nrows=5)
print(df.to_string())


#Print max rows
pd.options.display.max_rows = 9999
df = pd.read_csv('data.csv')
df

### Pandas Read JSON

In [None]:
# prompt: copy all examples code from https://www.w3schools.com/python/pandas/pandas_json.asp

import pandas as pd

# Load a JSON file into a Pandas DataFrame
df = pd.read_json('data.json')

print(df.to_string())


# Dictionary as JSON
data = {
  "Duration":{
    "0":60,
    "1":60,
    "2":60,
    "3":45,
    "4":45,
    "5":60
  },
  "Pulse":{
    "0":110,
    "1":117,
    "2":103,
    "3":109,
    "4":117,
    "5":102
  },
  "Maxpulse":{
    "0":130,
    "1":145,
    "2":135,
    "3":175,
    "4":148,
    "5":127
  },
  "Calories":{
    "0":409,
    "1":479,
    "2":340,
    "3":282,
    "4":406,
    "5":300
  }
}

df = pd.DataFrame(data)
df

### Pandas - Analyzing DataFrames

In [None]:
# printing the first 10 rows of the DataFrame:

import pandas as pd
df = pd.read_csv('data.csv')
print(df.head(10))

# Print the first 5 rows of the DataFrame:
df = pd.read_csv('data.csv')
print(df.head())

# Print the last 5 rows of the DataFrame:
print(df.tail())

# Print information about the data:
print(df.info())

### Pandas - Cleaning Empty Cells

In [None]:
# Return a new Data Frame with no empty cells:

import pandas as pd
df = pd.read_csv('data.csv')
new_df = df.dropna()
print(new_df.to_string())


# Remove all rows with NULL values:
df = pd.read_csv('data.csv')
df.dropna(inplace = True)
print(df.to_string())

# Replace NULL values with the number 130:
df = pd.read_csv('data.csv')
df.fillna(130, inplace = True)

# Replace NULL values in the "Calories" columns with the number 130:
df = pd.read_csv('data.csv')
df["Calories"].fillna(130, inplace = True)

# Calculate the MEAN, and replace any empty values with it:
df = pd.read_csv('data.csv')
x = df["Calories"].mean()
df["Calories"].fillna(x, inplace = True)

# Calculate the MEDIAN, and replace any empty values with it:
df = pd.read_csv('data.csv')
x = df["Calories"].median()
df["Calories"].fillna(x, inplace = True)

# Calculate the MODE, and replace any empty values with it:
df = pd.read_csv('data.csv')
x = df["Calories"].mode()[0]
df["Calories"].fillna(x, inplace = True)

### Pandas - Cleaning Data of Wrong Format

In [None]:
# Convert to date:
import pandas as pd
df = pd.read_csv('data.csv')
df['Date'] = pd.to_datetime(df['Date'])
print(df.to_string())

# Remove rows with a NULL value in the "Date" column:
df.dropna(subset=['Date'], inplace = True)

### Pandas - Fixing Wrong Data

In [None]:
# Set "Duration" = 45 in row 7:
df.loc[7, 'Duration'] = 45

# Loop through all values in the "Duration" column.
# If the value is higher than 120, set it to 120:

for x in df.index:
  if df.loc[x, "Duration"] > 120:
    df.loc[x, "Duration"] = 120

# Delete rows where "Duration" is higher than 120:

for x in df.index:
  if df.loc[x, "Duration"] > 120:
    df.drop(x, inplace = True)

### Pandas - Removing Duplicates

In [None]:
# Returns True for every row that is a duplicate, otherwise False:
print(df.duplicated())

# Remove all duplicates:
df.drop_duplicates(inplace = True)

### Pandas - Data Correlations

In [None]:
# Show the relationship between the columns:
df.corr()

### Pandas - Plotting

In [None]:
# Import pyplot from Matplotlib and visualize our DataFrame:
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('data.csv')
df.plot()
plt.show()

#### Scatter Plot

In [14]:
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('data.csv')
df.plot(kind = 'scatter', x = 'Duration', y = 'Calories')
plt.show()

# A scatterplot where there are no relationship between the columns:
df = pd.read_csv('data.csv')
df.plot(kind = 'scatter', x = 'Duration', y = 'Maxpulse')
plt.show()

#### Historgram

In [None]:
df["Duration"].plot(kind = 'hist')

