## Pandas basics


In [None]:
import pandas as pd
import seaborn

### Series

In [None]:
# Build a Series from a plain Python list; with no index given, pandas
# labels the entries 0..n-1.
values = [1, 3, 9, 10]
series = pd.Series(data=values)

series
# Talking points: the dtype, modifying one value, and series.astype(int)

In [None]:
# Indices: same data as before, but each entry now gets an explicit
# string label ('a'..'d') instead of the default integer position.
series = pd.Series(data=values, index=list('abcd'))

series

In [None]:
# Using a dict: the keys become the index labels and the values become the data.
stock_prices = {
    'AAPL': 175.2,
    'MSFT': 332.2,
    'GOOG': 125.2,
    'AMZN': 120.4,  # Amazon trades as AMZN (was misspelled 'AMAZ')
    'NVDA': 389.0,  # NVIDIA trades as NVDA (was misspelled 'NVIDA')
    'META': 262.1,
    'TSLA': 193.2,
}

series = pd.Series(stock_prices)

series

### DataFrames

In [None]:
# From a dictionary: every key is a column name and every list holds that
# column's values (all lists have the same length, one entry per row).
shapes = {
    'width':  [12.1, 15.2, 55, 19.1, 20.1],
    'length': [2, 12.1, 8.0, 9.6, 10],
    'height': [3, 4.2, 8.2, 3.2, 3.1],
}

# Step 1: no index given, so the rows are labelled 0..4.
df = pd.DataFrame(data=shapes)

# Step 2: rebuild with one explicit label per row; this replaces the frame above.
df = pd.DataFrame(
    data=shapes,
    index=['shape1', 'shape2', 'shape3', 'shape4', 'shape5'],
)

df

In [None]:
# Get a row by position: iloc takes integer positions, so this is the
# first row with all of its columns (label-based lookup goes through df.loc).
df.iloc[0, :]

In [None]:
# Get a column: all rows of the column labelled 'width', returned as a Series.
df.loc[:, 'width']

In [None]:
# Get a subset of rows and columns by label: loc takes the row labels first
# and the column labels second.
wanted_rows = ['shape1', 'shape2']
wanted_columns = ['width', 'height']
df.loc[wanted_rows, wanted_columns]

### Wine quality dataset

In [None]:
# Location of the red-wine quality CSV (a path relative to the notebook).
wine_quality_url = "./data/winequality-red.csv"

# Load the CSV into `df`; this rebinds the name used for the shapes frame above.
df = pd.read_csv(filepath_or_buffer=wine_quality_url)

In [None]:
# Head: preview the first rows (5 by default) to sanity-check what was loaded
df.head()

In [None]:
# Info: per-column dtype and non-null count, plus the frame's memory usage
df.info()

In [None]:
# isnull: boolean frame of the same shape as df, True wherever a value is missing
df.isnull()

In [None]:
# Aggregating methods such as sum, mean, median etc.
# Called on a DataFrame, they reduce each column to a single value.
df.sum()

# Alternative to try: flag the columns that contain at least one missing value
# df.isnull().any(axis = 0)

In [None]:
# Boolean masking
# A list of index labels can be rewritten as a boolean mask of the same length
# as the Series: [0, 3, 6] -> [True, False, False, True, False, False, True]

s = pd.Series([10.2, 12.1, 13., 14, 20, 12, 40.1])

# Select the same three entries in two ways.
by_position = s[[0, 3, 6]]
by_mask = s[[True, False, False, True, False, False, True]]

# Only a cell's last expression is displayed automatically, so print the
# first selection explicitly; otherwise it would be computed and discarded.
print(by_position)
by_mask

In [None]:
# Query with boolean masking: the mask is True for every wine whose
# residual sugar is greater than 8.
mask = df['residual sugar'].gt(8)

print(mask)

# Keep only the rows where the mask is True.
df.loc[mask]

In [None]:
# More complex query: the pH of every wine whose citric acid lies strictly
# between 0.7 and 0.75. The two conditions are combined element-wise with &.
citric = df['citric acid']
df.loc[(citric > 0.7) & (citric < 0.75), 'pH']

### Exercise 2: Calculate the mean value of pH and density for all wines with quality equal to or above 7

### Exercise 3 (Bonus): Print all rows with missing values

### Cleaning Data

In [None]:
# Replace missing 'citric acid' entries with 0; other columns are left alone.
# The filled frame is only displayed here: `df` is not reassigned.
df.fillna(value={'citric acid': 0})

In [None]:
# Remove the 'chlorides' column by rebinding `df` instead of mutating it in place.
# errors='ignore' keeps the cell safe to re-run: once the column is gone, a
# second execution is a no-op rather than a KeyError.
df = df.drop(columns=['chlorides'], errors='ignore')

In [None]:
# Forward-fill: each missing value takes the last valid value above it in the
# same column. The result is only displayed; `df` is not reassigned.
df.ffill()

### Modifying

In [None]:
# Creating a new column: assigning to a label that does not exist yet adds it
# to df. Here it is the row-wise sum of volatile acidity and citric acid.
df['total acidity'] = df['volatile acidity'].add(df['citric acid'])

In [None]:
# Apply several reductions at once: the result has one row per operation
# and one column per column of df.
df.aggregate(func=['sum', 'min', 'max', 'mean', 'std'])

In [None]:
# Group by: split the rows by their 'quality' value, then take the mean of
# every other column within each group.
df.groupby(by='quality').mean()

## Plotting

In [None]:
# Histogram of the pH column, to see how its values are distributed.
df['pH'].plot(kind='hist')

In [None]:
# Pairwise relationship plots for the first 100 rows and the first four columns.
# seaborn is imported in the notebook's import cell at the top, so every
# dependency is declared in one place.
seaborn.pairplot(df.iloc[:100, :4])