<a href="https://colab.research.google.com/github/AndreGulyi/Pandas_for_everyone/blob/main/Pandas_for_everyone_Part_2_(3_Plotting).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Plotting Basics

In [None]:
import pandas as pd
# the anscombe data set can be found in the seaborn library
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# list the names of the example data sets that seaborn can load
sns.get_dataset_names()

In [None]:
# load the anscombe example data set through seaborn
anscombe = sns.load_dataset("anscombe")
# alternative: read the CSV directly with pandas; this needs the raw file URL,
# because the github.com/.../blob/... address serves an HTML page, not the CSV
#anscombe = pd.read_csv("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/anscombe.csv")
# print the data set
print(anscombe)

## 3.2 Matplotlib Basics

In [None]:
# keep only the rows of anscombe that belong to data set 'I'
dataset_1 = anscombe.loc[anscombe['dataset'] == 'I']
# draw y against x with the default (line) style
plt.plot(dataset_1['x'], dataset_1['y'])
# the plot has to be shown explicitly
plt.show()

In [None]:
# to get dots instead of a line, pass the format string 'o' as the third argument
plt.plot(dataset_1['x'], dataset_1['y'],'o')
plt.show()

### 3.2.1 Figure Objects and Axes Subplots

In [None]:
# split out the remaining anscombe data sets, one frame per label
dataset_2, dataset_3, dataset_4 = (
    anscombe[anscombe['dataset'] == label] for label in ('II', 'III', 'IV')
)

In [None]:
# start with an empty figure that will hold all of the subplots
fig = plt.figure()
# lay the figure out as a 2 x 2 grid (2 rows of plots, 2 plots per row)
# and create one axes for each plot location, numbered 1 through 4
axes1, axes2, axes3, axes4 = (
    fig.add_subplot(2, 2, location) for location in range(1, 5)
)
plt.show()

In [None]:
# the figure, its axes and the plotting calls all have to run in the same cell
fig = plt.figure()
axes1, axes2, axes3, axes4 = (
    fig.add_subplot(2, 2, location) for location in range(1, 5)
)
# draw each anscombe data set as dots on its own axes
for panel, subset in (
    (axes1, dataset_1),
    (axes2, dataset_2),
    (axes3, dataset_3),
    (axes4, dataset_4),
):
    panel.plot(subset['x'], subset['y'], 'o')

plt.show()

In [None]:
# build the 2 x 2 grid of axes, one per anscombe data set
fig = plt.figure()
axes1 = fig.add_subplot(2, 2, 1)
axes2 = fig.add_subplot(2, 2, 2)
axes3 = fig.add_subplot(2, 2, 3)
axes4 = fig.add_subplot(2, 2, 4)
# add a plot to each axes
axes1.plot(dataset_1['x'], dataset_1['y'],'o')
axes2.plot(dataset_2['x'], dataset_2['y'],'o')
axes3.plot(dataset_3['x'], dataset_3['y'],'o')
axes4.plot(dataset_4['x'], dataset_4['y'],'o')

# add a small title to each subplot
axes1.set_title("dataset_1")
axes2.set_title("dataset_2")
axes3.set_title("dataset_3")
axes4.set_title("dataset_4")

# add a title for the entire figure (title above the title)
fig.suptitle("Anscombe Data")
# note spelling of "suptitle"

# use a tight layout so the plots and titles don't overlap;
# tight_layout() replaces set_tight_layout(True), which matplotlib
# discourages in favor of the layout-engine API since version 3.6
fig.tight_layout()
# show the figure
plt.show()

## 3.3 Statistical Graphics Using matplotlib

In [None]:
# load seaborn's "tips" example data set and print it
tips = sns.load_dataset("tips")
print(tips)

In [None]:
# summary statistics of tips, rounded to 1 decimal place
tips.describe().round(1)

In [None]:
# figure object plus a single axes that fills it
# (1 row, 1 column, plot location 1)
fig = plt.figure()
axes1 = fig.add_subplot(1, 1, 1)
# histogram of the total_bill column of tips, split into 10 bins
axes1.hist(x='total_bill', bins=10, data=tips)
# title and axis labels, set in one call
axes1.set(
    title='Histogram of Total Bill',
    ylabel='Frequency',
    xlabel='Total Bill',
)
plt.show()

### 3.3.2 Bivariate (Two Variables)

In [None]:
# figure object plus a single axes that fills it
scatter_plot = plt.figure()
axes1 = scatter_plot.add_subplot(1, 1, 1)
# scatter plot of tip against total_bill, both read from tips
axes1.scatter(x='total_bill', y='tip', data=tips)
# title and axis labels, set in one call
axes1.set(
    title='Scatterplot of Total Bill vs Tip',
    xlabel='Total Bill',
    ylabel='Tip',
)
plt.show()

#### 3.3.2.2 Box Plot

In [None]:
# create the figure object and a single axes
boxplot = plt.figure()
axes1 = boxplot.add_subplot(1, 1, 1)

# one piece of data per box: the tips for each value of sex,
# in the order the boxes are drawn
sexes = ["Female", "Male"]
tips_by_sex = [tips.loc[tips["sex"] == sex, "tip"] for sex in sexes]

# make the actual box plot
# first argument of box plot is the data
# since we are plotting multiple pieces of data
# we have to put each piece of data into a list
axes1.boxplot(x=tips_by_sex)
# label the boxes on the axes itself rather than through boxplot's
# `labels` parameter, which matplotlib 3.9 deprecated (renamed `tick_labels`);
# set_xticklabels works on both older and newer matplotlib versions
axes1.set_xticklabels(sexes)
# add labels
axes1.set_xlabel('Sex')
axes1.set_ylabel('Tip')
axes1.set_title('Boxplot of Tips by Gender')
plt.show()

### 3.3.3 Multivariate Data

In [None]:
# print a concise summary of the tips frame (columns, non-null counts, dtypes)
tips.info()

In [None]:
# assign color values
# matplotlib only reads these as hex colors when they start with '#'
colors = {
    "Female": "#f1a340",  # orange
    "Male": "#998ec3",  # purple
}

scatter_plot = plt.figure()
axes1 = scatter_plot.add_subplot(1, 1, 1)

axes1.scatter(
    data=tips,
    x='total_bill',
    y='tip',
    # set the size of the dots based on party size
    # we square the values and multiply by 10 to make the points bigger
    # and also to emphasize the difference
    s=tips['size'] ** 2 * 10,
    # set the color for the sex using our color values above
    # NOTE(review): sex is presumably loaded as a categorical column; casting
    # to str first makes map() return plain color strings either way
    c=tips['sex'].astype(str).map(colors),
    # set the alpha so points are more transparent
    # this helps with overlapping points
    alpha=0.5,
)

# label the axes
axes1.set_title('Colored by Sex and Sized by Size')
axes1.set_xlabel('Total Bill')
axes1.set_ylabel('Tip')
# figure title on top
scatter_plot.suptitle("Total Bill vs Tip")
plt.show()

## 3.4 Seaborn

In [None]:
# seaborn and the tips data were already loaded in earlier cells;
# both are repeated here at the start of the seaborn section
import seaborn as sns
tips = sns.load_dataset('tips')
# switch seaborn's plotting context to 'paper' for the plots that follow
sns.set_context('paper')

In [None]:
# plt.subplots() creates the figure and its axes in one call, instead of
# creating a figure object and then adding subplots (axes) to it
hist, ax = plt.subplots()
# let seaborn draw the total_bill histogram into that axes
sns.histplot(x="total_bill", data=tips, ax=ax)
# the axes is a matplotlib object, so the title is set with matplotlib
ax.set(title='Total Bill Histogram')
# show the figure with matplotlib
plt.show()

#### 3.4.1.2 Density Plot (Kernel Density Estimation)

In [None]:
# figure and axes for the density plot
den, ax = plt.subplots()
# kernel density estimate of total_bill, drawn into the axes
sns.kdeplot(x="total_bill", data=tips, ax=ax)
# title and axis labels, set in one call
ax.set(
    title='Total Bill Density',
    xlabel='Total Bill',
    ylabel='Unit Probability',
)
plt.show()

#### 3.4.1.3 Rug Plot

In [None]:
rug, ax = plt.subplots()
# plot 2 things into the axes we created
sns.rugplot(data=tips, x="total_bill", ax=ax)
sns.histplot(data=tips, x="total_bill", ax=ax)
ax.set_title("Rug Plot and Histogram of Total Bill")
# label the x-axis; a second set_title call here would replace the title above
ax.set_xlabel("Total Bill")
plt.show()

#### 3.4.1.4 Distribution Plots

In [None]:
# displot returns a FacetGrid, which creates the figure and axes for us
fig = sns.displot(x="total_bill", data=tips, rug=True, kde=True)
# axis labels go through the grid, the overall title through its figure
fig.set_axis_labels("Total Bill", "Count")
fig.figure.suptitle('Distribution of Total Bill')
plt.show()

#### 3.4.1.5 Count Plot (Bar Plot)

In [None]:
# figure and axes for the count plot
count, ax = plt.subplots()
# one bar per day; the viridis palette helps to distinguish the colors
sns.countplot(x='day', data=tips, palette="viridis", ax=ax)
# title and axis labels, set in one call
ax.set(
    title='Count of days',
    xlabel='Day of the Week',
    ylabel='Frequency',
)
plt.show()

### 3.4.2 Bivariate Data

In [None]:
# figure and axes for the scatter plot
scatter, ax = plt.subplots()
# plain scatter plot of tip against total_bill (no regression line is drawn)
sns.scatterplot(x='total_bill', y='tip', data=tips, ax=ax)
# title and axis labels, set in one call
ax.set(
    title='Scatter Plot of Total Bill and Tip',
    xlabel='Total Bill',
    ylabel='Tip',
)
plt.show()

In [None]:
# figure and axes for the regression plot
reg, ax = plt.subplots()
# scatter plot with a fitted regression line;
# pass fit_reg=False if you do not want the regression line
sns.regplot(x='total_bill', y='tip', data=tips, ax=ax)
# title and axis labels, set in one call
ax.set(
    title='Regression Plot of Total Bill and Tip',
    xlabel='Total Bill',
    ylabel='Tip',
)
plt.show()

#### 3.4.2.2 Joint Plot

In [None]:
# jointplot builds its own figure and axes, so none are created here
joint = sns.jointplot(x='total_bill', y='tip', data=tips)
joint.set_axis_labels('Total Bill', 'Tip')
# put the title on the figure, raised slightly (y=1.03) so that it
# doesn't clash with the histogram
joint.figure.suptitle('Joint Plot of Total Bill and Tip', y=1.03)
plt.show()

#### 3.4.2.3 Hexbin Plot

In [None]:
# a hexbin plot is a jointplot drawn with kind="hex"
hexbin = sns.jointplot(x="total_bill", y="tip", kind="hex", data=tips)
hexbin.set_axis_labels('Total Bill', 'Tip')
# figure-level title, raised slightly (y=1.03) above the plot
hexbin.figure.suptitle('Hexbin Plot of Total Bill and Tip', y=1.03)
plt.show()

#### 3.4.2.4 2D Density Plot

In [None]:
kde, ax = plt.subplots()
# fill will fill in the contours
# (fill replaces the deprecated shade parameter of kdeplot)
sns.kdeplot(data=tips, x="total_bill", y="tip", fill=True, ax=ax)
ax.set_title('Kernel Density Plot of Total Bill and Tip')
ax.set_xlabel('Total Bill')
ax.set_ylabel('Tip')
plt.show()

In [None]:
# jointplot with kind="kde" draws the 2D density together with the
# density of each variable on the margins
kde2d = sns.jointplot(data=tips, x="total_bill",
y="tip", kind="kde")
kde2d.set_axis_labels(xlabel='Total Bill', ylabel='Tip')
# use the grid's `figure` attribute, as the other jointplot cells do;
# `fig` is the older, deprecated spelling of the same attribute
kde2d.figure.suptitle('2D KDE Plot of Total Bill and Tip', y=1.03)
plt.show()

#### 3.4.2.5 Bar Plot

In [None]:
import numpy as np
# figure and axes for the bar plot
bar, ax = plt.subplots()
# one bar per value of time, showing the average total bill;
# the mean is calculated using numpy
sns.barplot(x="time", y="total_bill", data=tips, estimator=np.mean, ax=ax)
# title and axis labels, set in one call
ax.set(
    title='Bar Plot of Average Total Bill for Time of Day',
    xlabel='Time of Day',
    ylabel='Average Total Bill',
)
plt.show()

In [None]:
####3.4.2.6 Box Plot


## 5.2 Apply (Basics)

In [None]:
# small example frame: two integer columns, 'a' and 'b', with three rows
df = pd.DataFrame(dict(a=[10, 20, 30], b=[40, 50, 60]))
print(df)

In [None]:
# squaring the column squares every value in it (no apply needed)
print(df['a']**2)

In [None]:
# print the type of a single column selected by name
print(type(df['a']))

In [None]:
# print the type of a single row selected by position
print(type(df.iloc[0]))

In [None]:
# print the type of a column slice (all rows, column positions 0 up to but not including 1)
print(type(df.iloc[:,0:1]))

In [None]:
# select all rows and a list of columns (here only 'a') by label
df.loc[:,['a']]

In [None]:
# print the type of a selection made with a list of column labels
print(type(df.loc[:,['a']]))

In [None]:
def my_sq(x):
  """Squares a given value."""
  squared = x ** 2
  return squared

def avg_2(x, y):
  """Calculates the average of 2 numbers."""
  total = x + y
  return total / 2

In [None]:
# apply our square function on the 'a' column:
# my_sq is called on each value and the results are collected in sq
sq = df['a'].apply(my_sq)
print(sq)

In [None]:
def myexp(x, e):
  """Raises x to the power e."""
  return pow(x, e)

In [None]:
# call myexp directly: 2 raised to the power 3
cubed = myexp(2,3)
print(cubed)

In [None]:
# each value of 'a' is passed to myexp as x; the extra keyword
# argument e=2 given to apply is forwarded to myexp as well
ex = df['a'].apply(myexp, e=2)
print(ex)

In [None]:
def printme(x):
  """Prints x to standard output; nothing is returned."""
  print(x)

In [None]:
# axis=0: printme is called once per column, receiving the whole column
df.apply(printme, axis=0)

In [None]:
# axis=1: printme is called once per row, receiving the whole row
df.apply(printme,axis=1)

In [None]:
# square each value of 'a' with an inline lambda instead of a named
# function, and store the result in a new column of df
df['a_sq_lamb'] = df['a'].apply(lambda x: x ** 2)
print(df)