### Setup

In [0]:
# import statements
import pandas as pd
import matplotlib.pyplot as plt

In [0]:
# load the iris CSV (URL is pinned to a specific gist revision) into a dataframe
iris_csv_url = "https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv"
df = pd.read_csv(iris_csv_url)

In [0]:
# preview the first five rows to check the column names and value formats
df.head(n=5)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [0]:
# preview the last five rows to confirm the file was read through to the end
df.tail(n=5)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica
149,5.9,3.0,5.1,1.8,Virginica


In [0]:
# display the shape of the data as a (number of rows, number of columns) tuple
df.shape

(150, 5)

### Exploratory Data Analysis

In [0]:
# summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
quartiles = [0.25, 0.5, 0.75]
df.describe(percentiles=quartiles)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


### Data Visualization

In [0]:
# marker color used for each iris variety (insertion order sets the legend order)
species_colors = dict(Setosa='red', Versicolor='green', Virginica='blue')

# draw one scatter series per variety on a single set of axes
fig, ax = plt.subplots(figsize=(10, 8))

for species, color in species_colors.items():
    # rows belonging to the current variety only
    subset = df[df['variety'] == species]
    ax.scatter(subset['sepal.length'], subset['sepal.width'], label=species, c=color)

# label the axes so the figure can be read on its own
ax.set_xlabel('Sepal Length (cm)')
ax.set_ylabel('Sepal Width (cm)')
ax.set_title('Scatter Plot: Sepal Length vs. Sepal Width')
ax.legend(title='Species')
plt.show()

### Data Manipulation

In [0]:
# keep only the rows whose sepal length is strictly above 5.5
has_long_sepal = df['sepal.length'] > 5.5
filtered_data = df.loc[has_long_sepal]
filtered_data

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
14,5.8,4.0,1.2,0.2,Setosa
15,5.7,4.4,1.5,0.4,Setosa
18,5.7,3.8,1.7,0.3,Setosa
50,7.0,3.2,4.7,1.4,Versicolor
51,6.4,3.2,4.5,1.5,Versicolor
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica


In [0]:
# order the rows from shortest to longest petal (ascending is the default)
sorted_data = df.sort_values('petal.length')
sorted_data

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
22,4.6,3.6,1.0,0.2,Setosa
13,4.3,3.0,1.1,0.1,Setosa
14,5.8,4.0,1.2,0.2,Setosa
35,5.0,3.2,1.2,0.2,Setosa
36,5.5,3.5,1.3,0.2,Setosa
...,...,...,...,...,...
131,7.9,3.8,6.4,2.0,Virginica
105,7.6,3.0,6.6,2.1,Virginica
117,7.7,3.8,6.7,2.2,Virginica
122,7.7,2.8,6.7,2.0,Virginica


In [0]:
# average the two sepal measurements within each variety
sepal_columns = ['sepal.length', 'sepal.width']
sepal_by_variety = df.groupby('variety')[sepal_columns]
grouped_data = sepal_by_variety.mean()
grouped_data

Unnamed: 0_level_0,sepal.length,sepal.width
variety,Unnamed: 1_level_1,Unnamed: 2_level_1
Setosa,5.006,3.428
Versicolor,5.936,2.77
Virginica,6.588,2.974
