# Data Wrangling - Part I
This notebook demonstrates basic data wrangling steps on the Iris dataset.

In [1]:
# 1. Import Required Libraries
import pandas as pd
import numpy as np

### 2. Dataset Source
- Dataset: [Iris Dataset](https://archive.ics.uci.edu/ml/datasets/iris)
- URL: https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv

In [2]:
# 3. Load Dataset
url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
# 4. Data Preprocessing
print("Missing Values:\n", df.isnull().sum())
print("\nStatistical Summary:\n", df.describe())
print("\nColumns:\n", df.columns.tolist())
print("Shape of dataset:", df.shape)
print("Data Types:\n", df.dtypes)

Missing Values:
 sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

Statistical Summary:
        sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.057333      3.758000     1.199333
std        0.828066     0.435866      1.765298     0.762238
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000

Columns:
 ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
Shape of dataset: (150, 5)
Data Types:
 sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
species          object
dtype: object


In [4]:
# 5. Data Formatting and Normalization
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numeric_cols] = (df[numeric_cols] - df[numeric_cols].min()) / (df[numeric_cols].max() - df[numeric_cols].min())
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,0.222222,0.625,0.067797,0.041667,setosa
1,0.166667,0.416667,0.067797,0.041667,setosa
2,0.111111,0.5,0.050847,0.041667,setosa
3,0.083333,0.458333,0.084746,0.041667,setosa
4,0.194444,0.666667,0.067797,0.041667,setosa


In [5]:
# 6. Convert Categorical to Quantitative Variables
df_encoded = pd.get_dummies(df, columns=['species'])
df_encoded.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species_setosa,species_versicolor,species_virginica
0,0.222222,0.625,0.067797,0.041667,True,False,False
1,0.166667,0.416667,0.067797,0.041667,True,False,False
2,0.111111,0.5,0.050847,0.041667,True,False,False
3,0.083333,0.458333,0.084746,0.041667,True,False,False
4,0.194444,0.666667,0.067797,0.041667,True,False,False
