In [2]:
import pandas as pd
import numpy as np

# Load the Iris dataset
data = pd.read_csv('Iris.csv')


# Task 1: Data Inspection and Missing Value Handling

# Inspect the dataset
print("Dataset Info:")
print(data.info())
print("\nMissing Values in Each Column:")
print(data.isnull().sum())

# Handle missing values in numeric columns
numeric_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
for col in numeric_columns:
    data[col].fillna(data[col].mean(), inplace=True)

# Handle missing values in categorical column
if 'species' in data.columns:
    most_frequent_species = data['species'].mode()[0]
    data['species'].fillna(most_frequent_species, inplace=True)

# Task 2: Data Cleaning and Transformation

# Remove duplicate entries
data.drop_duplicates(inplace=True)

# Create a new column for petal area
data['petal_area'] = data['petal_length'] * data['petal_width']

# Drop rows with any remaining missing values
data.dropna(inplace=True)

# Task 3: Aggregation and Transformation

# Convert categorical data to numeric
if 'species' in data.columns:
    data['species_numeric'] = data['species'].astype('category').cat.codes

# Aggregation: Calculate mean of numeric columns grouped by species
grouped_data = data.groupby('species').mean()
print("\nMean Values Grouped by Species:")
print(grouped_data)

# Task 4: Advanced Reshaping

# Reshape the data to long format
long_format = pd.melt(data, 
                      id_vars=['species'], 
                      value_vars=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], 
                      var_name='measurement_type', 
                      value_name='measurement_value')

# Display reshaped data
print("\nLong Format Data:")
print(long_format.head())

# Save the cleaned and reshaped data to CSV files (optional)
data.to_csv('cleaned_iris.csv', index=False)
long_format.to_csv('long_format_iris.csv', index=False)


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB
None

Missing Values in Each Column:
Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64


KeyError: 'sepal_length'

In [6]:
import pandas as pd
import numpy as np

# Load the Iris dataset
data = pd.read_csv('Iris.csv')

# Inspect column names to match them dynamically
print("Column Names:", data.columns)

# Update column names to match the dataset
data.columns = data.columns.str.strip().str.lower().str.replace(' ', '_')

# Task 1: Data Inspection and Missing Value Handling

# Inspect the dataset
print("Dataset Info:")
print(data.info())
print("\nMissing Values in Each Column:")
print(data.isnull().sum())

# Handle missing values in numeric columns
numeric_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
for col in numeric_columns:
    if col in data.columns:
        data[col].fillna(data[col].mean(), inplace=True)

# Handle missing values in categorical column
if 'Species' in data.columns:
    most_frequent_species = data['Species'].mode()[0]
    data['Species'].fillna(most_frequent_species, inplace=True)

# Task 2: Data Cleaning and Transformation

# Remove duplicate entries
data.drop_duplicates(inplace=True)

# Create a new column for petal area
if 'petal_length' in data.columns and 'petal_width' in data.columns:
    data['petal_area'] = data['petal_length'] * data['petal_width']

# Drop rows with any remaining missing values
data.dropna(inplace=True)

# Task 3: Aggregation and Transformation

# Convert categorical data to numeric
if 'Species' in data.columns:
    data['species_numeric'] = data['Species'].astype('category').cat.codes

# Aggregation: Calculate mean of numeric columns grouped by species
if 'Species' in data.columns:
    grouped_data = data.groupby('Species').mean()
    print("\nMean Values Grouped by Species:")
    print(grouped_data)

# Task 4: Advanced Reshaping

# Reshape the data to long format
long_format = pd.melt(data, 
                      id_vars=['Species'], 
                      value_vars=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], 
                      var_name='measurement_type', 
                      value_name='measurement_value')

# Display reshaped data
print("\nLong Format Data:")
print(long_format.head())

# Save the cleaned and reshaped data to CSV files (optional)
data.to_csv('cleaned_iris.csv', index=False)
long_format.to_csv('long_format_iris.csv', index=False)

Column Names: Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')
Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             150 non-null    int64  
 1   sepallengthcm  150 non-null    float64
 2   sepalwidthcm   150 non-null    float64
 3   petallengthcm  150 non-null    float64
 4   petalwidthcm   150 non-null    float64
 5   species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB
None

Missing Values in Each Column:
id               0
sepallengthcm    0
sepalwidthcm     0
petallengthcm    0
petalwidthcm     0
species          0
dtype: int64


KeyError: "The following id_vars or value_vars are not present in the DataFrame: ['Species', 'sepal_length', 'sepal_width', 'petal_length', 'petal_width']"

In [5]:


import pandas as pd
import numpy as np

# Load the Iris dataset
iris = pd.read_csv('Iris.csv')

# Check for missing values
print(iris.isnull().sum())



# Replace missing values in numeric columns with the mean value
iris['sepallengthcm'] = iris['sepallengthcm'].fillna(iris['sepallengthcm'].mean())
iris['sepalwidthcm'] = iris['sepalwidthcm'].fillna(iris['sepalwidthcm'].mean())
iris['petallengthcm'] = iris['petallengthcm'].fillna(iris['petallengthcm'].mean())
iris['petalwidthcm'] = iris['petalwidthcm'].fillna(iris['petalwidthcm'].mean())




# Replace missing values in the species column with the most frequent value
iris['species'] = iris['species'].fillna(iris['species'].mode()[0])


Task 2: Data Cleaning and Transformation

Step 2.1: Remove Duplicate Entries


# Remove duplicate rows
iris = iris.drop_duplicates()


Step 2.2: Create a New Column by Modifying Existing Ones


# Create a new column for petal area
iris['petal_area'] = iris['petal_length'] * iris['petal_width']


Step 2.3: Drop Rows with Any Remaining Missing Values


# Drop rows with any remaining missing values
iris = iris.dropna()


Task 3: Aggregation and Transformation

Step 3.1: Convert Categorical Data to Numeric


# Convert the species column to numeric values
iris['species_code'] = pd.Categorical(iris['species']).codes


Step 3.2: Aggregation


# Calculate the mean of each numeric column grouped by species
mean_values = iris.groupby('species_code')[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']].mean()
print(mean_values)


Task 4: Advanced Reshaping

Step 4.1: Reshape the Data


# Reshape the dataset from wide to long format
iris_long = pd.melt(iris, id_vars=['species', 'species_code'], value_vars=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'])
print(iris_long.head())




SyntaxError: invalid syntax (3689728516.py, line 25)

In [9]:
import pandas as pd
import numpy as np

# Load the Iris dataset
iris = pd.read_csv('iris.csv')

# Check for missing values
print(iris.isnull().sum())



# Replace missing values in numeric columns with the mean value
iris['SepalLengthCm'] = iris['SepalLengthCm'].fillna(iris['SepalLengthCm'].mean())
iris['SepalWidthCm'] = iris['SepalWidthCm'].fillna(iris['SepalWidthCm'].mean())
iris['PetalLengthCm'] = iris['PetalLengthCm'].fillna(iris['PetalLengthCm'].mean())
iris['PetalWidthCm'] = iris['PetalWidthCm'].fillna(iris['PetalWidthCm'].mean())



Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64


In [10]:
iris['Species'] = iris['Species'].fillna(iris['Species'].mode()[0])



In [11]:
iris = iris.drop_duplicates()


In [13]:
iris['petal_area'] = iris['PetalLengthCm'] * iris['PetalWidthCm']
iris.head()


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,petal_area
0,1,5.1,3.5,1.4,0.2,Iris-setosa,0.28
1,2,4.9,3.0,1.4,0.2,Iris-setosa,0.28
2,3,4.7,3.2,1.3,0.2,Iris-setosa,0.26
3,4,4.6,3.1,1.5,0.2,Iris-setosa,0.3
4,5,5.0,3.6,1.4,0.2,Iris-setosa,0.28


In [14]:
iris = iris.dropna()


In [16]:
iris['Species'] = pd.Categorical(iris['Species']).codes
iris.tail(10)


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,petal_area
140,141,6.7,3.1,5.6,2.4,2,13.44
141,142,6.9,3.1,5.1,2.3,2,11.73
142,143,5.8,2.7,5.1,1.9,2,9.69
143,144,6.8,3.2,5.9,2.3,2,13.57
144,145,6.7,3.3,5.7,2.5,2,14.25
145,146,6.7,3.0,5.2,2.3,2,11.96
146,147,6.3,2.5,5.0,1.9,2,9.5
147,148,6.5,3.0,5.2,2.0,2,10.4
148,149,6.2,3.4,5.4,2.3,2,12.42
149,150,5.9,3.0,5.1,1.8,2,9.18


In [None]:
mean_values = iris.groupby('species')[['SepalLengthCm', 'sepalWidthCm', 'petalLengthCm', 'petalWidthCm']].mean()
print(mean_values)
