<a href="https://colab.research.google.com/github/YoussefAli10/iris-analysis-group1/blob/main/iris-project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
print('Hello, Iris by Youssef')

Hello, Iris by Youssef


In [None]:
print('Hello, Iris by Sibo')

Hello, Iris by Sibo


In [None]:
print('Hello, Iris by Wendy')

Hello, Iris by Wendy


In [None]:
# Store basic dataset metadata
n_rows = 150
dataset_name = "Iris Dataset"

print(f"Dataset: {dataset_name}")
print(f"Number of rows: {n_rows}")


Dataset: Iris Dataset
Number of rows: 150


In [None]:
# Example: check petal length threshold
petal_length = 1.7
if petal_length > 1.5:
    print("Large petal")
else:
    print("Small petal")


Large petal


In [None]:
def classify_flower(sepal_length, petal_length):
    """
    Returns a label 'small'/'medium'/'large' based on petal_length.
    (Simple thresholds for demo; not a trained model.)
    """
    if petal_length < 1.5:
        return "small"
    elif petal_length < 4.5:
        return "medium"
    else:
        return "large"

# quick sanity checks
print(classify_flower(5.1, 1.4))  # small
print(classify_flower(6.0, 3.5))  # medium
print(classify_flower(6.3, 5.0))  # large


small
medium
large


In [None]:
# Iterate through Iris species and print them
species = ["setosa", "versicolor", "virginica"]
for s in species:
    print(s)


setosa
versicolor
virginica


In [None]:
import csv
import pandas as pd
# Import for the enhanced visual tables in Jupyter/Colab
from IPython.display import display, HTML

# --- Configuration ---
# Ensure your downloaded file is named 'iris.csv' and is in the same directory.
RAW_DATA_FILE = 'iris.csv'
SUMMARY_FILE = 'summary.txt'
CLEANED_DATA_FILE = 'iris_cleaned.csv'

print(f"--- Phase 3 Setup: Ready to process {RAW_DATA_FILE} ---")
print("-" * 50)

--- Phase 3 Setup: Ready to process iris.csv ---
--------------------------------------------------


In [38]:
# --- 1. Read the raw Iris CSV file and calculate summary stats ---
sepal_lengths = []
row_count = 0
sepal_length_index = 0

try:
    # Open the file for reading
    with open(RAW_DATA_FILE, mode='r', newline='') as file:
        reader = csv.reader(file)

        for row in reader:
            if not row: continue

            try:
                # Assuming semicolon separator, we split the single string column if needed
                if len(row) == 1 and ';' in row[0]:
                    parts = row[0].split(';')
                else:
                    parts = row

                length = float(parts[sepal_length_index])
                sepal_lengths.append(length)
                row_count += 1
            except (ValueError, IndexError):
                # Handles malformed or non-numeric rows
                pass

except FileNotFoundError:
    print(f"ERROR: The file {RAW_DATA_FILE} was not found. Please check the name and location.")
    row_count = 0

# Calculate stats
if row_count > 0:
    min_length = min(sepal_lengths)
    max_length = max(sepal_lengths)
    avg_length = sum(sepal_lengths) / row_count
else:
    min_length, max_length, avg_length = 'N/A', 'N/A', 'N/A'

# --- 2. Write a summary file (summary.txt) ---
summary_content = f"""
--- IRIS Dataset Summary (CSV Module) ---
Total Rows Processed: {row_count}

Sepal Length (cm) Statistics:
Minimum: {min_length:.2f}
Maximum: {max_length:.2f}
Average: {avg_length:.2f}
"""

with open(SUMMARY_FILE, mode='w') as outfile:
    outfile.write(summary_content.strip())

print(f" Summary written to {SUMMARY_FILE}")

 Summary written to summary.txt


In [None]:
# --- 1. Import the dataset with pandas (read_csv) ---
try:
    # FIX APPLIED HERE: Removing the semicolon separator to use the default comma (,)
    # If the file is comma-delimited, use this:
    df = pd.read_csv(RAW_DATA_FILE, header=None)

    # If the above line gives the error, uncomment the line below and comment out the line above.
    # df = pd.read_csv(RAW_DATA_FILE, header=None, sep=';')

    # Assign standard column names
    df.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']

    initial_rows_count = len(df)
    print(f"\nPandas Import Successful. Initial shape: {df.shape}")
except Exception as e:
    print(f"An error occurred while loading the data with pandas: {e}")
    df = pd.DataFrame()


if not df.empty:

    # --- 2. Check for missing values, column names, and data types ---

    print("\n-- Initial Data Check (Top 5 Rows - Enhanced Table) --")
    display(df.head().style.set_table_attributes("style='font-size: 10pt;'"))

    print("\n-- Data Types and Non-Null Counts (Textual) --")
    df.info()

    # Visually check for missing values
    print("\n-- Check for Missing Values (Enhanced Table) --")
    missing_data = pd.DataFrame(df.isnull().sum(), columns=['Missing Count'])
    missing_data.index.name = 'Column'
    display(
        missing_data.style
        .bar(subset=['Missing Count'], color='#FFA07A')
        .set_caption("Missing Values Per Column")
    )

    # --- Data Cleaning ---

    # A. Handle Duplicates
    initial_rows_pre_dup_drop = len(df)
    df.drop_duplicates(inplace=True)
    rows_removed_duplicates = initial_rows_pre_dup_drop - len(df)

    # B. Ensure all measurement columns are numeric
    cols_to_convert = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
    df[cols_to_convert] = df[cols_to_convert].apply(pd.to_numeric, errors='coerce')

    # C. Drop any rows where measurements became NaN
    nan_check = df.isnull().sum().sum()
    rows_pre_nan_drop = len(df)

    if nan_check > 0:
        print(f"\nWarning: {nan_check} values converted to NaN. Dropping rows with NaNs.")
        df.dropna(inplace=True)

    rows_removed_nan = rows_pre_nan_drop - len(df)
    final_rows_count = len(df)


Pandas Import Successful. Initial shape: (151, 5)

-- Initial Data Check (Top 5 Rows - Enhanced Table) --


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,sepal_length,sepal_width,petal_length,petal_width,species
1,5.1,3.5,1.4,0.2,Iris-setosa
2,4.9,3.0,1.4,0.2,Iris-setosa
3,4.7,3.2,1.3,0.2,Iris-setosa
4,4.6,3.1,1.5,0.2,Iris-setosa



-- Data Types and Non-Null Counts (Textual) --
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151 entries, 0 to 150
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   sepal_length  151 non-null    object
 1   sepal_width   151 non-null    object
 2   petal_length  151 non-null    object
 3   petal_width   151 non-null    object
 4   species       151 non-null    object
dtypes: object(5)
memory usage: 6.0+ KB

-- Check for Missing Values (Enhanced Table) --


  end = (x - left) / (right - left)


Unnamed: 0_level_0,Missing Count
Column,Unnamed: 1_level_1
sepal_length,0
sepal_width,0
petal_length,0
petal_width,0
species,0





In [39]:
# Assuming df is not empty from Block 3

# --- Cleaning Summary Table (Enhanced Visual) ---
print("\n--- Final Cleaning Summary (Enhanced Table) ---")

cleaning_summary_data = {
    'Metric': ['Initial Rows Loaded', 'Duplicates Removed', 'Rows Dropped (NaNs)', 'Final Cleaned Rows'],
    'Value': [initial_rows_count, rows_removed_duplicates, rows_removed_nan, final_rows_count]
}
summary_df_vis = pd.DataFrame(cleaning_summary_data)

# Style the final summary table
display(
    summary_df_vis.style
    .set_properties(**{'background-color': '#f0f8ff', 'border-color': 'white'})
    .hide(axis='index')
    .set_caption("Summary of Data Cleaning Steps")
)

print(f"\nCleaned dataset shape: {df.shape}")

# --- 3. Save cleaned dataset as iris_cleaned.csv ---
df.to_csv(CLEANED_DATA_FILE, index=False)

print(f"\n Cleaned data saved to {CLEANED_DATA_FILE}")


--- Final Cleaning Summary (Enhanced Table) ---


Metric,Value
Initial Rows Loaded,151
Duplicates Removed,3
Rows Dropped (NaNs),1
Final Cleaned Rows,147



Cleaned dataset shape: (0, 0)

 Cleaned data saved to iris_cleaned.csv
