In [22]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib


In [23]:
df = pd.read_csv('abaid.csv', encoding='ISO-8859-1')

In [24]:
print(df.head())  # Show first 5 rows
print(df.info())  # Show dataset structure


                     Name                                        Description  \
0             Snake Plant  A hardy, drought-tolerant plant with sword-sha...   
1  Mother-in-Law's Tongue  Known for its upright, stiff, green leaves wit...   
2    Saint George's Sword  Thrives in low light conditions and requires m...   
3  Viper's Bowstring Hemp  A natural air purifier that removes toxins lik...   
4    Devil's Tongue Plant  Its leaves grow vertically and can reach up to...   

  Scientific Classification / Taxonomic Term Distribution Area  \
0                           Kingdom: Plantae           Nigeria   
1                  Subkingdom: Tracheobionta             Congo   
2               Superdivision: Spermatophyta             Ghana   
3                    Division: Magnoliophyta           Senegal   
4                          Class: Liliopsida       Ivory Coast   

           Habitat Type     Hemisphere Trend / Climatic Region  \
0  Dry, rocky hillsides      Tropical Hemisphere (West A

In [25]:
columns_mapping = {
    'Name': 'Plant Name',
    'Garden Use': 'Usage',
    'Plant Height Fact': 'Height',
    'Spread Fact': 'Spread',
    'Sunlight Fact': 'Light Requirements',
    'Watering Fact': 'Watering Needs',
    'Pruning Fact': 'Maintenance Level',
    'Leaf Color Fact': 'Features',
    'Flower Size Fact': 'Aesthetic Features',  # Combine if multiple aesthetics
    'Toxicity Fact': 'Toxicity',
    'Growth Rate Fact': 'Growth Rate',
    'Ideal Temperature Fact': 'Temperature Tolerance',
    'Soil Type Fact': 'Soil Type',
    'Common Name' : 'Common Name'
}

# Keep only the relevant columns and rename them
df = df[[col for col in columns_mapping.keys() if col in df.columns]].rename(columns=columns_mapping)

# Display the remaining columns to confirm
print(df.columns.tolist())


['Plant Name', 'Usage', 'Height', 'Spread', 'Light Requirements', 'Watering Needs', 'Maintenance Level', 'Features', 'Aesthetic Features', 'Toxicity', 'Growth Rate', 'Temperature Tolerance', 'Soil Type', 'Common Name']


In [26]:
def clean_text(text):
    if pd.isna(text):
        return None
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

# Apply cleaning function to categorical columns
categorical_columns = ['Plant Name', 'Usage', 'Light Requirements', 'Watering Needs', 
                       'Maintenance Level', 'Features', 'Aesthetic Features', 'Toxicity', 
                       'Growth Rate', 'Soil Type']
for col in categorical_columns:
    df[col] = df[col].apply(clean_text)


In [27]:
def extract_height(text):
    if pd.isna(text):
        return None
    feet = re.findall(r'(\d+)-(\d+) feet', text)
    if feet:
        return (int(feet[0][0]) + int(feet[0][1])) / 2
    feet_only = re.findall(r'(\d+) feet', text)
    if feet_only:
        return int(feet_only[0])
    inches_only = re.findall(r'(\d+) inches', text)
    if inches_only:
        return int(inches_only[0]) / 12
    return None

def extract_spread(text):
    if pd.isna(text):
        return None
    feet = re.findall(r'(\d+)-(\d+) feet', text)
    if feet:
        return (int(feet[0][0]) + int(feet[0][1])) / 2
    feet_only = re.findall(r'(\d+) feet', text)
    if feet_only:
        return int(feet_only[0])
    inches_only = re.findall(r'(\d+) inches', text)
    if inches_only:
        return int(inches_only[0]) / 12
    return None

def extract_temperature(text):
    if pd.isna(text):
        return None
    temp_f = re.findall(r'(\d+)°F', text)
    if temp_f:
        return int(temp_f[0])
    return None


In [28]:
df['Height'] = df['Height'].apply(extract_height)
df['Spread'] = df['Spread'].apply(extract_spread)
df['Temperature Tolerance'] = df['Temperature Tolerance'].apply(extract_temperature)


In [29]:
df['Height'] = df['Height'].fillna(df['Height'].mean())
df['Spread'] = df['Spread'].fillna(df['Spread'].mean())
df['Temperature Tolerance'] = df['Temperature Tolerance'].fillna(df['Temperature Tolerance'].mean())


In [31]:
# Encode categorical variables
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = df[col].astype(str)  # Ensure all values are strings before encoding
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Handle missing values
df.fillna(df.mean(numeric_only=True), inplace=True)  # Fill NaNs in numeric columns
df.fillna(df.mode().iloc[0], inplace=True)  # Fill NaNs in categorical columns

# Split dataset
X = df.drop(columns=['Common Name'])
y = df['Common Name']

# Check for NaN values before training
print(X.isnull().sum())  # Verify NaNs in X
print(y.isnull().sum())  # Verify NaNs in y

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale numerical values
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train a RandomForest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Random Forest Model Accuracy: {accuracy * 100:.2f}%')

# Save the best model
joblib.dump(model, '/mnt/data/best_plant_model.pkl')


Plant Name               0
Usage                    0
Height                   0
Spread                   0
Light Requirements       0
Watering Needs           0
Maintenance Level        0
Features                 0
Aesthetic Features       0
Toxicity                 0
Growth Rate              0
Temperature Tolerance    0
Soil Type                0
dtype: int64
0
Random Forest Model Accuracy: 21.36%


FileNotFoundError: [Errno 2] No such file or directory: '/mnt/data/best_plant_model.pkl'