In [1]:
#


In [11]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import LabelEncoder

# Step 1: Load the data
data_path = 'dairy_product.csv'
data = pd.read_csv(data_path)
data.info()

# Step 2: Handle missing values
missing_percentage = data.isnull().sum() / len(data) * 100
columns_to_drop = missing_percentage[missing_percentage > 80].index
print('\ncolumns_to_drop: ', columns_to_drop)
data = data.drop(columns=columns_to_drop)

# Step 3: Label encode categorical features
categorical_cols = data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))

# Step 4: Extract independent and dependent variables
# Assuming 'target' is the dependent variable
if 'FoodGroup' in data.columns:
    y = data['FoodGroup']
    X = data.drop(columns=['FoodGroup', 'ID', 'Descrip', 'ShortDescrip'])
else:
    raise ValueError("No 'target' column found in the dataset. Please specify the dependent variable.")

# Step 5: Identify and print low-variance data
selector = VarianceThreshold(threshold=0.01)  # Threshold can be adjusted
selector.fit(X)
low_variance_features = X.columns[~selector.get_support()]
print("\nLow-variance features:", low_variance_features.tolist())

# Step 6: Feature importance using Random Forest
model = RandomForestClassifier(random_state=42)
model.fit(X, y)
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=False)
print("\nFeature importances RF:")
print(feature_importances)

# Step 7: Drop highly correlated columns
correlation_matrix = X.corr().abs()
upper_triangle = correlation_matrix.where(
    np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
)
highly_correlated = [column for column in upper_triangle.columns if any(upper_triangle[column] > 0.7)]
print('<nhighly_correlated: ', highly_correlated )
X = X.drop(columns=highly_correlated)

print("\nRemaining columns after removing high correlation:", X.columns.tolist())

# Save processed data (optional)
X.to_csv('processed_X.csv', index=False)
y.to_csv('processed_y.csv', index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8618 entries, 0 to 8617
Data columns (total 45 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                8618 non-null   int64  
 1   FoodGroup         8618 non-null   object 
 2   ShortDescrip      8618 non-null   object 
 3   Descrip           8618 non-null   object 
 4   CommonName        1063 non-null   object 
 5   MfgName           1560 non-null   object 
 6   ScientificName    732 non-null    object 
 7   Energy_kcal       8618 non-null   float64
 8   Protein_g         8618 non-null   float64
 9   Fat_g             8618 non-null   float64
 10  Carb_g            8618 non-null   float64
 11  Sugar_g           8618 non-null   float64
 12  Fiber_g           8618 non-null   float64
 13  VitA_mcg          8618 non-null   float64
 14  VitB6_mg          8618 non-null   float64
 15  VitB12_mcg        8618 non-null   float64
 16  VitC_mg           8618 non-null   float64


In [12]:
# Question2 


In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Step 1: Load processed data
X = pd.read_csv('processed_X.csv')
y = pd.read_csv('processed_y.csv')

# Step 2: Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 3: Split into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Step 4: Perform LDA
lda = LinearDiscriminantAnalysis(n_components=2)
X_train_lda = lda.fit_transform(X_train, y_train)
X_test_lda = lda.transform(X_test)

# Plot LDA result for test data
plt.figure(figsize=(8, 6))
sns.scatterplot(x=X_test_lda[:, 0], y=X_test_lda[:, 1], hue=y_test.squeeze(), palette='viridis', alpha=0.7)
plt.title('LDA: Test Data')
plt.xlabel('LD1')
plt.ylabel('LD2')
plt.legend(title='Class')
plt.show()

# Step 5: Build Random Forest model on LDA-transformed data
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_lda, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_lda)
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))


  y = column_or_1d(y, warn=True)


IndexError: index 1 is out of bounds for axis 1 with size 1

<Figure size 800x600 with 0 Axes>