In [1]:
                                        #Avocado project 

In [2]:
import pandas as pd

# Load the dataset
url = "https://github.com/FlipRoboTechnologies/ML_-Datasets/raw/main/Avocado/avocado.csv.zip"
avocado_data = pd.read_csv(url, compression='zip')

# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(avocado_data.head())

# Check the structure of the dataset
print("\nDataset information:")
print(avocado_data.info())

# Summary statistics
print("\nSummary statistics of numerical features:")
print(avocado_data.describe())




First few rows of the dataset:
   Unnamed: 0        Date  AveragePrice  Total Volume     4046       4225  \
0           0  2015-12-27          1.33      64236.62  1036.74   54454.85   
1           1  2015-12-20          1.35      54876.98   674.28   44638.81   
2           2  2015-12-13          0.93     118220.22   794.70  109149.67   
3           3  2015-12-06          1.08      78992.15  1132.00   71976.41   
4           4  2015-11-29          1.28      51039.60   941.48   43838.39   

     4770  Total Bags  Small Bags  Large Bags  XLarge Bags          type  \
0   48.16     8696.87     8603.62       93.25          0.0  conventional   
1   58.33     9505.56     9408.07       97.49          0.0  conventional   
2  130.50     8145.35     8042.21      103.14          0.0  conventional   
3   72.58     5811.16     5677.40      133.76          0.0  conventional   
4   75.78     6183.95     5986.26      197.69          0.0  conventional   

   year  region  
0  2015  Albany  
1  2015  Alba

In [3]:
# Import necessary libraries for modeling
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error


In [4]:
# Classification task: Predict type based on year and total volume
# Prepare the data for classification
X_classification = avocado_data[['AveragePrice', 'year', 'Total Volume']]
y_classification = avocado_data['type']

# Split the data into training and testing sets for classification
X_train_classification, X_test_classification, y_train_classification, y_test_classification = train_test_split(
    X_classification, y_classification, test_size=0.2, random_state=42)

# Initialize the Random Forest Classifier model
clf_model = RandomForestClassifier(random_state=42)

# Train the classifier model
clf_model.fit(X_train_classification, y_train_classification)

# Predict on the test set for classification
y_pred_classification = clf_model.predict(X_test_classification)

# Evaluate the classification model
accuracy_classification = accuracy_score(y_test_classification, y_pred_classification)
print("\nAccuracy on the test set for classification:", accuracy_classification)



Accuracy on the test set for classification: 0.938082191780822


In [5]:
                                            #hyperparameter tuning

In [6]:
from sklearn.model_selection import GridSearchCV

# Define the grid of hyperparameters
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the Random Forest Classifier model
clf_model = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=clf_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Perform GridSearchCV to find the best hyperparameters
grid_search.fit(X_train_classification, y_train_classification)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("\nBest Hyperparameters:", best_params)

# Use the best hyperparameters to train the final model
best_model = grid_search.best_estimator_
best_model.fit(X_train_classification, y_train_classification)

# Predict on the test set using the final model
y_pred_final_classification = best_model.predict(X_test_classification)

# Evaluate the final model
accuracy_final_classification = accuracy_score(y_test_classification, y_pred_final_classification)
print("\nAccuracy on the test set using the final model:", accuracy_final_classification)



Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}

Accuracy on the test set using the final model: 0.9432876712328767


In [7]:
                                   #cross vaildation

In [8]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Initialize models
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Support Vector Machine": SVC(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42)
}

# Perform cross-validation and calculate average accuracy for each model
avg_accuracies = {}
for model_name, model in models.items():
    avg_accuracy = cross_val_score(model, X_train_classification, y_train_classification, cv=5, scoring='accuracy').mean()
    avg_accuracies[model_name] = avg_accuracy

# Print average accuracies
print("\nAverage accuracies:")
for model_name, avg_accuracy in avg_accuracies.items():
    print(f"{model_name}: {avg_accuracy}")

# Select the model with the highest average accuracy
best_model_name = max(avg_accuracies, key=avg_accuracies.get)
best_model = models[best_model_name]

print(f"\nBest model: {best_model_name}")



Average accuracies:
Random Forest: 0.9367768094721874
Decision Tree: 0.9226665868870463
Support Vector Machine: 0.8914304485961132
Logistic Regression: 0.8838284597371027

Best model: Random Forest
