In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, mean_absolute_error

# Load the dataset from the URL
url = "https://github.com/dsrscientist/Data-Science-ML-Capstone-Projects/raw/master/avocado.csv"
avocado_data = pd.read_csv(url)

# Check for missing values
print(avocado_data.isnull().sum())

# Drop rows with missing values
avocado_data.dropna(inplace=True)

# Preprocess the 'Date' column
avocado_data['Date'] = pd.to_datetime(avocado_data['Date'], format='%d-%m-%Y')  # Update date format

# Encode categorical variable 'type'
label_encoder = LabelEncoder()
avocado_data['type'] = label_encoder.fit_transform(avocado_data['type'])

# Split data for classification task
X_class = avocado_data[['AveragePrice', 'Total Volume', '4046', '4225', '4770', 'type', 'year']]
y_class = avocado_data['region']
X_class_train, X_class_test, y_class_train, y_class_test = train_test_split(X_class, y_class, test_size=0.2, random_state=42)

# Split data for regression task
X_reg = avocado_data[['AveragePrice', 'Total Volume', '4046', '4225', '4770', 'type', 'year']]
y_reg = avocado_data['Total Bags']
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

# Classification Task
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_class_train, y_class_train)
y_class_pred = classifier.predict(X_class_test)
classification_accuracy = accuracy_score(y_class_test, y_class_pred)
print("Random Forest Classification Accuracy:", classification_accuracy)

# Regression Task
regressor = RandomForestRegressor(random_state=42)
regressor.fit(X_reg_train, y_reg_train)
y_reg_pred = regressor.predict(X_reg_test)
regression_mae = mean_absolute_error(y_reg_test, y_reg_pred)
print("Random Forest Regression Mean Absolute Error:", regression_mae)

# Linear Regression for AveragePrice
linear_regression = LinearRegression()
linear_regression.fit(X_reg_train, y_reg_train)
regression_score = linear_regression.score(X_reg_test, y_reg_test)
print("Linear Regression Score for AveragePrice:", regression_score)

# Logistic Regression for Region
logistic_regression = LogisticRegression(solver='liblinear', max_iter=1000)
logistic_regression.fit(X_class_train, y_class_train)
y_class_pred = logistic_regression.predict(X_class_test)
classification_accuracy = accuracy_score(y_class_test, y_class_pred)
print("Logistic Regression Classification Accuracy for Region:", classification_accuracy)

Unnamed: 0      14951
Date            14951
AveragePrice    14951
Total Volume    14951
4046            14951
4225            14951
4770            14951
Total Bags      14951
Small Bags      14951
Large Bags      14951
XLarge Bags     14951
type            14951
year            14951
region          14951
dtype: int64
Random Forest Classification Accuracy: 0.9210526315789473
Random Forest Regression Mean Absolute Error: 29403.45994276316
Linear Regression Score for AveragePrice: 1.0
Logistic Regression Classification Accuracy for Region: 0.6578947368421053
