In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier


# Step 1: Load the training data
train_url = "https://github.com/dustywhite7/Econ8310/raw/master/AssignmentData/assignment3.csv"
data = pd.read_csv(train_url)

# Step 2: Separate the features (X) from the target (y)
X = data.drop(columns=["meal"])  # every column except the target
y = data["meal"]  # the column the model learns to predict

# One-hot encode every object-dtype feature into indicator columns.
# drop_first=True leaves out the first level of each encoded feature, since
# it is implied when all of that feature's other indicators are zero.
categorical_columns = X.select_dtypes(include=["object"]).columns
X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

# Hold out 20% of the rows for validation. The seed is fixed so the same rows
# land in each partition on every run; without it the split is reshuffled each
# time and validation results cannot be compared between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 3: Train the Random Forest model
# random_state fixes the forest's internal randomness, so repeated runs on
# the same training rows build the same forest. n_jobs=-1 trains the trees
# on all available cores; it affects speed only, not the fitted model.
model = RandomForestClassifier(
    n_estimators=500,
    max_depth=30,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
)
modelFit = model.fit(X_train, y_train)  # fit() returns the estimator itself

# Step 4: Evaluate the model on the held-out validation rows
y_pred_val = modelFit.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_pred_val))

# Step 5: Score the separate test file with the fitted model
test_url = "https://github.com/dustywhite7/Econ8310/raw/master/AssignmentData/assignment3test.csv"
test_data = pd.read_csv(test_url)

# One-hot encode the test frame, then force it into the exact column layout
# the model was trained on: columns the model never saw are dropped, and
# training columns absent from the test encoding are added as all zeros.
test_data_aligned = pd.get_dummies(test_data).reindex(
    columns=X_train.columns,
    fill_value=0,
)

# Hard class labels for the test rows
predict = modelFit.predict(test_data_aligned)

# Per-row probability taken from the second column of predict_proba, i.e. the
# second entry of modelFit.classes_.
# NOTE(review): `pred` holds probabilities while the class labels in `predict`
# are never used afterwards -- confirm which of the two consumers expect.
pred = pd.Series(
    modelFit.predict_proba(test_data_aligned)[:, 1],
    name="Predictions",
)
print(pred)




Validation Accuracy: 0.8225123500352858
0      0.215180
1      0.160685
2      0.160155
3      0.146584
4      0.204739
         ...   
995    0.167091
996    0.143964
997    0.163334
998    0.160873
999    0.164538
Name: Predictions, Length: 1000, dtype: float64
