In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

In [3]:
# Read the assignment CSV into a DataFrame (relative path, so it is resolved against the working directory)
data = pd.read_csv(filepath_or_buffer='Assignment 4-blues_hand.csv')

In [5]:
# Preview the first five rows. Leaving the frame as the cell's last expression
# lets Jupyter render it as a table instead of the plain-text print() dump.
data.head()

               name state  brthYr  post1906  region  handPost  thumbSty
0      Henry Thomas    TX    1874         0       3         1         3
1      Frank Stokes    TN    1887         0       2         1         3
2       Sam Collins    MS    1887         0       2         1         2
3    Peg Leg Howell    GA    1888         0       1         2         2
4  Huddie Ledbetter    TX    1888         0       3         2         3


In [9]:
# Target: the post1906 column (a 0/1 flag tied to birth year, per the original note)
y = data.loc[:, 'post1906']

# Predictors without the region code
X = data.loc[:, ['handPost', 'thumbSty']]

# Same predictors plus the region code
X_with_region = data.loc[:, ['handPost', 'thumbSty', 'region']]

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed pins which rows land in each partition
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the Decision Tree and fit it in one step (fit() returns the fitted estimator itself)
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out rows and compare against their true labels
y_pred = dt_classifier.predict(X_test)
accuracy_no_region = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Decision Tree Accuracy without Region: {accuracy_no_region:.4f}")

Decision Tree Accuracy without Region: 0.6316


In [13]:
# 80/20 split of the region-inclusive features, using the same seed as the region-free split.
# Note: y_train / y_test are rebound here.
X_train_region, X_test_region, y_train, y_test = train_test_split(
    X_with_region,
    y,
    test_size=0.2,
    random_state=42,
)

# Separate Decision Tree for the region-inclusive features, fitted on the training partition
dt_classifier_region = DecisionTreeClassifier(random_state=42)
dt_classifier_region = dt_classifier_region.fit(X_train_region, y_train)

# Held-out predictions and their accuracy
y_pred_region = dt_classifier_region.predict(X_test_region)
accuracy_with_region = accuracy_score(y_true=y_test, y_pred=y_pred_region)
print(f"Decision Tree Accuracy with Region: {accuracy_with_region:.4f}")

Decision Tree Accuracy with Region: 0.4737


In [15]:
# Random Forest on the region-inclusive training partition (fit() returns the fitted estimator)
rf_classifier_region = RandomForestClassifier(random_state=42).fit(X_train_region, y_train)

# Evaluate on the same held-out rows used for the region-inclusive Decision Tree
y_pred_rf_region = rf_classifier_region.predict(X_test_region)
accuracy_rf_with_region = accuracy_score(y_true=y_test, y_pred=y_pred_rf_region)
print(f"Random Forest Accuracy with Region: {accuracy_rf_with_region:.4f}")

Random Forest Accuracy with Region: 0.4211


In [19]:
# Unfitted Random Forest with a fixed random_state; it is passed to the repeated-run helper below
rf_classifier = RandomForestClassifier(
    random_state=42,
)
# Function to run the model multiple times and calculate average accuracy and range
def run_model_multiple_times(model, X, y, n_runs=10, random_state=None):
    """Estimate test accuracy over repeated random 80/20 train/test splits.

    Parameters
    ----------
    model : object with ``fit(X, y)`` and ``predict(X)``
        Refitted IN PLACE on every split, so after the call it holds the fit
        from the last split (any earlier fit is overwritten).
    X, y
        Features and target, passed straight to ``train_test_split``.
    n_runs : int, default 10
        Number of independent random splits to evaluate.
    random_state : int or None, default None
        Seed for the generator that draws the per-split seeds. ``None`` keeps
        the original behaviour of drawing them from NumPy's global RNG (so
        ``np.random.seed`` still controls the outcome); an integer makes the
        whole sequence of splits repeatable regardless of global state.

    Returns
    -------
    tuple
        ``(mean, min, max)`` of the per-split accuracies.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1 (there would be nothing to aggregate).
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    # Both np.random (module) and RandomState expose randint with the same semantics.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    accuracies = []
    for _ in range(n_runs):
        # Each run gets its own split seed in [0, 1000), as in the original implementation.
        split_seed = int(rng.randint(1000))
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=split_seed)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracies.append(accuracy_score(y_test, y_pred))
    return np.mean(accuracies), np.min(accuracies), np.max(accuracies)

# run_model_multiple_times draws every split seed from NumPy's global RNG, so seed it
# here; otherwise the averages below change on every execution of this cell.
np.random.seed(42)

# Note: each call refits the estimator it is given, so the classifiers trained in the
# earlier cells are overwritten by the fit from the last random split.

# Running decision tree without region
mean_acc, min_acc, max_acc = run_model_multiple_times(dt_classifier, X, y)
print(f"Decision Tree without Region - Avg: {mean_acc:.4f}, Min: {min_acc:.4f}, Max: {max_acc:.4f}")

# Running decision tree with region
mean_acc_region, min_acc_region, max_acc_region = run_model_multiple_times(dt_classifier_region, X_with_region, y)
print(f"Decision Tree with Region - Avg: {mean_acc_region:.4f}, Min: {min_acc_region:.4f}, Max: {max_acc_region:.4f}")

# Running random forest without region
mean_acc_rf, min_acc_rf, max_acc_rf = run_model_multiple_times(rf_classifier, X, y)
print(f"Random Forest without Region - Avg: {mean_acc_rf:.4f}, Min: {min_acc_rf:.4f}, Max: {max_acc_rf:.4f}")

# Running random forest with region
mean_acc_rf_region, min_acc_rf_region, max_acc_rf_region = run_model_multiple_times(rf_classifier_region, X_with_region, y)
print(f"Random Forest with Region - Avg: {mean_acc_rf_region:.4f}, Min: {min_acc_rf_region:.4f}, Max: {max_acc_rf_region:.4f}")

Decision Tree without Region - Avg: 0.5684, Min: 0.3684, Max: 0.6842
Decision Tree with Region - Avg: 0.5316, Min: 0.3684, Max: 0.6316
Random Forest without Region - Avg: 0.5684, Min: 0.4211, Max: 0.6842
Random Forest with Region - Avg: 0.5368, Min: 0.3684, Max: 0.6842
