In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

### Functions used

In [2]:
def compute_accuracy(predictions, actual):
    return np.sum(predictions==actual) / len(actual) * 100

# PHASE 1 - INITIAL

In [3]:
# Read the CSV file into a DataFrame
df = pd.read_csv('Book1.csv')

selected_rows = df.iloc[0:125]

# Drop any columns that are not features
X = selected_rows.drop(columns=['Chosen', 'TimeTaken', 'Likert'])

# Encode categorical variables if necessary
X = pd.get_dummies(X)

# Target variable
y = selected_rows['Chosen']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(selected_rows)
print(X)
print(y)

    Choice1 Choice2 Choice3 Chosen  TimeTaken Likert
0        LL      MM      HH     HH       4464     CC
1        LL      MM      HH     HH       2416     CC
2        LL      MM      HH     HH       3917     CC
3        LL      MM      HH     HH       5412     CC
4        LL      MM      HH     HH      10412     CC
..      ...     ...     ...    ...        ...    ...
120      MH      LM      HL     HL       7774     FC
121      MH      LM      HL     MH      11651     FC
122      MH      LM      HL     LM       7832     CC
123      MH      LM      HL     HL      15142     FC
124      MH      LM      HL     HL       8463     CC

[125 rows x 6 columns]
     Choice1_HL  Choice1_LL  Choice1_MH  Choice2_LH  Choice2_LM  Choice2_LM   \
0             0           1           0           0           0            0   
1             0           1           0           0           0            0   
2             0           1           0           0           0            0   
3             0     

# Decision Tree for Phase 1

In [4]:
# Initialize and train the decision tree classifier
dtc = DecisionTreeClassifier()

dtc.get_params().keys()

hyperparameters = [
    {
        'criterion': ['gini', 'entropy'],
        'max_depth': [5, 10, 20, 30],
        'min_samples_split': [2, 4, 6, 10, 15, 20],
        'max_leaf_nodes': [3, 5, 10, 20, 50, 100]
    }
]

rsc = RandomizedSearchCV(estimator=dtc,param_distributions=hyperparameters,n_iter=50,cv=5,random_state=13)
rsc.fit(X_train, y_train)

pd.set_option('display.max_colwidth', None)

rsc_results = pd.DataFrame(rsc.cv_results_)

best_index = rsc.best_index_

rsc_results.loc[best_index]

best_acc = rsc.best_score_

predictions = rsc.best_estimator_.predict(X_test)



In [5]:
print("Test accuracy for PHASE 1 is : ", compute_accuracy(predictions, y_test), "%")

Test accuracy for PHASE 1 is :  60.0 %


# PHASE 2 - Rebuild using Star Scheme

In [6]:
selected_rows = df.iloc[125:250]

# Drop any columns that are not features
X = selected_rows.drop(columns=['Chosen', 'TimeTaken', 'Likert'])

# Encode categorical variables if necessary
X = pd.get_dummies(X)

# Target variable
y = selected_rows['Chosen']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(selected_rows)
print(X)
print(y)

    Choice1 Choice2 Choice3 Chosen  TimeTaken Likert
125      LL      HH      MM     MM       4752     CC
126      LL      HH      MM     HH       2385     CC
127      LL      HH      MM     HH      17683     FC
128      LL      HH      MM     HH       9321     CC
129      LL      HH      MM     HH      10708     CC
..      ...     ...     ...    ...        ...    ...
245      LM      MH      HL     HL       4548     CC
246      LM      MH      HL     HL      15834     CC
247      LM      MH      HL     MH       8578     CC
248      LM      MH      HL     MH       4324     FC
249      LM      MH      HL     HL       9539     FC

[125 rows x 6 columns]
     Choice1_HL  Choice1_LL  Choice1_LM  Choice2_HH  Choice2_MH  Choice2_MM  \
125           0           1           0           1           0           0   
126           0           1           0           1           0           0   
127           0           1           0           1           0           0   
128           0         

# Decision Tree for Phase 2

In [7]:
# Initialize and train the decision tree classifier
dtc = DecisionTreeClassifier()

dtc.get_params().keys()

hyperparameters = [
    {
        'criterion': ['gini', 'entropy'],
        'max_depth': [5, 10, 20, 30],
        'min_samples_split': [2, 4, 6, 10, 15, 20],
        'max_leaf_nodes': [3, 5, 10, 20, 50, 100]
    }
]

rsc = RandomizedSearchCV(estimator=dtc,param_distributions=hyperparameters,n_iter=50,cv=5,random_state=13)
rsc.fit(X_train, y_train)

pd.set_option('display.max_colwidth', None)

rsc_results = pd.DataFrame(rsc.cv_results_)

best_index = rsc.best_index_

rsc_results.loc[best_index]

best_acc = rsc.best_score_

predictions = rsc.best_estimator_.predict(X_test)



In [8]:
print("Test accuracy for PHASE 2 is : ", compute_accuracy(predictions, y_test), "%")

Test accuracy for PHASE 2 is :  64.0 %


# PHASE 3 - Rebuild using Like Scheme

In [9]:
selected_rows = df.iloc[250:375]

# Drop any columns that are not features
X = selected_rows.drop(columns=['Chosen', 'TimeTaken', 'Likert'])

# Encode categorical variables if necessary
X = pd.get_dummies(X)

# Target variable
y = selected_rows['Chosen']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(selected_rows)
print(X)
print(y)

    Choice1 Choice2 Choice3 Chosen  TimeTaken Likert
250      LL      HH      MM     MM       5041     CC
251      LL      HH      MM     HH       2992     CC
252      LL      HH      MM     HH       9236     CC
253      LL      HH      MM     HH      10289     CC
254      LL      HH      MM     HH       8219     FC
..      ...     ...     ...    ...        ...    ...
370      LM      MH      HL     HL       7775     CC
371      LM      MH      HL     MH      12559     CC
372      LM      MH      HL     MH       7128     CC
373      LM      MH      HL     MH       9269     FC
374      LM      MH      HL     MH      11454     FC

[125 rows x 6 columns]
     Choice1_HL  Choice1_LL  Choice1_LM  Choice2_HH  Choice2_MH  Choice2_MM  \
250           0           1           0           1           0           0   
251           0           1           0           1           0           0   
252           0           1           0           1           0           0   
253           0         

# Decision Tree for Phase 3

In [10]:
# Initialize and train the decision tree classifier
dtc = DecisionTreeClassifier()

dtc.get_params().keys()

hyperparameters = [
    {
        'criterion': ['gini', 'entropy'],
        'max_depth': [5, 10, 20, 30],
        'min_samples_split': [2, 4, 6, 10, 15, 20],
        'max_leaf_nodes': [3, 5, 10, 20, 50, 100]
    }
]

rsc = RandomizedSearchCV(estimator=dtc,param_distributions=hyperparameters,n_iter=50,cv=5,random_state=13)
rsc.fit(X_train, y_train)

pd.set_option('display.max_colwidth', None)

rsc_results = pd.DataFrame(rsc.cv_results_)

best_index = rsc.best_index_

rsc_results.loc[best_index]

best_acc = rsc.best_score_

predictions = rsc.best_estimator_.predict(X_test)



In [11]:
print("Test accuracy for PHASE 3 is : ", compute_accuracy(predictions, y_test), "%")

Test accuracy for PHASE 3 is :  64.0 %


# PHASE 4 - Rebuild using Upvote Scheme

In [12]:
selected_rows = df.iloc[375:500]

# Drop any columns that are not features
X = selected_rows.drop(columns=['Chosen', 'TimeTaken', 'Likert'])

# Encode categorical variables if necessary
X = pd.get_dummies(X)

# Target variable
y = selected_rows['Chosen']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(selected_rows)
print(X)
print(y)

    Choice1 Choice2 Choice3 Chosen  TimeTaken Likert
375      LL      HH      MM     MM       6889     CC
376      LL      HH      MM     HH       4919     CC
377      LL      HH      MM     HH       6173     FC
378      LL      HH      MM     HH      11576     CC
379      LL      HH      MM     HH       4983     CC
..      ...     ...     ...    ...        ...    ...
495      LM      MH      HL     MH       9058     SC
496      LM      MH      HL     HL      10601     CC
497      LM      MH      HL     MH       7276     CC
498      LM      MH      HL     MH      40409     FC
499      LM      MH      HL     LM      13591     SC

[125 rows x 6 columns]
     Choice1_HL  Choice1_LL  Choice1_LM  Choice2_HH  Choice2_MH  Choice2_MM  \
375           0           1           0           1           0           0   
376           0           1           0           1           0           0   
377           0           1           0           1           0           0   
378           0         

# Decision Tree for Phase 4

In [13]:
# Initialize and train the decision tree classifier
dtc = DecisionTreeClassifier()

dtc.get_params().keys()

hyperparameters = [
    {
        'criterion': ['gini', 'entropy'],
        'max_depth': [5, 10, 20, 30],
        'min_samples_split': [2, 4, 6, 10, 15, 20],
        'max_leaf_nodes': [3, 5, 10, 20, 50, 100]
    }
]

rsc = RandomizedSearchCV(estimator=dtc,param_distributions=hyperparameters,n_iter=50,cv=5,random_state=13)
rsc.fit(X_train, y_train)

pd.set_option('display.max_colwidth', None)

rsc_results = pd.DataFrame(rsc.cv_results_)

best_index = rsc.best_index_

rsc_results.loc[best_index]

best_acc = rsc.best_score_

predictions = rsc.best_estimator_.predict(X_test)

In [14]:
print("Test accuracy for PHASE 4 is : ", compute_accuracy(predictions, y_test), "%")

Test accuracy for PHASE 4 is :  64.0 %
