In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multioutput import MultiOutputClassifier

### Functions used

In [44]:
def compute_accuracy(predictions, actual):
    """Return the percentage of predictions that equal the actual values.

    Parameters
    ----------
    predictions : array-like
        Predicted labels.
    actual : array-like
        Ground-truth labels; ``len(actual)`` is used as the denominator.

    Returns
    -------
    The number of element-wise matches divided by ``len(actual)``, times 100.
    The result type follows whatever ``np.sum`` yields for the comparison
    (a scalar for 1-D arrays).
    """
    n_matches = np.sum(predictions == actual)
    n_total = float(len(actual))
    return n_matches / n_total * 100

# PHASE 1 - INITIAL

In [45]:
# Load the complete dataset; each phase works on its own slice of it.
df = pd.read_csv('Book1_Split.csv')

# Phase 1 uses the first 125 rows.
selected_rows = df.iloc[:125]

# Features: every column except the two targets and TimeTaken / Likert,
# turned into indicator columns in the same step.
X = pd.get_dummies(
    selected_rows.drop(['CC_R', 'CC_NR', 'TimeTaken', 'Likert'], axis=1)
)

# Targets: the two columns that are predicted jointly.
y = selected_rows.loc[:, ['CC_R', 'CC_NR']]

# Hold out 20% of the rows for testing, with a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the raw slice, the encoded features and the targets, in that order.
print(selected_rows, X, y, sep='\n')

    C1_R C1_NR C2_R C2_NR C3_R C3_NR CC_R CC_NR  TimeTaken Likert
0      L     L    M     M    H     H    H     H       4464     CC
1      L     L    M     M    H     H    H     H       2416     CC
2      L     L    M     M    H     H    H     H       3917     CC
3      L     L    M     M    H     H    H     H       5412     CC
4      L     L    M     M    H     H    H     H      10412     CC
..   ...   ...  ...   ...  ...   ...  ...   ...        ...    ...
120    M     H    L     M    H     L    H     L       7774     FC
121    M     H    L     M    H     L    M     H      11651     FC
122    M     H    L     M    H     L    L     M       7832     CC
123    M     H    L     M    H     L    H     L      15142     FC
124    M     H    L     M    H     L    H     L       8463     CC

[125 rows x 10 columns]
     C1_R_H  C1_R_L  C1_R_M  C1_NR_H  C1_NR_L  C2_R_L  C2_R_M  C2_NR_H  \
0         0       1       0        0        1       0       1        0   
1         0       1       0        

# Decision Tree for Phase 1

In [46]:
# Base learner. The tree is seeded so that fitting is repeatable: the train/test
# split and the hyperparameter search are already seeded, but an unseeded
# DecisionTreeClassifier can still build different trees on identical data,
# which makes the reported accuracy change from run to run.
dtc = DecisionTreeClassifier(random_state=13)

# Wrap the tree so that one classifier is fitted per target column.
multi_output_dtc = MultiOutputClassifier(dtc)

# Search space for the tree. The 'estimator__' prefix routes each setting
# through the MultiOutputClassifier wrapper to the underlying tree.
hyperparameters = {
    'estimator__criterion': ['gini', 'entropy'],
    'estimator__max_depth': [5, 10, 20, 30],
    'estimator__min_samples_split': [2, 4, 6, 10, 15, 20],
    'estimator__max_leaf_nodes': [3, 5, 10, 20, 50, 100]
}

# Try 50 sampled settings, each scored with 5-fold cross-validation on the
# training split only; the sampling itself is seeded.
rsc = RandomizedSearchCV(
    estimator=multi_output_dtc,
    param_distributions=hyperparameters,
    n_iter=50,
    cv=5,
    random_state=13,
)
rsc.fit(X_train, y_train)

# Winning settings and the corresponding fitted model.
best_params = rsc.best_params_
best_model = rsc.best_estimator_

# Predict the held-out test rows with the selected model.
predictions = best_model.predict(X_test)

In [47]:
# Accuracy (in %) of the selected model on the held-out test rows.
phase_1_accuracy = compute_accuracy(predictions, y_test)
print("Test accuracy for PHASE 1 is : ", phase_1_accuracy)

Test accuracy for PHASE 1 is :  CC_R     60.0
CC_NR    60.0
dtype: float64


# PHASE 2 - Rebuild using Star Scheme

In [48]:
# Phase 2 uses rows 125 through 249 of the dataset loaded earlier.
selected_rows = df.iloc[125:250]

# Features: every column except the two targets and TimeTaken / Likert,
# turned into indicator columns in the same step.
X = pd.get_dummies(
    selected_rows.drop(['CC_R', 'CC_NR', 'TimeTaken', 'Likert'], axis=1)
)

# Targets: the two columns that are predicted jointly.
y = selected_rows.loc[:, ['CC_R', 'CC_NR']]

# Hold out 20% of the rows for testing, with a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the raw slice, the encoded features and the targets, in that order.
print(selected_rows, X, y, sep='\n')

    C1_R C1_NR C2_R C2_NR C3_R C3_NR CC_R CC_NR  TimeTaken Likert
125    L     L    H     H    M     M    M     M       4752     CC
126    L     L    H     H    M     M    H     H       2385     CC
127    L     L    H     H    M     M    H     H      17683     FC
128    L     L    H     H    M     M    H     H       9321     CC
129    L     L    H     H    M     M    H     H      10708     CC
..   ...   ...  ...   ...  ...   ...  ...   ...        ...    ...
245    L     M    M     H    H     L    H     L       4548     CC
246    L     M    M     H    H     L    H     L      15834     CC
247    L     M    M     H    H     L    M     H       8578     CC
248    L     M    M     H    H     L    M     H       4324     FC
249    L     M    M     H    H     L    H     L       9539     FC

[125 rows x 10 columns]
     C1_R_H  C1_R_L  C1_NR_L  C1_NR_M  C2_R_H  C2_R_M  C2_NR_H  C2_NR_M  \
125       0       1        1        0       1       0        1        0   
126       0       1        1     

# Decision Tree for Phase 2

In [49]:
# Base learner. The tree is seeded so that fitting is repeatable: the train/test
# split and the hyperparameter search are already seeded, but an unseeded
# DecisionTreeClassifier can still build different trees on identical data,
# which makes the reported accuracy change from run to run.
dtc = DecisionTreeClassifier(random_state=13)

# Wrap the tree so that one classifier is fitted per target column.
multi_output_dtc = MultiOutputClassifier(dtc)

# Search space for the tree. The 'estimator__' prefix routes each setting
# through the MultiOutputClassifier wrapper to the underlying tree.
hyperparameters = {
    'estimator__criterion': ['gini', 'entropy'],
    'estimator__max_depth': [5, 10, 20, 30],
    'estimator__min_samples_split': [2, 4, 6, 10, 15, 20],
    'estimator__max_leaf_nodes': [3, 5, 10, 20, 50, 100]
}

# Try 50 sampled settings, each scored with 5-fold cross-validation on the
# training split only; the sampling itself is seeded.
rsc = RandomizedSearchCV(
    estimator=multi_output_dtc,
    param_distributions=hyperparameters,
    n_iter=50,
    cv=5,
    random_state=13,
)
rsc.fit(X_train, y_train)

# Winning settings and the corresponding fitted model.
best_params = rsc.best_params_
best_model = rsc.best_estimator_

# Predict the held-out test rows with the selected model.
predictions = best_model.predict(X_test)

In [50]:
# Accuracy (in %) of the selected model on the held-out test rows.
phase_2_accuracy = compute_accuracy(predictions, y_test)
print("Test accuracy for PHASE 2 is : ", phase_2_accuracy)

Test accuracy for PHASE 2 is :  CC_R     64.0
CC_NR    64.0
dtype: float64


# PHASE 3 - Rebuild using Like Scheme

In [51]:
# Phase 3 uses rows 250 through 374 of the dataset loaded earlier.
selected_rows = df.iloc[250:375]

# Features: every column except the two targets and TimeTaken / Likert,
# turned into indicator columns in the same step.
X = pd.get_dummies(
    selected_rows.drop(['CC_R', 'CC_NR', 'TimeTaken', 'Likert'], axis=1)
)

# Targets: the two columns that are predicted jointly.
y = selected_rows.loc[:, ['CC_R', 'CC_NR']]

# Hold out 20% of the rows for testing, with a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the raw slice, the encoded features and the targets, in that order.
print(selected_rows, X, y, sep='\n')

    C1_R C1_NR C2_R C2_NR C3_R C3_NR CC_R CC_NR  TimeTaken Likert
250    L     L    H     H    M     M    M     M       5041     CC
251    L     L    H     H    M     M    H     H       2992     CC
252    L     L    H     H    M     M    H     H       9236     CC
253    L     L    H     H    M     M    H     H      10289     CC
254    L     L    H     H    M     M    H     H       8219     FC
..   ...   ...  ...   ...  ...   ...  ...   ...        ...    ...
370    L     M    M     H    H     L    H     L       7775     CC
371    L     M    M     H    H     L    M     H      12559     CC
372    L     M    M     H    H     L    M     H       7128     CC
373    L     M    M     H    H     L    M     H       9269     FC
374    L     M    M     H    H     L    M     H      11454     FC

[125 rows x 10 columns]
     C1_R_H  C1_R_L  C1_NR_L  C1_NR_M  C2_R_H  C2_R_M  C2_NR_H  C2_NR_M  \
250       0       1        1        0       1       0        1        0   
251       0       1        1     

# Decision Tree for Phase 3

In [52]:
# Base learner. The tree is seeded so that fitting is repeatable: the train/test
# split and the hyperparameter search are already seeded, but an unseeded
# DecisionTreeClassifier can still build different trees on identical data,
# which makes the reported accuracy change from run to run.
dtc = DecisionTreeClassifier(random_state=13)

# Wrap the tree so that one classifier is fitted per target column.
multi_output_dtc = MultiOutputClassifier(dtc)

# Search space for the tree. The 'estimator__' prefix routes each setting
# through the MultiOutputClassifier wrapper to the underlying tree.
hyperparameters = {
    'estimator__criterion': ['gini', 'entropy'],
    'estimator__max_depth': [5, 10, 20, 30],
    'estimator__min_samples_split': [2, 4, 6, 10, 15, 20],
    'estimator__max_leaf_nodes': [3, 5, 10, 20, 50, 100]
}

# Try 50 sampled settings, each scored with 5-fold cross-validation on the
# training split only; the sampling itself is seeded.
rsc = RandomizedSearchCV(
    estimator=multi_output_dtc,
    param_distributions=hyperparameters,
    n_iter=50,
    cv=5,
    random_state=13,
)
rsc.fit(X_train, y_train)

# Winning settings and the corresponding fitted model.
best_params = rsc.best_params_
best_model = rsc.best_estimator_

# Predict the held-out test rows with the selected model.
predictions = best_model.predict(X_test)

In [53]:
# Accuracy (in %) of the selected model on the held-out test rows.
phase_3_accuracy = compute_accuracy(predictions, y_test)
print("Test accuracy for PHASE 3 is : ", phase_3_accuracy)

Test accuracy for PHASE 3 is :  CC_R     64.0
CC_NR    64.0
dtype: float64


# PHASE 4 - Rebuild using Upvote Scheme

In [54]:
# Phase 4 uses rows 375 through 499 of the dataset loaded earlier.
selected_rows = df.iloc[375:500]

# Features: every column except the two targets and TimeTaken / Likert,
# turned into indicator columns in the same step.
X = pd.get_dummies(
    selected_rows.drop(['CC_R', 'CC_NR', 'TimeTaken', 'Likert'], axis=1)
)

# Targets: the two columns that are predicted jointly.
y = selected_rows.loc[:, ['CC_R', 'CC_NR']]

# Hold out 20% of the rows for testing, with a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the raw slice, the encoded features and the targets, in that order.
print(selected_rows, X, y, sep='\n')

    C1_R C1_NR C2_R C2_NR C3_R C3_NR CC_R CC_NR  TimeTaken Likert
375    L     L    H     H    M     M    M     M       6889     CC
376    L     L    H     H    M     M    H     H       4919     CC
377    L     L    H     H    M     M    H     H       6173     FC
378    L     L    H     H    M     M    H     H      11576     CC
379    L     L    H     H    M     M    H     H       4983     CC
..   ...   ...  ...   ...  ...   ...  ...   ...        ...    ...
495    L     M    M     H    H     L    M     H       9058     SC
496    L     M    M     H    H     L    H     L      10601     CC
497    L     M    M     H    H     L    M     H       7276     CC
498    L     M    M     H    H     L    M     H      40409     FC
499    L     M    M     H    H     L    L     M      13591     SC

[125 rows x 10 columns]
     C1_R_H  C1_R_L  C1_NR_L  C1_NR_M  C2_R_H  C2_R_M  C2_NR_H  C2_NR_M  \
375       0       1        1        0       1       0        1        0   
376       0       1        1     

# Decision Tree for Phase 4

In [55]:
# Base learner. The tree is seeded so that fitting is repeatable: the train/test
# split and the hyperparameter search are already seeded, but an unseeded
# DecisionTreeClassifier can still build different trees on identical data,
# which makes the reported accuracy change from run to run.
dtc = DecisionTreeClassifier(random_state=13)

# Wrap the tree so that one classifier is fitted per target column.
multi_output_dtc = MultiOutputClassifier(dtc)

# Search space for the tree. The 'estimator__' prefix routes each setting
# through the MultiOutputClassifier wrapper to the underlying tree.
hyperparameters = {
    'estimator__criterion': ['gini', 'entropy'],
    'estimator__max_depth': [5, 10, 20, 30],
    'estimator__min_samples_split': [2, 4, 6, 10, 15, 20],
    'estimator__max_leaf_nodes': [3, 5, 10, 20, 50, 100]
}

# Try 50 sampled settings, each scored with 5-fold cross-validation on the
# training split only; the sampling itself is seeded.
rsc = RandomizedSearchCV(
    estimator=multi_output_dtc,
    param_distributions=hyperparameters,
    n_iter=50,
    cv=5,
    random_state=13,
)
rsc.fit(X_train, y_train)

# Winning settings and the corresponding fitted model.
best_params = rsc.best_params_
best_model = rsc.best_estimator_

# Predict the held-out test rows with the selected model.
predictions = best_model.predict(X_test)

In [56]:
# Accuracy (in %) of the selected model on the held-out test rows.
phase_4_accuracy = compute_accuracy(predictions, y_test)
print("Test accuracy for PHASE 4 is : ", phase_4_accuracy)

Test accuracy for PHASE 4 is :  CC_R     64.0
CC_NR    64.0
dtype: float64
