In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix, classification_report, f1_score, roc_auc_score
from plotnine import *
from sklearn.preprocessing import OneHotEncoder, label_binarize, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.svm import SVC

# Part One: Binary Classification

In [None]:
# Read the strain table and keep complete cases only, so every subset
# derived from `data` later starts without missing values.
data = pd.read_csv("/content/cannabis_full.csv").dropna()

In [None]:
# Binary task: keep only the rows whose Type is sativa or indica.
is_sativa_or_indica = data['Type'].isin(['sativa', 'indica'])
sativa_indica_data = data.loc[is_sativa_or_indica]

In [None]:
# Drop any row that still contains a missing value. `data` was already passed
# through dropna() when it was loaded, so this is a safeguard.
sativa_indica_data = sativa_indica_data.dropna(axis=0)

In [None]:
# Label-based slice: every column from 'Creative' through 'Mouth', both ends
# included, in the order they appear in the frame.
encoded_block = sativa_indica_data.loc[:, 'Creative':'Mouth']
columns_to_encode = list(encoded_block.columns)

In [None]:
# One-hot encode the selected columns. Categories unseen at fit time are
# ignored at transform time rather than raising.
enc = OneHotEncoder(handle_unknown='ignore')

# Every column outside `columns_to_encode` is passed through unchanged.
dummify_step = ('dummify', enc, columns_to_encode)
ct = ColumnTransformer(transformers=[dummify_step], remainder='passthrough')

In [None]:
# Features for the sativa-vs-indica task: everything except the label and the
# free-text / identifier columns.
X = (
    sativa_indica_data
    .drop(columns=['Type', 'Strain', 'Effects', 'Flavor'])
    # Later in the notebook `data` gains boolean 'Indica'/'Sativa'/'Hybrid'
    # label columns. If the subset is rebuilt and this cell is re-run after
    # that point, those columns would pass through the ColumnTransformer and
    # leak the label into X, so drop them whenever they are present.
    .drop(columns=['Indica', 'Sativa', 'Hybrid'], errors='ignore')
)
# Target: the strain type ('sativa' or 'indica').
y = sativa_indica_data['Type']

### LDA

In [None]:
# Preprocessing (column transformer) followed by linear discriminant analysis.
lda_pipeline = Pipeline(steps=[
    ('dummify', ct),
    ('lda', LinearDiscriminantAnalysis()),
])

In [None]:
# Fit the LDA pipeline on all sativa/indica rows. `ldapredict` holds whatever
# `fit` returns (for an sklearn Pipeline that is the fitted pipeline itself).
ldapredict = lda_pipeline.fit(X, y)

In [None]:
# Predict on the same rows used for fitting, so these are in-sample predictions.
ypreds = ldapredict.predict(X)
ypreds

array(['sativa', 'sativa', 'indica', ..., 'indica', 'indica', 'indica'],
      dtype='<U6')

In [None]:
# Training (in-sample) accuracy: computed on the rows the model was fitted on,
# so it is an optimistic estimate of out-of-sample performance.
accuracy = accuracy_score(y, ypreds)
accuracy

0.8694096601073346

In [None]:
# 5-fold cross-validation of the whole pipeline: per-fold scores, then their mean.
lda_cv_scores = cross_val_score(lda_pipeline, X, y, cv=5)
lda_mean_accuracy = np.mean(lda_cv_scores)

print(lda_cv_scores)
print(lda_mean_accuracy)

[0.84821429 0.84821429 0.82589286 0.84753363 0.84304933]
0.8425808776425369


In [None]:
# Confusion matrix of the in-sample predictions
# (sklearn convention: rows = true label, columns = predicted label).
confusion_matrix(y, ypreds)

array([[627,  60],
       [ 86, 345]])

### QDA

In [None]:
# Same preprocessing as the LDA pipeline, with a quadratic discriminant classifier.
qda_steps = [
    ('dummify', ct),
    ('qda', QuadraticDiscriminantAnalysis()),
]
qda_pipeline = Pipeline(steps=qda_steps)

In [None]:
qdapredict = qda_pipeline.fit(X, y)



In [None]:
ypreds_qda = qdapredict.predict(X)
ypreds_qda

array(['sativa', 'indica', 'sativa', ..., 'indica', 'sativa', 'sativa'],
      dtype=object)

In [None]:
accuracy = accuracy_score(y, ypreds_qda)
accuracy

0.5858676207513417

In [None]:
# 5-fold cross-validation of the QDA pipeline: per-fold scores, then their mean.
qda_cv_scores = cross_val_score(qda_pipeline, X, y, cv=5)
qda_mean_accuracy = np.mean(qda_cv_scores)

print(qda_cv_scores)
print(qda_mean_accuracy)



[0.67857143 0.61607143 0.55803571 0.53811659 0.70852018]
0.619863068545804


In [None]:
confusion_matrix(y, ypreds_qda)

array([[244, 443],
       [ 20, 411]])

### SVC

In [None]:
# Preprocessing followed by a support vector classifier with a linear kernel.
# This pipeline object is reused (re-fitted) by later cells in the notebook.
svc_pipeline = Pipeline(steps=[
    ('dummify', ct),
    ('svec', SVC(kernel='linear')),
])

In [None]:
svcpredict = svc_pipeline.fit(X, y)

In [None]:
ypreds_svc = svcpredict.predict(X)
ypreds_svc

array(['sativa', 'indica', 'indica', ..., 'indica', 'indica', 'indica'],
      dtype=object)

In [None]:
accuracy = accuracy_score(y, ypreds_svc)
accuracy

0.8792486583184258

In [None]:
# 5-fold cross-validation of the linear SVC pipeline: per-fold scores, then their mean.
svc_cv_scores = cross_val_score(svc_pipeline, X, y, cv=5)
svc_mean_accuracy = np.mean(svc_cv_scores)

print(svc_cv_scores)
print(svc_mean_accuracy)

[0.85267857 0.85714286 0.82589286 0.83856502 0.85650224]
0.8461563100576553


In [None]:
confusion_matrix(y, ypreds_svc)

array([[632,  55],
       [ 80, 351]])

### SVM

In [None]:
# Same workflow as the linear SVC above, but with a polynomial kernel.
svm_pipeline = Pipeline(steps=[
    ('dummify', ct),
    ('svm', SVC(kernel='poly')),
])

# Fit on all rows and score the in-sample predictions.
ypreds_svm = svm_pipeline.fit(X, y).predict(X)
accuracy_svm = accuracy_score(y, ypreds_svm)
conf_matrix_svm = confusion_matrix(y, ypreds_svm)

# 5-fold cross-validation re-fits a clone of the pipeline on each training fold.
svm_cv_scores = cross_val_score(svm_pipeline, X, y, cv=5)

# Report: CV scores first, then the in-sample accuracy and confusion matrix.
print(svm_cv_scores)
print(f"SVM Accuracy: {accuracy_svm}")
print("SVM Confusion Matrix:")
print(conf_matrix_svm)


[0.85267857 0.85267857 0.84375    0.84304933 0.88789238]
SVM Accuracy: 0.8962432915921288
SVM Confusion Matrix:
[[643  44]
 [ 72 359]]


# Part Two: Natural Multiclass


In [None]:
# Features for the three-class problem: everything except the label and the
# free-text / identifier columns.
X = (
    data
    .drop(columns=['Type', 'Strain', 'Effects', 'Flavor'])
    # Later in the notebook `data` gains boolean 'Indica'/'Sativa'/'Hybrid'
    # label columns. If this cell is re-run after that point, those columns
    # would pass through the ColumnTransformer and leak the label into X,
    # so drop them whenever they are present.
    .drop(columns=['Indica', 'Sativa', 'Hybrid'], errors='ignore')
)
# Target: the strain type (all three classes).
y = data['Type']

### Q1
Fit a decision tree, plot the final fit, and interpret the results.

In [None]:
# Preprocessing followed by a decision tree capped at depth 5.
dt_pipeline = Pipeline(steps=[
    ('dummify', ct),
    ('dt', DecisionTreeClassifier(max_depth=5)),
])

# Fit on all rows and score the in-sample predictions.
ypreds_dt = dt_pipeline.fit(X, y).predict(X)
accuracy_dt = accuracy_score(y, ypreds_dt)
conf_matrix_dt = confusion_matrix(y, ypreds_dt)

print(f"Decision Tree Accuracy: {accuracy_dt}")
print("Decision Tree Confusion Matrix:")
print(conf_matrix_dt)


Decision Tree Accuracy: 0.6559652928416486
Decision Tree Confusion Matrix:
[[883 194 110]
 [221 455  11]
 [236  21 174]]


In [None]:
# 5-fold cross-validated accuracy of the decision-tree pipeline.
dt_cv_scores = cross_val_score(estimator=dt_pipeline, X=X, y=y, cv=5)
print(dt_cv_scores)

[0.59652928 0.60954447 0.59869848 0.62906725 0.62255965]


Across the five cross-validation folds, this depth-5 decision tree classifies the 3 types with roughly 60–63% accuracy (about 66% on the training data).

### Q2
Repeat the analyses from Part One for LDA, QDA, and KNN.

LDA

In [None]:
# Fresh LDA pipeline for the three-class problem (same steps as in Part One).
lda_pipeline = Pipeline(steps=[
    ('dummify', ct),
    ('lda', LinearDiscriminantAnalysis()),
])

In [None]:
ldapredict = lda_pipeline.fit(X, y)

In [None]:
ypreds = ldapredict.predict(X)
ypreds

array(['hybrid', 'hybrid', 'hybrid', ..., 'indica', 'indica', 'indica'],
      dtype='<U6')

In [None]:
accuracy = accuracy_score(y, ypreds)
accuracy

0.6429501084598699

In [None]:
# 5-fold cross-validation of the three-class LDA pipeline: per-fold scores, then their mean.
lda_cv_scores = cross_val_score(lda_pipeline, X, y, cv=5)
lda_mean_accuracy = np.mean(lda_cv_scores)

print(lda_cv_scores)
print(lda_mean_accuracy)

[0.61388286 0.62689805 0.61822126 0.64859002 0.63774403]
0.6290672451193059


In [None]:
confusion_matrix(y, ypreds)

array([[829, 211, 147],
       [211, 467,   9],
       [224,  21, 186]])

QDA

In [None]:
# Fresh QDA pipeline for the three-class problem (same steps as in Part One).
qda_steps = [
    ('dummify', ct),
    ('qda', QuadraticDiscriminantAnalysis()),
]
qda_pipeline = Pipeline(steps=qda_steps)

In [None]:
qdapredict = qda_pipeline.fit(X, y)



In [None]:
ypreds_qda = qdapredict.predict(X)
ypreds_qda

array(['sativa', 'hybrid', 'hybrid', ..., 'indica', 'sativa', 'sativa'],
      dtype=object)

In [None]:
accuracy = accuracy_score(y, ypreds_qda)
accuracy

0.3110629067245119

In [None]:
# 5-fold cross-validation of the three-class QDA pipeline: per-fold scores, then their mean.
qda_cv_scores = cross_val_score(qda_pipeline, X, y, cv=5)
qda_mean_accuracy = np.mean(qda_cv_scores)

print(qda_cv_scores)
print(qda_mean_accuracy)



[0.37093275 0.32321041 0.32321041 0.35357918 0.39479393]
0.35314533622559646


In [None]:
confusion_matrix(y, ypreds_qda)

array([[207, 109, 871],
       [ 90, 144, 453],
       [ 54,  11, 366]])

KNN

In [None]:
# Preprocessing followed by a 3-nearest-neighbours classifier.
knn_pipeline = Pipeline(steps=[
    ('dummify', ct),
    ('knc', KNeighborsClassifier(n_neighbors=3)),
])

In [None]:
knnpredict = knn_pipeline.fit(X, y)

In [None]:
ypreds_knn = knnpredict.predict(X)
ypreds_knn

array(['hybrid', 'hybrid', 'hybrid', ..., 'indica', 'indica', 'indica'],
      dtype=object)

In [None]:
accuracy = accuracy_score(y, ypreds_knn)
accuracy

0.7557483731019523

In [None]:
# 5-fold cross-validation of the KNN pipeline: per-fold scores, then their mean.
knn_cv_scores = cross_val_score(knn_pipeline, X, y, cv=5)
knn_mean_accuracy = np.mean(knn_cv_scores)

print(knn_cv_scores)
print(knn_mean_accuracy)

[0.59436009 0.58785249 0.54880694 0.56399132 0.57483731]
0.5739696312364425


In [None]:
confusion_matrix(y, ypreds_knn)

array([[983, 119,  85],
       [171, 508,   8],
       [160,  20, 251]])

### Q3
Were your metrics better or worse than in Part One? Why? Which categories were most likely to get mixed up, according to the confusion matrices? Why?

It appears as though both QDA and LDA performed much worse than in Part One, while KNN did relatively well. The models were unable to correctly distinguish hybrid strains of cannabis. According to the confusion matrices, QDA and LDA incorrectly identified some of the original (indica and sativa) strains as hybrid, and also incorrectly classified hybrid strains as indica or sativa.

# Part Three: Multiclass from Binary
Consider two models designed for binary classification: SVC and Logistic Regression.



### Q1
Fit and report metrics for OvR versions of the models. That is, for each of the two model types, create three models:

Indica vs. Not Indica

Sativa vs. Not Sativa

Hybrid vs. Not Hybrid

Logistic Regression

In [None]:
# Add one boolean one-vs-rest target column per strain type:
# 'Indica', 'Sativa', 'Hybrid' (True where Type equals that label).
for type_label in ("indica", "sativa", "hybrid"):
    data[type_label.capitalize()] = data['Type'] == type_label

In [None]:
# Features: drop the label, the free-text / identifier columns, and all three
# one-vs-rest indicator columns so no target information remains in X.
non_feature_cols = ['Type', 'Strain', 'Effects', 'Flavor', 'Indica', 'Sativa', 'Hybrid']
X = data.drop(columns=non_feature_cols)
# First one-vs-rest target: indica vs. not indica.
y = data.loc[:, 'Indica']

In [None]:
# Preprocessing followed by logistic regression.
# With the default iteration limit the solver stops before converging on this
# data (each fit prints "TOTAL NO. of ITERATIONS REACHED LIMIT"), so the limit
# is raised. NOTE(review): 1000 is assumed to be enough here — increase it if
# the warning still appears.
log_pipeline = Pipeline(
    [('dummify', ct), ('lg', LogisticRegression(max_iter=1000))]
)

In [None]:
logpredict = log_pipeline.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
ypreds_log = logpredict.predict(X)
ypreds_log

array([False, False, False, ...,  True,  True,  True])

In [None]:
accuracy = accuracy_score(y, ypreds_log)
accuracy

0.8082429501084599

In [None]:
confusion_matrix(y, ypreds_log)

array([[1434,  184],
       [ 258,  429]])

In [None]:
f1 = f1_score(y, ypreds_log)
f1

0.66

In [None]:
y = data['Sativa']

In [None]:
logpredict = log_pipeline.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
ypreds_log = logpredict.predict(X)
ypreds_log

array([False, False, False, ..., False, False, False])

In [None]:
accuracy = accuracy_score(y, ypreds_log)
accuracy

0.8351409978308026

In [None]:
confusion_matrix(y, ypreds_log)

array([[1784,   90],
       [ 290,  141]])

In [None]:
f1 = f1_score(y, ypreds_log)
f1

0.4259818731117824

In [None]:
y = data['Hybrid']

In [None]:
logpredict = log_pipeline.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
ypreds_log = logpredict.predict(X)
ypreds_log

array([ True,  True,  True, ..., False, False, False])

In [None]:
accuracy = accuracy_score(y, ypreds_log)
accuracy

0.6477223427331887

In [None]:
confusion_matrix(y, ypreds_log)

array([[649, 469],
       [343, 844]])

In [None]:
f1 = f1_score(y, ypreds_log)
f1

0.6752000000000001

SVC

In [None]:
y = data['Indica']

In [None]:
svcpredict = svc_pipeline.fit(X, y)

In [None]:
ypreds_svc = svcpredict.predict(X)
ypreds_svc

array([False, False, False, ...,  True,  True,  True])

In [None]:
accuracy = accuracy_score(y, ypreds_svc)
accuracy

0.7908893709327549

In [None]:
confusion_matrix(y, ypreds_svc)

array([[1362,  256],
       [ 226,  461]])

In [None]:
f1 = f1_score(y, ypreds_svc)
f1

0.6566951566951567

In [None]:
y = data['Sativa']

In [None]:
svcpredict = svc_pipeline.fit(X, y)

In [None]:
ypreds_svc = svcpredict.predict(X)
ypreds_svc

array([False, False, False, ..., False, False, False])

In [None]:
accuracy = accuracy_score(y, ypreds_svc)
accuracy

0.8147505422993493

In [None]:
confusion_matrix(y, ypreds_svc)

array([[1874,    0],
       [ 427,    4]])

In [None]:
f1 = f1_score(y, ypreds_svc)
f1

0.01839080459770115

In [None]:
y = data['Hybrid']

In [None]:
svcpredict = svc_pipeline.fit(X, y)

In [None]:
ypreds_svc = svcpredict.predict(X)
ypreds_svc

array([ True,  True,  True, ..., False, False, False])

In [None]:
accuracy = accuracy_score(y, ypreds_svc)
accuracy

0.6329718004338395

In [None]:
confusion_matrix(y, ypreds_svc)

array([[516, 602],
       [244, 943]])

In [None]:
f1 = f1_score(y, ypreds_svc)
f1

0.6903367496339678

### Q2
Which of the six models did the best job distinguishing the target category from the rest? Which did the worst? Does this make intuitive sense?

Of these six models, the best at distinguishing the target category from the rest was the logistic model for Sativa, while the worst was SVC for Sativa. It does make some sense to me that one of the non-hybrid strains would be easiest to predict. I expected one of the hybrid models to do worse.

### Q3
Fit and report metrics for OvO versions of the models. That is, for each of the two model types, create three models:

Indica vs. Sativa

Indica vs. Hybrid

Hybrid vs. Sativa

In [None]:
# Pairwise (one-vs-one) subset: indica and sativa rows only.
is_indica_or_sativa = data['Type'].isin(['indica', 'sativa'])
indica_sativa_only = data.loc[is_indica_or_sativa]

Logistic Regression

In [None]:
# Indica vs. sativa: features exclude the label, text columns and indicator columns.
non_feature_cols = ['Type', 'Strain', 'Effects', 'Flavor', 'Indica', 'Sativa', 'Hybrid']
X = indica_sativa_only.drop(columns=non_feature_cols)
# Target is True for indica rows, False for sativa rows.
y = indica_sativa_only.loc[:, 'Indica']

In [None]:
logpredict = log_pipeline.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
ypreds_log = logpredict.predict(X)
ypreds_log

array([False, False,  True, ...,  True,  True,  True])

In [None]:
accuracy = accuracy_score(y, ypreds_log)
accuracy

0.8711985688729875

In [None]:
confusion_matrix(y, ypreds_log)

array([[355,  76],
       [ 68, 619]])

In [None]:
# ROC AUC ranks observations by a continuous score. Passing the hard
# True/False predictions collapses the curve to a single threshold, so use the
# predicted probability of the positive class instead (column 1 corresponds to
# True, because classes_ is sorted as [False, True]).
roc_auc_score(y, logpredict.predict_proba(X)[:, 1])

0.8623424080622228

In [None]:
# Pairwise (one-vs-one) subset: indica and hybrid rows only.
is_indica_or_hybrid = data['Type'].isin(['indica', 'hybrid'])
indica_hybrid_only = data.loc[is_indica_or_hybrid]

In [None]:
# Indica vs. hybrid: features exclude the label, text columns and indicator columns.
non_feature_cols = ['Type', 'Strain', 'Effects', 'Flavor', 'Indica', 'Sativa', 'Hybrid']
X = indica_hybrid_only.drop(columns=non_feature_cols)
# Target is True for indica rows, False for hybrid rows.
y = indica_hybrid_only.loc[:, 'Indica']

In [None]:
logpredict = log_pipeline.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
ypreds_log = logpredict.predict(X)
ypreds_log

array([False, False, False, ...,  True,  True,  True])

In [None]:
accuracy = accuracy_score(y, ypreds_log)
accuracy

0.7732123799359658

In [None]:
confusion_matrix(y, ypreds_log)

array([[1001,  186],
       [ 239,  448]])

In [None]:
# ROC AUC ranks observations by a continuous score. Passing the hard
# True/False predictions collapses the curve to a single threshold, so use the
# predicted probability of the positive class instead (column 1 corresponds to
# True, because classes_ is sorted as [False, True]).
roc_auc_score(y, logpredict.predict_proba(X)[:, 1])

0.7477065345218518

In [None]:
# Pairwise (one-vs-one) subset: sativa and hybrid rows only.
is_sativa_or_hybrid = data['Type'].isin(['sativa', 'hybrid'])
sativa_hybrid_only = data.loc[is_sativa_or_hybrid]

In [None]:
# Sativa vs. hybrid: features exclude the label, text columns and indicator columns.
non_feature_cols = ['Type', 'Strain', 'Effects', 'Flavor', 'Indica', 'Sativa', 'Hybrid']
X = sativa_hybrid_only.drop(columns=non_feature_cols)
# Target is True for sativa rows, False for hybrid rows.
y = sativa_hybrid_only.loc[:, 'Sativa']

In [None]:
logpredict = log_pipeline.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
ypreds_log = logpredict.predict(X)
ypreds_log

array([False, False, False, ...,  True, False, False])

In [None]:
accuracy = accuracy_score(y, ypreds_log)
accuracy

0.7669962917181706

In [None]:
confusion_matrix(y, ypreds_log)

array([[1090,   97],
       [ 280,  151]])

In [None]:
# ROC AUC ranks observations by a continuous score. Passing the hard
# True/False predictions collapses the curve to a single threshold, so use the
# predicted probability of the positive class instead (column 1 corresponds to
# True, because classes_ is sorted as [False, True]).
roc_auc_score(y, logpredict.predict_proba(X)[:, 1])

0.6343147047383

SVC

In [None]:
# Reset X / y to the indica-vs-sativa subset before fitting the SVC pipeline.
non_feature_cols = ['Type', 'Strain', 'Effects', 'Flavor', 'Indica', 'Sativa', 'Hybrid']
X = indica_sativa_only.drop(columns=non_feature_cols)
# Target is True for indica rows, False for sativa rows.
y = indica_sativa_only.loc[:, 'Indica']

In [None]:
svcpredict = svc_pipeline.fit(X, y)

In [None]:
ypreds_svc = svcpredict.predict(X)
ypreds_svc

array([False,  True,  True, ...,  True,  True,  True])

In [None]:
accuracy = accuracy_score(y, ypreds_svc)
accuracy

0.8792486583184258

In [None]:
confusion_matrix(y, ypreds_svc)

array([[351,  80],
       [ 55, 632]])

In [None]:
f1 = f1_score(y, ypreds_svc)
f1

0.9035025017869907

In [None]:
# Reset X / y to the indica-vs-hybrid subset before fitting the SVC pipeline.
non_feature_cols = ['Type', 'Strain', 'Effects', 'Flavor', 'Indica', 'Sativa', 'Hybrid']
X = indica_hybrid_only.drop(columns=non_feature_cols)
# Target is True for indica rows, False for hybrid rows.
y = indica_hybrid_only.loc[:, 'Indica']

In [None]:
svcpredict = svc_pipeline.fit(X, y)

In [None]:
ypreds_svc = svcpredict.predict(X)
ypreds_svc

array([False, False, False, ...,  True,  True,  True])

In [None]:
accuracy = accuracy_score(y, ypreds_svc)
accuracy

0.7572038420490929

In [None]:
confusion_matrix(y, ypreds_svc)

array([[958, 229],
       [226, 461]])

In [None]:
f1 = f1_score(y, ypreds_svc)
f1

0.6695715323166304

In [None]:
# Reset X / y to the sativa-vs-hybrid subset before fitting the SVC pipeline.
non_feature_cols = ['Type', 'Strain', 'Effects', 'Flavor', 'Indica', 'Sativa', 'Hybrid']
X = sativa_hybrid_only.drop(columns=non_feature_cols)
# Target is True for sativa rows, False for hybrid rows.
y = sativa_hybrid_only.loc[:, 'Sativa']

In [None]:
svcpredict = svc_pipeline.fit(X, y)

In [None]:
ypreds_svc = svcpredict.predict(X)
ypreds_svc

array([False, False, False, ...,  True, False, False])

In [None]:
accuracy = accuracy_score(y, ypreds_svc)
accuracy

0.7713226205191595

In [None]:
confusion_matrix(y, ypreds_svc)

array([[1122,   65],
       [ 305,  126]])

In [None]:
f1 = f1_score(y, ypreds_svc)
f1

0.4051446945337621

### Q4
Which of the six models did the best job at differentiating the two groups? Which did the worst? Does this make intuitive sense?

Of the six models, the model that did the best job differentiating between the two groups was SVC for Indica & sativa, meanwhile the worst model was SVC for sativa & hybrid. After additional analysis, this makes sense for both because sativa & indica are the most different, meanwhile sativa & hybrid have more similar features.

### Q5
Suppose you had simply input the full data, with three classes, into the LogisticRegression function. Would this have automatically taken an “OvO” approach or an “OvR” approach?

What about for SVC?

Note: You do not actually have to run code here - you only need to look at sklearn’s documentation to see how these functions handle multiclass input.

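If I ran all 3 classes in the logistic regression function, it would not have taken the "OvO" approach. Historically (and with the `liblinear` solver) it automatically takes the "OvR" approach, although with the default `lbfgs` solver in scikit-learn 0.22 and later it fits a single multinomial (softmax) model instead.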

If I ran all 3 classes in SVC, it would have automatically taken the "OVO" approach.