In [1]:
# Import Necessary Packages
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline

import os
import sys
# Add a "code" directory, resolved relative to the parent of the working
# directory, to the module search path for the helper modules listed below.
sys.path.append(os.path.join(os.path.abspath(".."), (".."), "code"))
# Helper-module imports, currently disabled:
#from plotting_functions import *
#from utils import *

import altair as alt
from vega_datasets import data
# Simplify working with large datasets in Altair
alt.data_transformers.enable('vegafusion')

# Allow up to 200 characters per column in DataFrame displays so long text values stay readable
pd.set_option("display.max_colwidth", 200)

# Logistic Regression on Multiple Classes in Target

In [2]:
# Load the moments data (first CSV column becomes the index) and keep only
# rows without any missing value.
df = pd.read_csv("data/cleaned_hm.csv", index_col=0)
sample_df = df.dropna(axis=0, how="any")
sample_df.head(5)

Unnamed: 0_level_0,wid,reflection_period,original_hm,cleaned_hm,modified,num_sentence,ground_truth_category,predicted_category
hmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
27676,206,24h,We had a serious talk with some friends of ours who have been flaky lately. They understood and we had a good evening hanging out.,We had a serious talk with some friends of ours who have been flaky lately. They understood and we had a good evening hanging out.,True,2,bonding,bonding
27678,45,24h,I meditated last night.,I meditated last night.,True,1,leisure,leisure
27697,498,24h,My grandmother start to walk from the bed after a long time.,My grandmother start to walk from the bed after a long time.,True,1,affection,affection
27705,5732,24h,I picked my daughter up from the airport and we have a fun and good conversation on the way home.,I picked my daughter up from the airport and we have a fun and good conversation on the way home.,True,1,bonding,affection
27715,2272,24h,when i received flowers from my best friend,when i received flowers from my best friend,True,1,bonding,bonding


In [3]:
# Use short names for the text column and the label column used for modelling.
sample_df = sample_df.rename(
    columns=dict(cleaned_hm="moment", ground_truth_category="target")
)
sample_df

Unnamed: 0_level_0,wid,reflection_period,original_hm,moment,modified,num_sentence,target,predicted_category
hmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
27676,206,24h,We had a serious talk with some friends of ours who have been flaky lately. They understood and we had a good evening hanging out.,We had a serious talk with some friends of ours who have been flaky lately. They understood and we had a good evening hanging out.,True,2,bonding,bonding
27678,45,24h,I meditated last night.,I meditated last night.,True,1,leisure,leisure
27697,498,24h,My grandmother start to walk from the bed after a long time.,My grandmother start to walk from the bed after a long time.,True,1,affection,affection
27705,5732,24h,I picked my daughter up from the airport and we have a fun and good conversation on the way home.,I picked my daughter up from the airport and we have a fun and good conversation on the way home.,True,1,bonding,affection
27715,2272,24h,when i received flowers from my best friend,when i received flowers from my best friend,True,1,bonding,bonding
...,...,...,...,...,...,...,...,...
128726,566,24h,yesterday chat with my brother in video call its was superb chat enjyed well.,yesterday chat with my brother in video call its was superb chat enjoyed well.,False,1,affection,affection
128736,1580,24h,learning how to better hunt for hits on amazon better,learning how to better hunt for hits on amazon better,True,1,achievement,achievement
128746,248,24h,I woke up in the middle of the night and realizing that I still had another four hours of sleep left before having to get up.,I woke up in the middle of the night and realizing that I still had another four hours of sleep left before having to get up.,True,1,enjoy_the_moment,enjoy_the_moment
128758,4428,24h,Yesterday my relations came to my house. That time am very happy to saw them,Yesterday my relations came to my house. That time am very happy to saw them,True,2,affection,affection


In [4]:
# Hold out 30% of the rows for the final evaluation (fixed seed).
train_df, test_df = train_test_split(
    sample_df, test_size=0.3, random_state=123
)

# Text feature and label for each split
X_train = train_df["moment"]
y_train = train_df["target"]
X_test = test_df["moment"]
y_test = test_df["target"]

In [5]:
# Class balance of the training split: relative frequency of each of the 7 target categories
train_df["target"].value_counts(normalize=True)

target
affection           0.342571
achievement         0.300799
bonding             0.127238
enjoy_the_moment    0.105694
leisure             0.090927
nature              0.018307
exercise            0.014463
Name: proportion, dtype: float64

# Compare Multiple Models at Once

In [6]:
# Candidate classifiers to compare, keyed by the label shown in the results table.
models = {
    "dummy": DummyClassifier(random_state=123),
    "Decision Tree": DecisionTreeClassifier(random_state=123),
    "KNN": KNeighborsClassifier(),
    "RBF SVM": SVC(random_state=123),
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(
        max_iter=2000,
        random_state=123,
    ),
}

In [7]:
def build_pipeline(model):
    """
    Chain a bag-of-words vectorizer (English stop words removed) with `model`.

    Parameters
    ----------
    model :
        scikit-learn estimator placed after the vectorizer

    Returns
    ----------
        pipeline made of a CountVectorizer followed by `model`
    """
    vectorizer = CountVectorizer(stop_words="english")
    return make_pipeline(vectorizer, model)

In [8]:
# Code adapted from class demos and lab 2 provided function
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Summarise cross-validation results as "mean (+/- std)" strings.

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    **kwargs :
        forwarded unchanged to `cross_validate` (e.g. `cv`,
        `return_train_score`)

    Returns
    ----------
        pandas Series indexed by the numeric `cross_validate` result keys
        (e.g. fit_time, score_time, test_score); each value is the string
        "<mean> (+/- <std>)" over the folds, both with 4 decimals
    """
    # Build the per-fold table once and reuse it for both statistics.
    scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))

    # numeric_only skips non-numeric outputs such as fitted estimators
    # (return_estimator=True), which cannot be averaged.
    mean_scores = scores.mean(numeric_only=True)
    std_scores = scores.std(numeric_only=True)

    out_col = [
        f"{mean:.4f} (+/- {std:.4f})"
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)

In [9]:
# 5-fold cross-validation summary (with train scores) for every candidate model.
results_dict = {
    name: mean_std_cross_val_scores(
        build_pipeline(estimator),
        X_train,
        y_train,
        cv=5,
        return_train_score=True,
    )
    for name, estimator in models.items()
}

# One row per model, one column per cross-validation metric
results_df = pd.DataFrame(results_dict).T
results_df

Unnamed: 0,fit_time,score_time,test_score,train_score
dummy,0.0375 (+/- 0.0017),0.0086 (+/- 0.0002),0.3426 (+/- 0.0003),0.3426 (+/- 0.0001)
Decision Tree,0.2537 (+/- 0.0146),0.0092 (+/- 0.0003),0.7616 (+/- 0.0083),0.9972 (+/- 0.0004)
KNN,0.0405 (+/- 0.0031),0.1847 (+/- 0.0281),0.6593 (+/- 0.0164),0.7599 (+/- 0.0055)
RBF SVM,3.4132 (+/- 0.0245),0.5459 (+/- 0.1332),0.8010 (+/- 0.0134),0.9302 (+/- 0.0013)
Naive Bayes,0.0440 (+/- 0.0006),0.0088 (+/- 0.0003),0.7595 (+/- 0.0074),0.8605 (+/- 0.0020)
Logistic Regression,0.2795 (+/- 0.0317),0.0098 (+/- 0.0014),0.8240 (+/- 0.0037),0.9588 (+/- 0.0010)


# Logistic Regression Hyperparameter Optimization

In [10]:
# Code adapted from class Lecture 8
# NOTE(review): randint and loguniform are imported here but not used in this cell.
from random import randint
from scipy.stats import loguniform

# Standalone vectorizer fitted on the training text to obtain the full vocabulary;
# within this cell, vocab is referenced only by the commented-out alternative below.
vec = CountVectorizer(stop_words="english")
bow = vec.fit_transform(X_train)
vocab = vec.get_feature_names_out()

# Candidate values: vocabulary sizes 900..5000 in steps of 50, and C = 1e-3 .. 1e5
param_dist = {
    "countvectorizer__max_features": np.arange(900, 5001, 50),  # alternative: randint(10, len(vocab))
    "logisticregression__C": 10.0 ** np.arange(-3, 6, 1),
}

# Pipeline whose hyperparameters are tuned by the search
log_pipe = make_pipeline(
    CountVectorizer(stop_words="english"),
    LogisticRegression(max_iter=2000),
)

In [11]:
# Top 10 RandomizedSearchCV() Results
# random_state fixes which 200 candidates are sampled, so a fresh
# Restart & Run All reproduces the same ranking (same seed as the other cells).
random_search = RandomizedSearchCV(
    log_pipe,
    param_dist,
    n_iter=200,
    n_jobs=-1,
    return_train_score=True,
    random_state=123,
)
random_search.fit(X_train, y_train)

# Show the ten best-ranked candidates with their scores, settings and fit time
pd.DataFrame(random_search.cv_results_)[
    [
        "mean_test_score",
        "mean_train_score",
        "param_logisticregression__C",
        "param_countvectorizer__max_features",
        "mean_fit_time",
        "rank_test_score",
    ]
].set_index("rank_test_score").sort_index().head(10)

Unnamed: 0_level_0,mean_test_score,mean_train_score,param_logisticregression__C,param_countvectorizer__max_features,mean_fit_time
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.822595,0.94819,1.0,3950,0.410103
2,0.822595,0.943891,1.0,3300,0.396409
3,0.822493,0.942778,1.0,3150,0.414395
4,0.822392,0.950769,1.0,4650,0.423442
4,0.822392,0.943006,1.0,3200,0.352313
6,0.822291,0.949353,1.0,4300,0.407199
7,0.822291,0.949429,1.0,4250,0.458037
8,0.822291,0.942045,1.0,3050,0.321581
9,0.822089,0.946091,1.0,3550,0.4079
10,0.821988,0.948645,1.0,4100,0.437394


In [12]:
# Full random-search results table: training scores, CV scores, hyperparameter configurations, and fit times.
pd.DataFrame(random_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_logisticregression__C,param_countvectorizer__max_features,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.198374,0.009217,0.020327,0.009836,0.001,4900,"{'logisticregression__C': 0.001, 'countvectorizer__max_features': 4900}",0.514661,0.523256,0.513404,...,0.517143,0.003425,183,0.522569,0.517385,0.523641,0.523135,0.521871,0.521720,0.002246
1,0.512480,0.046012,0.017005,0.005646,10000.000,2500,"{'logisticregression__C': 10000.0, 'countvectorizer__max_features': 2500}",0.769970,0.779575,0.770865,...,0.772226,0.004881,127,0.994942,0.993299,0.994943,0.993805,0.993426,0.994083,0.000721
2,0.126689,0.014999,0.019316,0.007512,0.001,2650,"{'logisticregression__C': 0.001, 'countvectorizer__max_features': 2650}",0.514661,0.523256,0.512898,...,0.516941,0.003519,188,0.522190,0.516880,0.523515,0.522882,0.521492,0.521392,0.002355
3,0.328026,0.017745,0.024657,0.002959,1.000,2450,"{'logisticregression__C': 1.0, 'countvectorizer__max_features': 2450}",0.824065,0.826593,0.819423,...,0.820875,0.003845,16,0.935643,0.936022,0.935019,0.934134,0.937421,0.935648,0.001093
4,0.137874,0.011578,0.038949,0.038641,0.001,1800,"{'logisticregression__C': 0.001, 'countvectorizer__max_features': 1800}",0.514661,0.522245,0.510875,...,0.516132,0.003684,196,0.521937,0.516247,0.522503,0.521492,0.520354,0.520507,0.002244
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.223888,0.016376,0.020230,0.005117,0.100,2750,"{'logisticregression__C': 0.1, 'countvectorizer__max_features': 2750}",0.796764,0.795248,0.784522,...,0.789824,0.005233,78,0.838918,0.840561,0.840329,0.839191,0.840834,0.839967,0.000767
196,0.603480,0.069991,0.019486,0.002325,10000.000,3400,"{'logisticregression__C': 10000.0, 'countvectorizer__max_features': 3400}",0.777048,0.787664,0.783510,...,0.783150,0.005023,109,0.995954,0.995069,0.995702,0.996081,0.995196,0.995600,0.000403
197,0.296015,0.015902,0.020706,0.003255,1.000,1150,"{'logisticregression__C': 1.0, 'countvectorizer__max_features': 1150}",0.816987,0.814459,0.817906,...,0.815718,0.001520,24,0.905551,0.905930,0.904298,0.905057,0.909102,0.905988,0.001650
198,0.252668,0.027431,0.019881,0.002291,0.100,3950,"{'logisticregression__C': 0.1, 'countvectorizer__max_features': 3950}",0.798787,0.795248,0.787557,...,0.790936,0.005261,62,0.842458,0.843722,0.843869,0.841846,0.845006,0.843380,0.001114


In [13]:
# Keep the search's best_estimator_ pipeline; the cells below use it for
# coefficient inspection and predictions.
best_model = random_search.best_estimator_
best_model

0,1,2
,steps,"[('countvectorizer', ...), ('logisticregression', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,np.float64(1.0)
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,2000


# Interpret Logistic Regression Results on Features, Test Score

In [14]:
# Vocabulary learned by the vectorizer step of the best pipeline
feature_names = best_model["countvectorizer"].get_feature_names_out()

# Coefficient matrix and class labels of the logistic-regression step
coeffs = best_model["logisticregression"].coef_
classes = best_model["logisticregression"].classes_

In [15]:
def top_words_for_class(class_label, top_n=5):
    """
    Find the words with the largest and smallest coefficients for one class.

    Reads the notebook-level `classes`, `coeffs` and `feature_names`.

    Parameters
    ----------
    class_label : str
        Target class to inspect; must be one of `classes`
    top_n : int
        How many words to return on each side, default is 5

    Returns
    ----------
        tuple (top_positive, top_negative) of DataFrames with columns
        "word" and "coefficient": the `top_n` largest coefficients in
        descending order and the `top_n` smallest in ascending order
    """
    # Row of the coefficient matrix that belongs to the requested class
    row = list(classes).index(class_label)

    word_coeff_df = pd.DataFrame(
        {"word": feature_names, "coefficient": coeffs[row]}
    )

    by_descending = word_coeff_df.sort_values(by="coefficient", ascending=False)
    by_ascending = word_coeff_df.sort_values(by="coefficient", ascending=True)

    return by_descending.head(top_n), by_ascending.head(top_n)

In [16]:
# Strongest positive / negative words for two of the classes
affection_pos, affection_neg = top_words_for_class("affection")
exercise_pos, exercise_neg = top_words_for_class("exercise")

# Each call prints a heading line followed by the matching table
print("Top 5 positive words for 'affection':", affection_pos, sep="\n")
print("\nTop 5 negative words for 'affection':", affection_neg, sep="\n")
print("\nTop 5 positive words for 'exercise':", exercise_pos, sep="\n")
print("\nTop 5 negative words for 'exercise':", exercise_neg, sep="\n")

Top 5 positive words for 'affection':
          word  coefficient
1703   husband     4.759748
3866      wife     4.461252
3267       son     4.412922
997   daughter     4.304264
1314    family     4.066285

Top 5 negative words for 'affection':
               word  coefficient
3698  unforgettable    -1.595852
415         bicycle    -1.177781
550        business    -1.129645
1249           exam    -1.122188
2314      neighbors    -1.039083

Top 5 positive words for 'exercise':
          word  coefficient
1582       gym     3.680555
3904   workout     3.452050
1261  exercise     2.431691
3033       run     2.323682
3937      yoga     2.091622

Top 5 negative words for 'exercise':
       word  coefficient
1523    got    -0.819534
1850   just    -0.745669
575    came    -0.742370
1667   home    -0.738639
2258  movie    -0.615420


In [17]:
# Plain Python lists of the top 'affection' words (positive, then negative)
positive5_affection = affection_pos["word"].to_list()
negative5_affection = affection_neg["word"].to_list()
positive5_affection

['husband', 'wife', 'son', 'daughter', 'family']

In [18]:
# Display the words with the most negative 'affection' coefficients
negative5_affection

['unforgettable', 'bicycle', 'business', 'exam', 'neighbors']

In [19]:
# Plain Python lists of the top 'exercise' words (positive, then negative)
positive5_exercise = exercise_pos["word"].to_list()
negative5_exercise = exercise_neg["word"].to_list()
positive5_exercise

['gym', 'workout', 'exercise', 'run', 'yoga']

In [20]:
# Display the words with the most negative 'exercise' coefficients
negative5_exercise

['got', 'just', 'came', 'home', 'movie']

### Final Evaluation on Test Data

In [21]:
# Best mean cross-validation score found by the random search
random_search_best_score = random_search.best_score_
# Score of the fitted search object on the training split and on the held-out test split
train_score = random_search.score(X_train, y_train)
test_score = random_search.score(X_test, y_test)

In [22]:
# Report the cross-validation, train and test scores together for comparison
print("Random Search CrossValidate Best Score: ", random_search_best_score)
print("Train Score using Best Model: ", train_score)
print("Test Score using Best Model: ", test_score)

Random Search CrossValidate Best Score:  0.8225948253243953
Train Score using Best Model:  0.945079397188227
Test Score using Best Model:  0.816658801321378


### Evaluation using Probability Scores

In [23]:
# Predicted class probabilities for every test moment
highest_probs = best_model.predict_proba(X_test)

# Column positions of the two classes of interest
class_labels = best_model.named_steps["logisticregression"].classes_
achievement_index = list(class_labels).index("achievement")
nature_index = list(class_labels).index("nature")

# Row (test moment) with the highest probability for each of those classes
achievement_best_idx = highest_probs[:, achievement_index].argmax()
nature_best_idx = highest_probs[:, nature_index].argmax()

In [24]:
# Text and probability of the test moment most confidently scored as 'achievement'
achievement_msg = X_test.iloc[achievement_best_idx]  # str
achievement_prob = highest_probs[achievement_best_idx][achievement_index]  # numpy.float64

In [25]:
# Text and probability of the test moment most confidently scored as 'nature'
nature_msg = X_test.iloc[nature_best_idx]  # str
nature_prob = highest_probs[nature_best_idx][nature_index]  # numpy.float64

In [26]:
# One print call per class: heading, message text, then the probability
print(
    "Most confident 'achievement' moment:",
    f"Message: {achievement_msg}",
    f"Predicted probability: {achievement_prob}\n",
    sep="\n",
)

print(
    "Most confident 'nature' moment:",
    f"Message: {nature_msg}",
    f"Predicted probability: {nature_prob}",
    sep="\n",
)

Most confident 'achievement' moment:
Message: An event that made me happy in the past 24 hours was when I was able to get a medical bill written off completely. The hospital had not followed proper procedures to get insurance approval for the treatment, which was a violation of their contract with the insurance company. Because of this the insurance company denied the claim and stated that I did not have to pay the bill. After spending 2.5 hours on the phone with the hospital and insurance company today it was concluded that the insurance company was correct and I did not have to pay the money.
Predicted probability: 0.9999999602176473

Most confident 'nature' moment:
Message: The weather was so beautiful that I was able to go outside and plant flowers and enjoy the sun without melting. 
Predicted probability: 0.9937475186719976


# Test Sample Moments

In [27]:
# Hand-written example moments to run through the fitted model
test_moments = [
    "I just finished my last assignment!",
    "On the weekend, I spent some quality time with my best friend.",
    "Collaborating with peers and teaching team members is what makes MDS enjoyable!!",
    "I went for a hike in the forest.",
    "I did yoga this morning.",
    "I am still breathing and I am alive!",
]

In [28]:
# Predicted label and per-class probabilities for each sample moment
predicted_labels = best_model.predict(test_moments)
predicted_probs = best_model.predict_proba(test_moments)
class_labels = best_model.named_steps["logisticregression"].classes_

for moment, label, probs in zip(test_moments, predicted_labels, predicted_probs):
    print(f"\nMoment: {moment}")
    print(f"Predicted class: {label}")
    print("Class probabilities:")
    # Probabilities are listed in the same order as class_labels
    for cls, prob in zip(class_labels, probs):
        print(f"  {cls}: {prob:.4f}")


Moment: I just finished my last assignment!
Predicted class: achievement
Class probabilities:
  achievement: 0.9426
  affection: 0.0195
  bonding: 0.0026
  enjoy_the_moment: 0.0171
  exercise: 0.0023
  leisure: 0.0141
  nature: 0.0018

Moment: On the weekend, I spent some quality time with my best friend.
Predicted class: bonding
Class probabilities:
  achievement: 0.0011
  affection: 0.0249
  bonding: 0.9693
  enjoy_the_moment: 0.0010
  exercise: 0.0006
  leisure: 0.0027
  nature: 0.0003

Moment: Collaborating with peers and teaching team members is what makes MDS enjoyable!!
Predicted class: achievement
Class probabilities:
  achievement: 0.3647
  affection: 0.0951
  bonding: 0.2831
  enjoy_the_moment: 0.1511
  exercise: 0.0084
  leisure: 0.0787
  nature: 0.0190

Moment: I went for a hike in the forest.
Predicted class: nature
Class probabilities:
  achievement: 0.1306
  affection: 0.1075
  bonding: 0.0521
  enjoy_the_moment: 0.0635
  exercise: 0.1067
  leisure: 0.2603
  nature: 0.2