# Init

In [1]:
import pandas as pd
import numpy as np

#random seed for reproducibility
np.random.seed(67)

# load historical data

In [3]:
historical_data_raw = pd.read_csv('Copy of historical_emotion outputs full v1 enhanced.csv')
historical_data_scored = pd.read_csv('/Users/alexanderliss/Library/CloudStorage/GoogleDrive-aliss@bluestate.co/My Drive/0_AI/MSF/MSF_full_data_with_topics_and_factors - data_with_topics_and_factors.csv')

In [4]:
training_data = historical_data_raw.copy()

In [5]:
training_data['top_factor_label'] = historical_data_scored.top_factor_label

In [6]:
training_data.columns

Index(['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
       'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
       'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
       'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
       'relief', 'remorse', 'sadness', 'surprise', 'neutral', 'top_value',
       'top_label', 'top_factor_label'],
      dtype='object')

In [7]:
training_data.drop(columns=['top_value','top_label'], inplace=True)

# Load recent data

In [9]:
new_data = pd.read_csv('Copy of Q4 2022 emotion outputs full v1_update.csv')

In [10]:
new_data.columns

Index(['Unnamed: 0', 'admiration', 'amusement', 'anger', 'annoyance',
       'approval', 'caring', 'confusion', 'curiosity', 'desire',
       'disappointment', 'disapproval', 'disgust', 'embarrassment',
       'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love',
       'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse',
       'sadness', 'surprise', 'neutral', 'emo value_original',
       'emo_class_original'],
      dtype='object')

In [11]:
new_data.drop(columns=['Unnamed: 0','emo value_original','emo_class_original'], inplace=True)

# Load SKlearn model

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline

## test train split and check accuracy of model with historical data

In [13]:
X = training_data.drop(columns=['top_factor_label'])
y = training_data.top_factor_label

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [15]:
clf = make_pipeline(StandardScaler(),
                    SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None))

In [16]:
clf.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('sgdclassifier',
                 SGDClassifier(alpha=0.001, max_iter=5, random_state=42,
                               tol=None))])

In [17]:
y_pred = clf.predict(X_test)

print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred,target_names=y.unique()))

accuracy 0.9574678536102869
               precision    recall  f1-score   support

      Neutral       0.99      0.90      0.94        99
Compassionate       1.00      1.00      1.00        84
      Excited       0.95      0.88      0.92       111
       Urgent       0.95      0.84      0.89        50
    Assertive       1.00      0.98      0.99        46
       Caring       0.97      1.00      0.98       146
  Inquisitive       0.94      0.99      0.96       438
     Grateful       1.00      0.78      0.88        37

     accuracy                           0.96      1011
    macro avg       0.97      0.92      0.95      1011
 weighted avg       0.96      0.96      0.96      1011



### Restitching the data files together here for export

In [20]:
historical_data_with_labels_accurate_jan_10 = pd.concat([historical_data_scored, historical_data_raw], axis=1)
historical_data_with_labels_accurate_jan_10.to_csv('historical_data_with_labels_accurate_jan_10.csv', index=False)

## apply to new data

In [22]:
new_label_predictions = clf.predict(new_data)

In [23]:
pd.Series(new_label_predictions).value_counts()

Neutral          298
Inquisitive       98
Compassionate     75
Caring            40
Excited           15
Assertive         13
Grateful          12
Urgent             8
dtype: int64

In [24]:
new_data['top_factor_label'] = new_label_predictions

# stitching the combined files in here

In [25]:
new_data.columns

Index(['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
       'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
       'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
       'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
       'relief', 'remorse', 'sadness', 'surprise', 'neutral',
       'top_factor_label'],
      dtype='object')

In [27]:
new_data.top_factor_label.head(25)

0           Neutral
1           Neutral
2           Neutral
3           Neutral
4           Neutral
5           Neutral
6           Neutral
7           Neutral
8           Neutral
9           Neutral
10          Neutral
11          Neutral
12          Neutral
13          Neutral
14          Neutral
15          Neutral
16          Neutral
17          Neutral
18        Assertive
19    Compassionate
20    Compassionate
21    Compassionate
22    Compassionate
23    Compassionate
24    Compassionate
Name: top_factor_label, dtype: object

In [28]:
new_data_bq_export = pd.read_csv('q4 2022 data raw.csv')

In [32]:
q4_2022_data_with_labels_accurate_jan_10 = pd.concat([new_data_bq_export, new_data], axis=1)
q4_2022_data_with_labels_accurate_jan_10.to_csv('q4_2022_data_with_labels_accurate_jan_10.csv', index=False)

## subsetting the files based on columns and merging

In [33]:
q4_2022_data_with_labels_accurate_jan_10.columns

Index(['send_dt', 'send_time', 'send_number', 'campaign', 'email_name',
       'category', 'text', 'emails_sent', 'emails_delivered', 'undeliverable',
       'total_clicks', 'unique_clicks', 'unique_opens', 'unsubscribes',
       'unique_complaints', 'total_complaints', 'gifts', 'revenue', 'Audience',
       'send_group', 'Open_Rate_nw', 'Click_Rate_nw', 'Donation_Rate_nw',
       'revenue_1k_new', 'month', 'polarity_score', 'subjectivity_score',
       'processed_text', 'admiration', 'amusement', 'anger', 'annoyance',
       'approval', 'caring', 'confusion', 'curiosity', 'desire',
       'disappointment', 'disapproval', 'disgust', 'embarrassment',
       'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love',
       'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse',
       'sadness', 'surprise', 'neutral', 'top_factor_label'],
      dtype='object')

In [35]:
historical_data_with_labels_accurate_jan_10_trimmed_columns = historical_data_with_labels_accurate_jan_10[list(q4_2022_data_with_labels_accurate_jan_10.columns)]

In [37]:
combined_data_with_labels_accurate_jan_10 = pd.concat([q4_2022_data_with_labels_accurate_jan_10, historical_data_with_labels_accurate_jan_10_trimmed_columns])
combined_data_with_labels_accurate_jan_10.to_csv('combined_data_with_labels_accurate_jan_10.csv', index=False)

In [45]:
combined_data_with_labels_accurate_jan_10.top_factor_label.value_counts()

Neutral          1607
Inquisitive       512
Compassionate     407
Assertive         306
Caring            297
Excited           184
Grateful          181
Urgent            126
Name: top_factor_label, dtype: int64

# testing the labels are okee-dokee

In [38]:
quant_feature_cols = [
    'admiration', 'amusement', 'anger', 'annoyance',
       'approval', 'caring', 'confusion', 'curiosity', 'desire',
       'disappointment', 'disapproval', 'disgust', 'embarrassment',
       'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love',
       'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse',
       'sadness', 'surprise', 'neutral']

In [39]:
X = combined_data_with_labels_accurate_jan_10[quant_feature_cols]
y = combined_data_with_labels_accurate_jan_10.top_factor_label

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [41]:
clf = make_pipeline(StandardScaler(),
                    SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None))

In [42]:
clf.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('sgdclassifier',
                 SGDClassifier(alpha=0.001, max_iter=5, random_state=42,
                               tol=None))])

In [43]:
y_pred = clf.predict(X_test)

print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred,target_names=y.unique()))

accuracy 0.9589958158995816
               precision    recall  f1-score   support

      Neutral       0.97      0.88      0.92        98
    Assertive       1.00      0.96      0.98        89
Compassionate       0.92      0.95      0.94       125
  Inquisitive       1.00      0.78      0.88        60
     Grateful       1.00      1.00      1.00        52
       Caring       0.96      0.99      0.97       178
      Excited       0.95      0.99      0.97       557
       Urgent       1.00      0.89      0.94        36

     accuracy                           0.96      1195
    macro avg       0.97      0.93      0.95      1195
 weighted avg       0.96      0.96      0.96      1195



# build a combined model with old and new preds and pickle it

In [44]:
#https://medium.com/@maziarizadi/pickle-your-model-in-python-2bbe7dba2bbb

import pickle

pickle.dump(clf, open('subject_line_tonality_classifier_jan10.pkl','wb'))

## exporting new master date file

In [46]:
combined_data_with_labels_accurate_jan_10.to_csv('new_master_file_through_EOY_2022.csv', index=False)

# one last sanity check - reloading the new new data file the running the model against it for accuracy

In [52]:
data = pd.read_csv('testing-new_master_file_through_EOY_2022.csv')

In [57]:
pickled_model_new = pickle.load(open('subject_line_tonality_classifier_jan10.pkl', 'rb'))

In [53]:
X = data[quant_feature_cols]
y = data.top_factor_label

In [54]:
y.value_counts()

Neutral          1555
Inquisitive       496
Compassionate     402
Assertive         345
Caring            301
Grateful          185
Excited           184
Urgent            152
Name: top_factor_label, dtype: int64

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [58]:
y_pred = pickled_model_new.predict(X_test)

print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred,target_names=y.unique()))

accuracy 0.9322175732217574
               precision    recall  f1-score   support

       Caring       0.97      0.79      0.87       109
Compassionate       0.98      0.94      0.96       115
      Neutral       0.90      0.94      0.92       129
       Urgent       0.98      0.80      0.88        54
  Inquisitive       0.97      0.94      0.95        63
    Assertive       0.93      0.96      0.95       170
      Excited       0.92      0.99      0.95       501
     Grateful       1.00      0.69      0.81        54

     accuracy                           0.93      1195
    macro avg       0.95      0.88      0.91      1195
 weighted avg       0.94      0.93      0.93      1195



# appendix

In [19]:
y_pred = sgd.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=y.unique()))

accuracy 0.8822947576656776
               precision    recall  f1-score   support

      Neutral       0.96      0.77      0.85        99
Compassionate       0.89      0.80      0.84        84
      Excited       0.97      0.59      0.74       111
       Urgent       0.90      0.92      0.91        50
    Assertive       0.94      0.98      0.96        46
       Caring       0.98      0.91      0.94       146
  Inquisitive       0.82      0.99      0.90       438
     Grateful       1.00      0.65      0.79        37

     accuracy                           0.88      1011
    macro avg       0.93      0.83      0.87      1011
 weighted avg       0.90      0.88      0.88      1011



In [26]:
y_pred = clf.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=y.unique()))

accuracy 0.9574678536102869
               precision    recall  f1-score   support

      Neutral       0.99      0.90      0.94        99
Compassionate       1.00      1.00      1.00        84
      Excited       0.95      0.88      0.92       111
       Urgent       0.95      0.84      0.89        50
    Assertive       1.00      0.98      0.99        46
       Caring       0.97      1.00      0.98       146
  Inquisitive       0.94      0.99      0.96       438
     Grateful       1.00      0.78      0.88        37

     accuracy                           0.96      1011
    macro avg       0.97      0.92      0.95      1011
 weighted avg       0.96      0.96      0.96      1011



In [39]:
clf.fit(X, y)

expected = y
predicted = clf.predict(X)

print('accuracy %s' % accuracy_score(expected, predicted))
print(classification_report(expected, predicted, target_names=y.unique()))

accuracy 0.9496896439072199
               precision    recall  f1-score   support

      Neutral       0.91      0.87      0.89       293
Compassionate       1.00      0.96      0.98       257
      Excited       0.94      0.93      0.94       332
       Urgent       1.00      0.76      0.86       169
    Assertive       1.00      0.98      0.99       169
       Caring       1.00      0.96      0.98       414
  Inquisitive       0.92      1.00      0.96      1309
     Grateful       1.00      0.79      0.88       118

     accuracy                           0.95      3061
    macro avg       0.97      0.91      0.94      3061
 weighted avg       0.95      0.95      0.95      3061



## Extra credit -- evaluating different classifiers

In [37]:
# https://www.scikit-yb.org/en/latest/tutorial.html
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier

models = [
    SVC(gamma='auto'), NuSVC(gamma='auto'), LinearSVC(),
    SGDClassifier(max_iter=100, tol=1e-3), KNeighborsClassifier(),
    LogisticRegression(solver='lbfgs'), LogisticRegressionCV(cv=3),
    BaggingClassifier(), ExtraTreesClassifier(n_estimators=300),
    RandomForestClassifier(n_estimators=300)
        ]

def score_model(X, y, estimator, **kwargs):
    """
    Test various estimators.
    """
    y = LabelEncoder().fit_transform(y)
    model = Pipeline([
        ('one_hot_encoder', OneHotEncoder()),
        ('estimator', estimator)
        ])
    
    # Instantiate the classification model and visualizer
    model.fit(X, y, **kwargs)

    expected  = y
    predicted = model.predict(X)
    
    #print('accuracy %s' % accuracy_score(y_test, y_pred))

    # Compute and return F1 (harmonic mean of precision and recall)
    print("{}: {}".format(estimator.__class__.__name__, accuracy_score(expected, predicted)))#f1_score(expected, predicted)))


In [38]:
for model in models:
    score_model(X, y, model)

SVC: 0.42763802678863116


ValueError: b'specified nu is infeasible'