In [8]:
import pandas as pd
import numpy as np

In [9]:
df = pd.read_csv("data/processed_tweets.csv", index_col=0).dropna(subset=["processed_tweet", "unprocessed_tweet"])
df.isnull().sum()

unprocessed_tweet       0
product              5800
emotion                 0
processed_tweet         0
dtype: int64

In [10]:
df_branded = pd.read_csv("data/branded.csv", index_col=0).dropna(subset=["processed_tweet", "unprocessed_tweet"])
df_branded.isnull().sum()

unprocessed_tweet    0
product              0
emotion              0
processed_tweet      0
dtype: int64

In [23]:
df_branded["emotion"].value_counts()

No emotion toward brand or product    4647
Positive emotion                      2965
Negative emotion                       569
I can't tell                           150
Name: emotion, dtype: int64

In [25]:
df_branded = df_branded[~(df_branded["emotion"] == "I can't tell")]
df_branded["emotion"].value_counts()

No emotion toward brand or product    4647
Positive emotion                      2965
Negative emotion                       569
Name: emotion, dtype: int64

In [26]:
len(df_branded)

8181

In [12]:
df["emotion"].value_counts()

No emotion toward brand or product    5387
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: emotion, dtype: int64

In [13]:
df_multi_dropped = df[~(df["emotion"] == "I can't tell")].copy()
df_multi_dropped.shape

(8935, 4)

In [14]:
df_multi_dropped["emotion"].value_counts()

No emotion toward brand or product    5387
Positive emotion                      2978
Negative emotion                       570
Name: emotion, dtype: int64

In [15]:
emotion_map = {
    "No emotion toward brand or product": 2,
    "Positive emotion": 1,
    "Negative emotion": 0,
}

df_multi_dropped["emotion_encoded"] = df_multi_dropped["emotion"].map(emotion_map)
df_multi_dropped["emotion_encoded"].value_counts()

2    5387
1    2978
0     570
Name: emotion_encoded, dtype: int64

In [27]:
df_branded_encoded = df_branded.copy()
df_branded_encoded["emotion_encoded"] = df_branded["emotion"].map(emotion_map)
df_branded_encoded["emotion_encoded"].value_counts()

2    4647
1    2965
0     569
Name: emotion_encoded, dtype: int64

In [28]:
df_branded_encoded.isnull().sum()

unprocessed_tweet    0
product              0
emotion              0
processed_tweet      0
emotion_encoded      0
dtype: int64

In [16]:
from sklearn.model_selection import train_test_split

X = df_multi_dropped["processed_tweet"]
y = df_multi_dropped["emotion_encoded"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6254,), (2681,), (6254,), (2681,))

In [29]:
X_b = df_branded_encoded["processed_tweet"]
y_b = df_branded_encoded["emotion_encoded"]

X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X_b, y_b, test_size=0.3, random_state=42
)

X_train_b.shape, X_test_b.shape, y_train_b.shape, y_test_b.shape

((5726,), (2455,), (5726,), (2455,))

In [21]:
y_b.isnull().sum()

150

In [10]:
for dataset in [X_train, X_test,  y_train, y_test]:
    print(dataset.isnull().sum())

0
0
0
0


In [31]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

pipe_dt = Pipeline([("vectorizer", TfidfVectorizer(stop_words="english")), ("dt", DecisionTreeClassifier(random_state=42))])
pipe_rf = Pipeline([("vectorizer", TfidfVectorizer(stop_words="english")), ("rf", RandomForestClassifier(random_state=42))])
pipe_knn = Pipeline([("vectorizer", TfidfVectorizer(stop_words="english")), ("knn", KNeighborsClassifier())])

In [10]:
from sklearn.model_selection import cross_validate

scoring_metrics = {
    'accuracy': 'accuracy',
    'precision': 'precision_weighted',
    'f1': 'f1_weighted'
}

cv = 5

def cross_val(estimator, X, y, scoring, cv):
    cross_validation = cross_validate(estimator, X, y, cv=cv, scoring=scoring)
    mean_cv = {score_name: np.mean(score) for score_name, score in cross_validation.items()}
    mean_cv_df = pd.DataFrame.from_dict(mean_cv, orient='index', columns=["mean_cv"])
    mean_cv_df.drop(["fit_time", "score_time"], inplace=True)
    return mean_cv_df

In [11]:
print('Decision Tree')
cross_val(pipe_dt, X_train, y_train, scoring=scoring_metrics, cv=cv)

Decision Tree


Unnamed: 0,mean_cv
test_accuracy,0.603517
test_precision,0.596029
test_f1,0.598384


In [12]:
print('Random Forest')
cross_val(pipe_rf, X_train, y_train, scoring=scoring_metrics, cv=cv)

Random Forest


Unnamed: 0,mean_cv
test_accuracy,0.685372
test_precision,0.683964
test_f1,0.65643


In [13]:
print('KNN')
cross_val(pipe_knn, X_train, y_train, scoring=scoring_metrics, cv=5)

KNN


Unnamed: 0,mean_cv
test_accuracy,0.634053
test_precision,0.615238
test_f1,0.610805


In [14]:
params_dt = {
    'dt__criterion': ['gini', 'entropy'],
    'dt__max_depth': [10, 20, 30],
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'vectorizer__min_df': [0.05, 0.1, 0.15, 0.2],
    'vectorizer__max_df': [0.9, 0.95, 1.0]
}

params_rf = {
    'rf__n_estimators': [50, 100, 200],
    'rf__min_samples_split': [2, 3, 4],
    'rf__max_depth': [10, 20, 30],
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'vectorizer__min_df': [0.05, 0.1, 0.15, 0.2],
    'vectorizer__max_df': [0.9, 0.95, 1.0]
}

params_knn = {
    'knn__n_neighbors': [3, 5, 7],
    'knn__weights': ['uniform', 'distance'],
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'vectorizer__min_df': [0.05, 0.1, 0.15, 0.2],
    'vectorizer__max_df': [0.9, 0.95, 1.0]
}

In [16]:
from sklearn.model_selection import GridSearchCV

# grid_dt = GridSearchCV(pipe_dt, param_grid=params_dt, cv=cv, verbose=1, n_jobs=6, scoring="accuracy")
# grid_dt.fit(X_train, y_train)

In [17]:
grid_rf = GridSearchCV(pipe_rf, param_grid=params_rf, cv=cv, verbose=1, n_jobs=6, scoring="accuracy")
grid_rf.fit(X_train, y_train)

Fitting 5 folds for each of 648 candidates, totalling 3240 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    5.6s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:   22.3s
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:  1.1min
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:  2.0min
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed:  3.3min
[Parallel(n_jobs=6)]: Done 1788 tasks      | elapsed:  5.8min
[Parallel(n_jobs=6)]: Done 2438 tasks      | elapsed:  8.4min
[Parallel(n_jobs=6)]: Done 3188 tasks      | elapsed: 11.9min
[Parallel(n_jobs=6)]: Done 3240 out of 3240 | elapsed: 12.3min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vectorizer',
                                        TfidfVectorizer(stop_words='english')),
                                       ('rf',
                                        RandomForestClassifier(random_state=42))]),
             n_jobs=6,
             param_grid={'rf__max_depth': [10, 20, 30],
                         'rf__min_samples_split': [2, 3, 4],
                         'rf__n_estimators': [50, 100, 200],
                         'vectorizer__max_df': [0.9, 0.95, 1.0],
                         'vectorizer__min_df': [0.05, 0.1, 0.15, 0.2],
                         'vectorizer__ngram_range': [(1, 1), (1, 2)]},
             scoring='accuracy', verbose=1)

In [None]:
# grid_knn = GridSearchCV(pipe_knn, param_grid=params_knn, cv=cv, verbose=1, n_jobs=6, scoring="accuracy")
# grid_knn.fit(X_train, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    2.5s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:   11.9s
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:   28.2s
[Parallel(n_jobs=6)]: Done 720 out of 720 | elapsed:   47.1s finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vectorizer',
                                        TfidfVectorizer(stop_words='english')),
                                       ('knn', KNeighborsClassifier())]),
             n_jobs=6,
             param_grid={'knn__n_neighbors': [3, 5, 7],
                         'knn__weights': ['uniform', 'distance'],
                         'vectorizer__max_df': [0.9, 0.95, 1.0],
                         'vectorizer__min_df': [0.05, 0.1, 0.15, 0.2],
                         'vectorizer__ngram_range': [(1, 1), (1, 2)]},
             scoring='accuracy', verbose=1)

In [None]:
# import pickle

# best_models = {
#     "rf": {"model": grid_rf.best_estimator_, "params": grid_rf.best_params_},
#     "dt": {"model": grid_dt.best_estimator_, "params": grid_dt.best_params_},
#     "knn": {"model": grid_knn.best_estimator_, "params": grid_knn.best_params_}
# }

# with open("models/best_models.pkl", "wb") as f:
#     pickle.dump(best_models, f)

In [None]:
import pickle

with open("models/best_models.pkl", "rb") as f:
    best_models = pickle.load(f)

In [None]:
pipe_rf_best, pipe_rf_best_params = best_models["rf"]["model"], best_models["rf"]["params"]
pipe_dt_best, pipe_dt_best_params = best_models["dt"]["model"], best_models["dt"]["params"]
pipe_knn_best, pipe_knn_best_params = best_models["knn"]["model"], best_models["knn"]["params"]

In [None]:
pipe_rf_best_params

{'rf__max_depth': 10,
 'rf__n_estimators': 200,
 'vectorizer__max_df': 0.9,
 'vectorizer__min_df': 0.05,
 'vectorizer__ngram_range': (1, 1)}

In [None]:
pipe_dt_best_params

{'dt__criterion': 'gini',
 'dt__max_depth': 10,
 'vectorizer__max_df': 1.0,
 'vectorizer__min_df': 0.05,
 'vectorizer__ngram_range': (1, 2)}

In [None]:
pipe_knn_best_params

{'knn__n_neighbors': 7,
 'knn__weights': 'uniform',
 'vectorizer__max_df': 1.0,
 'vectorizer__min_df': 0.05,
 'vectorizer__ngram_range': (1, 2)}

In [18]:
from sklearn.metrics import classification_report

rf_preds = grid_rf.best_estimator_.predict(X_test)
print(classification_report(y_test, rf_preds))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       189
           1       0.48      0.19      0.28       880
           2       0.63      0.91      0.74      1612

    accuracy                           0.61      2681
   macro avg       0.37      0.37      0.34      2681
weighted avg       0.54      0.61      0.54      2681



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
dt_preds = pipe_dt_best.predict(X_test)
print(classification_report(y_test, dt_preds))

              precision    recall  f1-score   support

           0       0.04      0.01      0.01       189
           1       0.45      0.28      0.35       880
           2       0.63      0.82      0.71      1612

    accuracy                           0.59      2681
   macro avg       0.37      0.37      0.36      2681
weighted avg       0.53      0.59      0.54      2681



In [None]:
knn_preds = pipe_knn_best.predict(X_test)
print(classification_report(y_test, knn_preds))

              precision    recall  f1-score   support

           0       0.25      0.11      0.15       189
           1       0.42      0.31      0.35       880
           2       0.64      0.77      0.70      1612

    accuracy                           0.57      2681
   macro avg       0.44      0.40      0.40      2681
weighted avg       0.54      0.57      0.55      2681



In [51]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse=False)
y_train_encoded = encoder.fit_transform(y_train.values.reshape(-1, 1))
y_train_b_encoded = encoder.fit_transform(y_train_b.values.reshape(-1, 1))

vectorizer = TfidfVectorizer(stop_words="english")
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)
X_train_b_vec = vectorizer.fit_transform(X_train_b)
X_test_b_vec = vectorizer.transform(X_test_b)

# Convert to dense matrix
X_train_vec_dense = X_train_vec.todense()
X_test_vec_dense = X_test_vec.todense()
X_train_b_vec_dense = X_train_b_vec.todense()
X_test_b_vec_dense = X_test_b_vec.todense()

# Model
model = Sequential()
model.add(Dense(10, input_dim=X_train_b_vec_dense.shape[1], activation='relu'))
model.add(Dense(3, activation='softmax'))  # Assuming 3 classes

# Compilation
model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

# Training
model.fit(X_train_b_vec_dense, y_train_b_encoded, epochs=3, batch_size=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x1da83add610>

In [52]:
from sklearn.metrics import classification_report

predicted_classes = np.argmax(model.predict(X_test_b_vec_dense), axis=1)
print(classification_report(y_test_b, predicted_classes))

              precision    recall  f1-score   support

           0       0.53      0.17      0.26       176
           1       0.61      0.56      0.58       919
           2       0.68      0.78      0.73      1360

    accuracy                           0.65      2455
   macro avg       0.61      0.50      0.52      2455
weighted avg       0.64      0.65      0.64      2455



In [32]:
df['unprocessed_tweet']

Unnamed: 0,unprocessed_tweet,product,emotion,processed_tweet
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,wesley iphone hr tweet riseaustin dead need up...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,jessedee know fludapp awesome ipadiphone app l...
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,swonderlin wait ipad also sale sxsw
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,sxsw hope year festival crashy year iphone app...
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,sxtxstate great stuff fri sxsw marissa mayer g...
...,...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion,ipad everywhere sxsw link
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product,wave buzz rt mention interrupt regularly sched...
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product,google zeiger physician never report potential...
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product,verizon iphone customer complain time fell bac...


In [60]:
from tensorflow.keras.layers import LSTM

X_train_b_vec_reshaped = np.reshape(X_train_b_vec_dense, (X_train_b_vec_dense.shape[0], X_train_b_vec_dense.shape[1], 1))

lstm = Sequential()
lstm.add(LSTM(100, input_shape=(X_train_b_vec_reshaped.shape[1], 1)))
lstm.add(Dense(3, activation="softmax"))
lstm.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

lstm.fit(X_train_b_vec_reshaped, y_train_b_encoded, epochs=3, batch_size=1)

Epoch 1/3


ValueError: in user code:

    c:\Users\aungs_c08ycsn\anaconda3\envs\learn-env\lib\site-packages\tensorflow\python\keras\engine\training.py:806 train_function  *
        return step_function(self, iterator)
    c:\Users\aungs_c08ycsn\anaconda3\envs\learn-env\lib\site-packages\tensorflow\python\keras\engine\training.py:796 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    c:\Users\aungs_c08ycsn\anaconda3\envs\learn-env\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    c:\Users\aungs_c08ycsn\anaconda3\envs\learn-env\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    c:\Users\aungs_c08ycsn\anaconda3\envs\learn-env\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    c:\Users\aungs_c08ycsn\anaconda3\envs\learn-env\lib\site-packages\tensorflow\python\keras\engine\training.py:789 run_step  **
        outputs = model.train_step(data)
    c:\Users\aungs_c08ycsn\anaconda3\envs\learn-env\lib\site-packages\tensorflow\python\keras\engine\training.py:747 train_step
        y_pred = self(x, training=True)
    c:\Users\aungs_c08ycsn\anaconda3\envs\learn-env\lib\site-packages\tensorflow\python\keras\engine\base_layer.py:975 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs,
    c:\Users\aungs_c08ycsn\anaconda3\envs\learn-env\lib\site-packages\tensorflow\python\keras\engine\input_spec.py:176 assert_input_compatibility
        raise ValueError('Input ' + str(input_index) + ' of layer ' +

    ValueError: Input 0 of layer sequential_14 is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: [1, 6829]
