In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
import preprocessor as p

In [2]:
# Load the labelled tweets from the "data" sheet of the revision workbook.
# The path is relative to the notebook's working directory.
fp = "../data/REVISION DATASET_b.xlsx"
df = pd.read_excel(fp,sheet_name="data")

In [3]:
# Preview the first rows of the raw sheet to see which columns are available.
df.head()

Unnamed: 0,fullname,is_retweet,likes,replies,retweets,text,expresses a pain point,timestamp,timestamp_epochs,tweet_id,tweet_url,user_id,username,BRAND,Type of Pain,Subjectivity,Second category,Third category,Fourth category,Dataset
0,Emmanuel Olabode,0,1,1,1,@ifemeetstech your pr team can help bridge the...,y,42176,1434841837,612397168788406016,/olabodeEO/status/612397168788406272,1955234486,olabodeEO,gap,Operational issues,,,,,Original
1,Alviniecððâ¨,0,0,0,0,mcdonalds really bein missing uhp people food ...,y,2014-09-01 23:25:17,1409613917,506583600599662592,/ohhamazing/status/506583600599662592,2723652417,ohhamazing,mcdonalds,Product feature or quality,,,,,Original
2,Bobby,0,0,0,0,"if they thought that little of him, why was he...",y,43720,1568242832,1171921495309910016,/Bobbythegreat/status/1171921495309914112,33740752,Bobbythegreat,gap,Product feature or quality,,,,,Original
3,Elgen Bodenstien,0,0,0,0,when towns have locally owned business capita...,y,2019-09-05 23:34:22,1567726462,1169755684122086912,/bodenstien/status/1169755684122087424,1167585352464424960,bodenstien,walmart,Company's image,,,,,Original
4,Robyn,0,3,1,1,@arma_vancouver health information in records ...,y,2019-01-23 22:43:08,1548283388,1088205518886190976,/RobynCBird/status/1088205518886191104,2206030555,RobynCBird,fitbit,Company's image,,,,,Original


In [4]:
# Keep only the tweet text and its pain-point annotation. The explicit .copy()
# makes this an independent frame, so the column assignments in later cells
# (df["target"], df["clean_text"]) do not write into a slice of the original
# frame and do not raise SettingWithCopyWarning.
df = df[["text","expresses a pain point"]].copy()

In [5]:
# Mapping from the raw "y"/"n" annotation to the binary target value.
encoded_label_dict = {"n" : 0, "y" : 1}
def encode_label(x):
    """Return 0 for "n", 1 for "y", and -1 for any other value."""
    if x in encoded_label_dict:
        return encoded_label_dict[x]
    return -1

In [6]:
# Encode the "y"/"n" annotation column as a numeric target column
# (values that are neither "y" nor "n" become -1, see encode_label).
df["target"] = df["expresses a pain point"].apply(encode_label)

In [7]:
#p.clean('Preprocessor is #awesome 👍 https://github.com/s/preprocessor')

In [8]:
def preprocess(txt):
    """Clean a single tweet with the ``preprocessor`` package.

    Thin wrapper around ``p.clean``. What gets removed is decided by that
    library's options; no options are set in the cells shown here.

    Args:
        txt: Raw tweet text.

    Returns:
        Whatever ``p.clean`` returns for ``txt`` (the cleaned text).
    """
    cleaned = p.clean(txt)
    return cleaned

In [9]:
# Store a cleaned version of every tweet next to the raw text.
df["clean_text"] = df["text"].apply(preprocess)

In [10]:
# Check that the new target and clean_text columns look as expected.
df.head()

Unnamed: 0,text,expresses a pain point,target,clean_text
0,@ifemeetstech your pr team can help bridge the...,y,1,your pr team can help bridge the communication...
1,mcdonalds really bein missing uhp people food ...,y,1,mcdonalds really bein missing uhp people food ...
2,"if they thought that little of him, why was he...",y,1,"if they thought that little of him, why was he..."
3,when towns have locally owned business capita...,y,1,when towns have locally owned business capital...
4,@arma_vancouver health information in records ...,y,1,health information in records means vulnerabil...


In [11]:
# Class balance of the encoded target (0 = "n", 1 = "y"). A -1 row here would
# mean the annotation column contains something other than "y"/"n".
df.target.value_counts()

0    3348
1    1252
Name: target, dtype: int64

In [12]:
pain_keywords_fp = "../data/pain point keywords.txt"
# One keyword or phrase per line. The context manager closes the file handle
# as soon as the read finishes instead of leaving it open.
with open(pain_keywords_fp) as kw_file:
    pain_keywords = kw_file.read().strip().split("\n")
pain_keywords

['need',
 'want',
 'wish',
 'feature',
 'ask',
 'would like',
 'improve',
 'idea',
 'upgrade',
 'support',
 'problem',
 'issue',
 'help',
 'fix',
 'complain',
 'fail',
 'sucks',
 'hope',
 'not good',
 'did not',
 'missing',
 'should',
 'i hate',
 'bad',
 'bothers',
 'bothering',
 'frustrates',
 'frustrating',
 'failure']

In [13]:
def is_keyword_in_text(txt, pain_kwds):
    """Return True if any keyword occurs in ``txt`` as a plain substring.

    Matching is case-sensitive and ignores word boundaries, so the keyword
    "bad" also matches inside "badge".

    Args:
        txt: Text to search (a str).
        pain_kwds: Iterable of keyword/phrase strings.

    Returns:
        bool: True if at least one keyword is contained in ``txt``; False
        otherwise, including when ``pain_kwds`` is empty.
    """
    # A generator lets any() stop at the first hit instead of building a full list.
    return any(kw in txt for kw in pain_kwds)

In [14]:
def is_spaced_keyword_in_text(txt, pain_kwds):
    """Return True if any keyword occurs in ``txt`` delimited by spaces.

    A keyword matches only when it has a space (or the start/end of the text)
    on both sides, so "bad" does not match "badge". A keyword directly
    followed by punctuation ("need,") does not match either.

    Args:
        txt: Text to search.
        pain_kwds: Iterable of keyword/phrase strings.

    Returns:
        bool: True if at least one space-delimited keyword is found; False
        otherwise, including when ``pain_kwds`` is empty.
    """
    # Pad once so keywords at the very start or end of the text still match;
    # the padded text does not depend on the keyword, so build it outside the loop.
    padded = f" {txt} "
    return any(f" {kw} " in padded for kw in pain_kwds)

In [38]:
# Smoke-test VADER on one complaint tweet and print its polarity scores.
# A negative "compound" value is what is_sentiment_negative() treats as negative.
analyzer = SentimentIntensityAnalyzer()
txt = """hey @macys stop trying to lure young or uneducated folks into opening a credit card to save $4 on a $20 purchase.  what is frustrating is the sales rep uses an attitude as if we are stupid when we decline.  no we are smart and rather open one if we spent $2k for example. pathetic"""
ps = analyzer.polarity_scores(txt)
print(ps)

{'neg': 0.206, 'neu': 0.672, 'pos': 0.122, 'compound': -0.7096}


In [39]:
# Shared VADER analyzer. predict() and predict_with_space() read this
# module-level name directly rather than receiving it as an argument.
analyzer = SentimentIntensityAnalyzer()
def is_sentiment_negative(txt, analyzer):
    """Tell whether ``analyzer`` scores ``txt`` as negative overall.

    Args:
        txt: Text to score.
        analyzer: Object exposing ``polarity_scores(text)`` that returns a
            mapping with a "compound" entry (here a SentimentIntensityAnalyzer).

    Returns:
        bool: True when the compound score is strictly below zero; a score of
        exactly 0 counts as not negative.
    """
    compound = analyzer.polarity_scores(txt)["compound"]
    return compound < 0

In [40]:
# Sanity check on negation: "Not bad at all" should not be flagged as negative.
v = "Not bad at all"
is_sentiment_negative(v,analyzer)

False

In [17]:
#is_keyword_in_text("some water",pain_keywords)
#is_spaced_keyword_in_text("need some water",pain_keywords)

#### Train/Valid/Test Split

In [18]:
# 80/10/10 split: hold out 20% of the rows, then cut that hold-out in half to
# obtain the validation and test sets. Both calls share the same options and seed.
split_options = dict(shuffle=True, stratify=None, random_state=2021)
train, valid_test = train_test_split(df, test_size=0.2, **split_options)
valid, test = train_test_split(valid_test, test_size=0.5, **split_options)

#### Adding Inference functions

In [49]:
def predict(data,text_col,pain_kwds):
    """Rule-based pain-point prediction using substring keyword matching.

    A row is predicted 1 only when its text contains at least one keyword
    (``is_keyword_in_text``) AND its sentiment is negative
    (``is_sentiment_negative``). Sentiment is scored with the module-level
    ``analyzer``.

    Args:
        data: DataFrame holding the texts.
        text_col: Name of the column in ``data`` to classify.
        pain_kwds: Keywords/phrases passed to ``is_keyword_in_text``.

    Returns:
        Array of 0/1 predictions, one per row of ``data``.
    """
    texts = data[text_col]
    keyword_hits = texts.apply(lambda tweet: int(is_keyword_in_text(tweet, pain_kwds)))
    negative_hits = texts.apply(lambda tweet: int(is_sentiment_negative(tweet, analyzer)))
    # Both arrays hold 0/1 integers, so bitwise AND acts as a per-row logical AND.
    return keyword_hits.values & negative_hits.values

In [50]:
def predict_with_space(data,text_col,pain_kwds):
    """Rule-based pain-point prediction using space-delimited keyword matching.

    A row is predicted 1 only when its text contains at least one keyword
    surrounded by spaces (``is_spaced_keyword_in_text``) AND its sentiment is
    negative (``is_sentiment_negative``). Sentiment is scored with the
    module-level ``analyzer``.

    Args:
        data: DataFrame holding the texts.
        text_col: Name of the column in ``data`` to classify.
        pain_kwds: Keywords/phrases passed to ``is_spaced_keyword_in_text``.

    Returns:
        Array of 0/1 predictions, one per row of ``data``.
    """
    texts = data[text_col]
    keyword_hits = texts.apply(lambda tweet: int(is_spaced_keyword_in_text(tweet, pain_kwds)))
    negative_hits = texts.apply(lambda tweet: int(is_sentiment_negative(tweet, analyzer)))
    # Both arrays hold 0/1 integers, so bitwise AND acts as a per-row logical AND.
    return keyword_hits.values & negative_hits.values

#### Exact Match AND Sentiment without preprocessing

In [51]:
# Substring keyword match AND negative sentiment on the raw text: training split.
# (Only test_preds is scored in this section.)
train_preds = predict(train,"text",pain_keywords)

In [52]:
# Same rule on the raw text: validation split.
valid_preds = predict(valid,"text",pain_keywords)

In [53]:
# Same rule on the raw text: test split. These are the predictions scored below.
test_preds = predict(test,"text",pain_keywords)

In [54]:
# Confusion matrix on the test split (rows = true label, columns = predicted
# label, in label order 0, 1). confusion_matrix is imported in the first cell.
y_true = test.target.values
y_pred = test_preds
confusion_matrix(y_true,y_pred)

array([[228, 104],
       [ 75,  53]])

In [55]:
# Headline metrics for y_true / y_pred. With scikit-learn's defaults, precision
# and recall are reported for the positive class (label 1, i.e. "y").
# The metric functions are imported in the first cell.
acc = accuracy_score(y_true,y_pred)
precision = precision_score(y_true,y_pred)
recall = recall_score(y_true,y_pred)

In [56]:
# Show the three metrics as (unrounded) percentages.
print(f"Accuracy: {acc*100}; Precision:{precision*100}; Recall:{recall*100}")

Accuracy: 61.08695652173913; Precision:33.75796178343949; Recall:41.40625


In [57]:
# Per-class precision/recall/F1; "N" labels target 0 and "Y" labels target 1.
print(classification_report(y_true, y_pred, target_names=["N","Y"]))

              precision    recall  f1-score   support

           N       0.75      0.69      0.72       332
           Y       0.34      0.41      0.37       128

    accuracy                           0.61       460
   macro avg       0.55      0.55      0.55       460
weighted avg       0.64      0.61      0.62       460



#### Exact Match AND Sentiment after preprocessing

In [58]:
# Substring keyword match AND negative sentiment on the cleaned text: training split.
# (Only test_preds is scored in this section.)
train_preds = predict(train,"clean_text",pain_keywords)

In [59]:
# Same rule on the cleaned text: validation split.
valid_preds = predict(valid,"clean_text",pain_keywords)

In [60]:
# Same rule on the cleaned text: test split. These are the predictions scored below.
test_preds = predict(test,"clean_text",pain_keywords)

In [61]:
# Confusion matrix on the test split (rows = true label, columns = predicted
# label, in label order 0, 1). confusion_matrix is imported in the first cell.
y_true = test.target.values
y_pred = test_preds
confusion_matrix(y_true,y_pred)

array([[232, 100],
       [ 76,  52]])

In [62]:
# Headline metrics for y_true / y_pred. With scikit-learn's defaults, precision
# and recall are reported for the positive class (label 1, i.e. "y").
# The metric functions are imported in the first cell.
acc = accuracy_score(y_true,y_pred)
precision = precision_score(y_true,y_pred)
recall = recall_score(y_true,y_pred)

In [63]:
# Show the three metrics as (unrounded) percentages.
print(f"Accuracy: {acc*100}; Precision:{precision*100}; Recall:{recall*100}")

Accuracy: 61.73913043478261; Precision:34.21052631578947; Recall:40.625


In [64]:
# Per-class precision/recall/F1; "N" labels target 0 and "Y" labels target 1.
print(classification_report(y_true, y_pred, target_names=["N","Y"]))

              precision    recall  f1-score   support

           N       0.75      0.70      0.73       332
           Y       0.34      0.41      0.37       128

    accuracy                           0.62       460
   macro avg       0.55      0.55      0.55       460
weighted avg       0.64      0.62      0.63       460



#### Spaced Match AND Sentiment without preprocessing

In [65]:
# Space-delimited keyword match AND negative sentiment on the raw text: training split.
# (Only test_preds is scored in this section.)
train_preds = predict_with_space(train,"text",pain_keywords)

In [66]:
# Same rule on the raw text: validation split.
valid_preds = predict_with_space(valid,"text",pain_keywords)

In [67]:
# Same rule on the raw text: test split. These are the predictions scored below.
test_preds = predict_with_space(test,"text",pain_keywords)

In [68]:
# Confusion matrix on the test split (rows = true label, columns = predicted
# label, in label order 0, 1). confusion_matrix is imported in the first cell.
y_true = test.target.values
y_pred = test_preds
confusion_matrix(y_true,y_pred)

array([[249,  83],
       [ 82,  46]])

In [69]:
# Headline metrics for y_true / y_pred. With scikit-learn's defaults, precision
# and recall are reported for the positive class (label 1, i.e. "y").
# The metric functions are imported in the first cell.
acc = accuracy_score(y_true,y_pred)
precision = precision_score(y_true,y_pred)
recall = recall_score(y_true,y_pred)

In [70]:
# Show the three metrics as (unrounded) percentages.
print(f"Accuracy: {acc*100}; Precision:{precision*100}; Recall:{recall*100}")

Accuracy: 64.13043478260869; Precision:35.65891472868217; Recall:35.9375


In [71]:
# Per-class precision/recall/F1; "N" labels target 0 and "Y" labels target 1.
print(classification_report(y_true, y_pred, target_names=["N","Y"]))

              precision    recall  f1-score   support

           N       0.75      0.75      0.75       332
           Y       0.36      0.36      0.36       128

    accuracy                           0.64       460
   macro avg       0.55      0.55      0.55       460
weighted avg       0.64      0.64      0.64       460



#### Spaced Match AND Sentiment after preprocessing

In [72]:
# Space-delimited keyword match AND negative sentiment on the cleaned text: training split.
train_preds = predict_with_space(train,"clean_text",pain_keywords)

In [73]:
test_preds = predict_with_space(valid,"clean_text",pain_keywords)

In [74]:
preds = predict_with_space(test,"clean_text",pain_keywords)

In [75]:
# Confusion matrix (rows = true label, columns = predicted label, in label
# order 0, 1). y_pred has to hold predictions for the same rows as y_true,
# i.e. the test split. confusion_matrix is imported in the first cell.
y_true = test.target.values
y_pred = test_preds
confusion_matrix(y_true,y_pred)

array([[237,  95],
       [ 88,  40]])

In [76]:
# Headline metrics for y_true / y_pred. With scikit-learn's defaults, precision
# and recall are reported for the positive class (label 1, i.e. "y").
# The metric functions are imported in the first cell.
acc = accuracy_score(y_true,y_pred)
precision = precision_score(y_true,y_pred)
recall = recall_score(y_true,y_pred)

In [77]:
# Show the three metrics as (unrounded) percentages.
print(f"Accuracy: {acc*100}; Precision:{precision*100}; Recall:{recall*100}")

Accuracy: 60.21739130434782; Precision:29.629629629629626; Recall:31.25


In [78]:
# Per-class precision/recall/F1; "N" labels target 0 and "Y" labels target 1.
print(classification_report(y_true, y_pred, target_names=["N","Y"]))

              precision    recall  f1-score   support

           N       0.73      0.71      0.72       332
           Y       0.30      0.31      0.30       128

    accuracy                           0.60       460
   macro avg       0.51      0.51      0.51       460
weighted avg       0.61      0.60      0.61       460

