In [23]:
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

In [22]:
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\byrdw\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\byrdw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
df = pd.read_csv('tweets.csv')
df

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
...,...,...,...
8716,Ipad everywhere. #SXSW {link},iPad,Positive emotion
8717,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product
8718,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product
8719,Some Verizon iPhone customers complained their...,,No emotion toward brand or product


In [25]:
print(df.head())

                                          tweet_text  \
0  .@wesley83 I have a 3G iPhone. After 3 hrs twe...   
1  @jessedee Know about @fludapp ? Awesome iPad/i...   
2  @swonderlin Can not wait for #iPad 2 also. The...   
3  @sxsw I hope this year's festival isn't as cra...   
4  @sxtxstate great stuff on Fri #SXSW: Marissa M...   

  emotion_in_tweet_is_directed_at  \
0                          iPhone   
1              iPad or iPhone App   
2                            iPad   
3              iPad or iPhone App   
4                          Google   

  is_there_an_emotion_directed_at_a_brand_or_product  
0                                   Negative emotion  
1                                   Positive emotion  
2                                   Positive emotion  
3                                   Negative emotion  
4                                   Positive emotion  


In [26]:
tweet = 'Foursquare ups the game, #ipad  just in time for #SXSW http://j.mp/grN7pK) - Still prefer @Gowalla by far, best looking Android app to date.'

In [27]:
tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)

In [28]:
tweet

'Foursquare ups the game, #ipad  just in time for #SXSW  - Still prefer @Gowalla by far, best looking Android app to date.'

In [32]:
def clean_tweet(tweet):
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)
    tweet = re.sub(r'\@\w+|\#', '', tweet)
    tweet = tweet.lower()
    tweet = re.sub(r'\W', ' ', tweet)
    tweet = re.sub(r'\s+', ' ', tweet)
    tweet = tweet.strip()
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    words = tweet.split()
    cleaned_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    stemmed_cleaned_words = [stemmer.stem(word) for word in cleaned_words if word not in stop_words]
    return ' '.join(stemmed_cleaned_words)

In [33]:
tweet

'Foursquare ups the game, #ipad  just in time for #SXSW  - Still prefer @Gowalla by far, best looking Android app to date.'

In [34]:
clean_tweet(tweet)

'foursquar up game ipad time sxsw still prefer far best look android app date'

In [35]:
df['tweet_text'] = df['tweet_text'].astype(str)

In [36]:
df['cleaned_tweet'] = df['tweet_text'].apply(clean_tweet)

Tweets are cleaned and we can move on to the next step.  Now let's train the model.  
Encoding values to make a new column, sentiment.  
0=Undefined
1=Negative
2=Indifferent
3=Positve


In [37]:
label_encoder = LabelEncoder()
df['sentiment'] = label_encoder.fit_transform(df['is_there_an_emotion_directed_at_a_brand_or_product'])

In [39]:
df

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,cleaned_tweet,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,3g iphon 3 hr tweet rise_austin dead need upgr...,1
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,know awesom ipad iphon app like appreci design...,3
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,wait ipad 2 also sale sxsw,3
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,hope year festiv crashi year iphon app sxsw,1
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,great stuff fri sxsw marissa mayer googl tim r...,3
...,...,...,...,...,...
8716,Ipad everywhere. #SXSW {link},iPad,Positive emotion,ipad everywher sxsw link,3
8717,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product,wave buzz rt interrupt regularli schedul sxsw ...,2
8718,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product,googl zeiger physician never report potenti ae...,2
8719,Some Verizon iPhone customers complained their...,,No emotion toward brand or product,verizon iphon custom complain time fell back h...,2


In [40]:
X = df['cleaned_tweet']
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [41]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

Let's build a multiclass classification Logitic Regression

In [42]:
model = LogisticRegression(multi_class='ovr')
model.fit(X_train_tfidf, y_train)

LogisticRegression(multi_class='ovr')

In [43]:
y_pred = model.predict(X_test_tfidf)
print("Multiclass CLassifier Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

Multiclass CLassifier Accuracy: 0.6790830945558739
                                    precision    recall  f1-score   support

                      I can't tell       0.00      0.00      0.00        40
                  Negative emotion       0.50      0.03      0.06        94
No emotion toward brand or product       0.70      0.89      0.78      1076
                  Positive emotion       0.60      0.42      0.50       535

                          accuracy                           0.68      1745
                         macro avg       0.45      0.34      0.33      1745
                      weighted avg       0.64      0.68      0.64      1745



  _warn_prf(average, modifier, msg_start, len(result))


Let's check if there is a class imbalance problem.

In [21]:
df['sentiment'].value_counts()

2    5156
3    2869
1     545
0     151
Name: sentiment, dtype: int64

In [23]:
#vectorizer_count = CountVectorizer()
#train_data_X = vectorizer.fit_transform(df)
#vocab = vectorizer.get_feature_names()
#print(train_data_X.toarray())

[[0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0.]]


In [24]:
#print(vocab)

['cleaned_tweet', 'emotion_in_tweet_is_directed_at', 'is_there_an_emotion_directed_at_a_brand_or_product', 'sentiment', 'tweet_text']


In [25]:
model = LogisticRegression(multi_class='ovr', class_weight='balanced')
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)
print("Multiclass CLassifier Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

Multiclass CLassifier Accuracy: 0.6131805157593123
                                    precision    recall  f1-score   support

                      I can't tell       0.00      0.00      0.00        40
                  Negative emotion       0.27      0.55      0.37        94
No emotion toward brand or product       0.78      0.65      0.71      1076
                  Positive emotion       0.51      0.59      0.55       535

                          accuracy                           0.61      1745
                         macro avg       0.39      0.45      0.41      1745
                      weighted avg       0.65      0.61      0.63      1745



In [48]:
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)

In [49]:
model = LogisticRegression(multi_class='ovr', class_weight='balanced')
model.fit(X_train_resampled, y_train_resampled)
y_pred = model.predict(X_test_tfidf)
print("Multiclass CLassifier Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

Multiclass CLassifier Accuracy: 0.6057306590257879
                                    precision    recall  f1-score   support

                      I can't tell       0.02      0.03      0.02        40
                  Negative emotion       0.27      0.59      0.37        94
No emotion toward brand or product       0.77      0.65      0.70      1076
                  Positive emotion       0.52      0.57      0.54       535

                          accuracy                           0.61      1745
                         macro avg       0.40      0.46      0.41      1745
                      weighted avg       0.65      0.61      0.62      1745



We can see oversampling the minority class negatively impacts the majority class and overall accuracy of our model.

In [52]:
undersample = RandomUnderSampler()
X_train_undersample, y_train_undersample = undersample.fit_resample(X_train_tfidf, y_train)


Undersampling severly impacted our model from an accuracy perspective, but did improve recall on our minority class.

In [53]:
model = LogisticRegression(multi_class='ovr', class_weight='balanced')
model.fit(X_train_undersample, y_train_undersample)
y_pred = model.predict(X_test_tfidf)
print("Multiclass CLassifier Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

Multiclass CLassifier Accuracy: 0.39255014326647564
                                    precision    recall  f1-score   support

                      I can't tell       0.04      0.28      0.06        40
                  Negative emotion       0.14      0.50      0.22        94
No emotion toward brand or product       0.72      0.39      0.51      1076
                  Positive emotion       0.40      0.38      0.39       535

                          accuracy                           0.39      1745
                         macro avg       0.32      0.39      0.30      1745
                      weighted avg       0.57      0.39      0.45      1745

