In [136]:
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

In [137]:
df = pd.read_csv('tweets.csv')
df

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
...,...,...,...
8716,Ipad everywhere. #SXSW {link},iPad,Positive emotion
8717,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product
8718,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product
8719,Some Verizon iPhone customers complained their...,,No emotion toward brand or product


In [138]:
print(df.head())

                                          tweet_text  \
0  .@wesley83 I have a 3G iPhone. After 3 hrs twe...   
1  @jessedee Know about @fludapp ? Awesome iPad/i...   
2  @swonderlin Can not wait for #iPad 2 also. The...   
3  @sxsw I hope this year's festival isn't as cra...   
4  @sxtxstate great stuff on Fri #SXSW: Marissa M...   

  emotion_in_tweet_is_directed_at  \
0                          iPhone   
1              iPad or iPhone App   
2                            iPad   
3              iPad or iPhone App   
4                          Google   

  is_there_an_emotion_directed_at_a_brand_or_product  
0                                   Negative emotion  
1                                   Positive emotion  
2                                   Positive emotion  
3                                   Negative emotion  
4                                   Positive emotion  


In [139]:
tweet = 'Foursquare ups the game, #ipad  just in time for #SXSW http://j.mp/grN7pK) - Still prefer @Gowalla by far, best looking Android app to date.'

In [140]:
tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)

In [141]:
tweet

'Foursquare ups the game, #ipad  just in time for #SXSW  - Still prefer @Gowalla by far, best looking Android app to date.'

In [142]:
def clean_tweet(tweet):
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE) # removing any url's
    tweet = re.sub(r'\@\w+|\#', '', tweet) # removing hastags and usernames
    tweet = tweet.lower() # making everything lowercase
    tweet = re.sub(r'\W', ' ', tweet)
    tweet = re.sub(r'\s+', ' ', tweet)
    
    return tweet
print(clean_tweet(tweet))

foursquare ups the game ipad just in time for sxsw still prefer by far best looking android app to date 


In [143]:
tweet

'Foursquare ups the game, #ipad  just in time for #SXSW  - Still prefer @Gowalla by far, best looking Android app to date.'

In [144]:
clean_tweet(tweet)

'foursquare ups the game ipad just in time for sxsw still prefer by far best looking android app to date '

In [145]:
df['tweet_text'] = df['tweet_text'].astype(str)

In [146]:
df['cleaned_tweet'] = df['tweet_text'].apply(clean_tweet)

Tweets are cleaned and we can move on to the next step.  Now let's train the model.

In [147]:
label_encoder = LabelEncoder()
df['sentiment'] = label_encoder.fit_transform(df['is_there_an_emotion_directed_at_a_brand_or_product'])

In [148]:
df

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,cleaned_tweet,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,i have a 3g iphone after 3 hrs tweeting at ri...,1
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,know about awesome ipad iphone app that you l...,3
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,can not wait for ipad 2 also they should sale...,3
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,i hope this year s festival isn t as crashy a...,1
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,great stuff on fri sxsw marissa mayer google ...,3
...,...,...,...,...,...
8716,Ipad everywhere. #SXSW {link},iPad,Positive emotion,ipad everywhere sxsw link,3
8717,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product,wave buzz rt we interrupt your regularly sched...,2
8718,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product,google s zeiger a physician never reported pot...,2
8719,Some Verizon iPhone customers complained their...,,No emotion toward brand or product,some verizon iphone customers complained their...,2


In [149]:
X = df['cleaned_tweet']
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [150]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

Let's build a multiclass classification Logitic Regression

In [151]:
model = LogisticRegression(multi_class='ovr')
model.fit(X_train_tfidf, y_train)

LogisticRegression(multi_class='ovr')

In [153]:
y_pred = model.predict(X_test_tfidf)
print("Multiclass CLassifier Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

Multiclass CLassifier Accuracy: 0.6808022922636103
                                    precision    recall  f1-score   support

                      I can't tell       0.00      0.00      0.00        40
                  Negative emotion       0.45      0.05      0.10        94
No emotion toward brand or product       0.70      0.89      0.78      1076
                  Positive emotion       0.60      0.43      0.50       535

                          accuracy                           0.68      1745
                         macro avg       0.44      0.34      0.35      1745
                      weighted avg       0.64      0.68      0.64      1745



  _warn_prf(average, modifier, msg_start, len(result))


Let's check if there is a class imbalance problem.