# Read in Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
data=pd.read_csv("/content/train.csv")
data.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [6]:
data.shape

(8589, 3)

In [7]:
data.shape

(8589, 3)

# Working With Label

In [8]:
data['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

No emotion toward brand or product    5389
Positive emotion                      2672
Negative emotion                       519
I can't tell                             9
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

# Cleaning  steps                                       
removing HTML Tags                                        
extracting emojies    # The pattern re.compile('(?::|;|=)(?:-)?(?:\)|\(|D|P)') is a regular expression                                                   used to match and extract emojis from a given text.                                                
removing special chars,puntuation,sumbols                                                 
lower casing    
removing stopwords                                
tokenization

In [None]:
import nltk
nltk.download('all')

In [11]:
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [16]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

stopwords_set = set(stopwords.words('english'))
emoji_pattern = re.compile('(?::|;|=)(?:-)?(?:\)|\(|D|P)')

def preprocessing(text):
    if isinstance(text, str):  # Check if text is a string
        text = re.sub('<[^>]*>', '', text)
        emojis = emoji_pattern.findall(text)
        text = re.sub('[\W+]', ' ', text.lower()) + ' '.join(emojis).replace('-', '')

        prter = PorterStemmer()
        text = [prter.stem(word) for word in text.split() if word not in stopwords_set]

        return " ".join(text)
    else:
        return ""  # Return an empty string or handle other types accordingly

data['text'] = data['tweet_text'].apply(lambda x: preprocessing(x))


In [37]:
data['text']

0       wesley83 3g iphon 3 hr tweet rise_austin dead ...
1       jessede know fludapp awesom ipad iphon app lik...
2                   swonderlin wait ipad 2 also sale sxsw
3        sxsw hope year festiv crashi year iphon app sxsw
4       sxtxstate great stuff fri sxsw marissa mayer g...
                              ...                        
8584                             ipad everywher sxsw link
8585    wave buzz rt mention interrupt regularli sched...
8586    googl zeiger physician never report potenti ae...
8587    verizon iphon custom complain time fell back h...
8588    ϡ _ ʋ ҋ _ _ rt mention googl test check offer ...
Name: text, Length: 8589, dtype: object

In [18]:
label_mapping = {
    "No emotion toward brand or product": 0,
    "Positive emotion": 1,
    "Negative emotion": 2,
    "I can't tell": 3
}

# Replace text labels with integer values
data['label'] = data['is_there_an_emotion_directed_at_a_brand_or_product'].map(label_mapping)


# TF-IDF Vertorizer to convert the raw documents into feature matrix

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf=TfidfVectorizer(strip_accents=None,lowercase=False,preprocessor=None,use_idf=True,norm='l2',smooth_idf=True)
y=data.label.values
x=tfidf.fit_transform(data.text)

# Training Machine Learning Model 

In [20]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(x,y,random_state=1,test_size=0.25,shuffle=False)

In [21]:
from sklearn.linear_model import LogisticRegressionCV
clf=LogisticRegressionCV(cv=6,scoring='accuracy',random_state=0,n_jobs=-1,verbose=3,max_iter=500).fit(X_train,y_train)
y_pred = clf.predict(X_test)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  1.3min finished


# Accuracy

In [22]:
from sklearn import metrics

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7304469273743017


In [40]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# Example code to calculate other metrics
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the additional metrics
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)


Precision: 0.7154967678677481
Recall: 0.7304469273743017
F1 Score: 0.7132730342215204
Confusion Matrix:
 [[1206  151   15    0]
 [ 319  341    3    0]
 [  64   22   22    0]
 [   3    2    0    0]]


  _warn_prf(average, modifier, msg_start, len(result))


# Pickle files

In [23]:
import pickle
pickle.dump(clf,open('clf.pkl','wb'))
pickle.dump(tfidf,open('tfidf.pkl','wb'))

# prediction

In [32]:
def prediction(comment):
    preprocessed_comment = preprocessing(comment)
    print(preprocessed_comment)
    comment_list = [preprocessed_comment]  # Wrap the preprocessed comment in a list
    comment_vector = tfidf.transform(comment_list)
    prediction = clf.predict(comment_vector)[0]
    return prediction



prediction = prediction('mention rt mention googl launch new social network call circl possibl today link sxsw report buzz')

mention rt mention googl launch new social network call circl possibl today link sxsw report buzz


In [33]:
if prediction == 1:
    print("Positive emotion")
elif prediction == 2:
    print("Negative emotion")
elif prediction == 0:
    print("No emotion toward brand or product")
else:
    print("I can't tell")


No emotion toward brand or product
