In [1]:
import pandas as pd 
import numpy as np 
import nltk 
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from gensim.models.doc2vec import Doc2Vec, TaggedDocument 
nltk.download('stopwords')

import warnings
warnings.filterwarnings('ignore')

import regex as re 
from gensim.parsing.preprocessing import remove_stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


"In a variation on the popular task of sentiment analysis, this dataset contains labels for the emotional content (such as happiness, sadness, and anger) of texts. Hundreds to thousands of examples across 13 labels. A subset of this data is used in an experiment we uploaded to Microsoft’s Cortana Intelligence Gallery."

In [2]:
#downloding data
!wget -P DATAPATH https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/train_data.csv
!wget -P DATAPATH https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/test_data.csv
!ls -lah DATAPATH

--2021-01-28 15:15:23--  https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/train_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2479133 (2.4M) [text/plain]
Saving to: ‘DATAPATH/train_data.csv’


2021-01-28 15:15:23 (30.3 MB/s) - ‘DATAPATH/train_data.csv’ saved [2479133/2479133]

--2021-01-28 15:15:23--  https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/test_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 20

In [3]:
train_df = pd.read_csv('/content/DATAPATH/train_data.csv')
train_df.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [4]:
train_df['sentiment'].value_counts()

worry         7433
neutral       6340
sadness       4828
happiness     2986
love          2068
surprise      1613
hate          1187
fun           1088
relief        1021
empty          659
enthusiasm     522
boredom        157
anger           98
Name: sentiment, dtype: int64

In [5]:
#Lets take the first 3 categories and leave out the rest 
top_3 = ['neutral','happiness','worry']
df_subset = train_df[train_df['sentiment'].isin(top_3)]
df_subset.head()

Unnamed: 0,sentiment,content
4,neutral,@dannycastillo We want to trade with someone w...
5,worry,Re-pinging @ghostridah14: why didn't you go to...
7,worry,Hmmm. http://www.djhero.com/ is down
10,neutral,cant fall asleep
11,worry,Choked on her retainers


In [6]:
df_subset['sentiment'].value_counts()

worry        7433
neutral      6340
happiness    2986
Name: sentiment, dtype: int64

## Text pre-processing:

Tweets are different. Somethings to consider:

Removing @mentions, and urls perhaps?
using NLTK Tweet tokenizer instead of a regular one
stopwords, numbers as usual.

In [7]:
#strip_handles removes personal information such as twitter handles, which don't
#contribute to emotion in the tweet. preserve_case=False converts everything to lowercase.
tweeter = TweetTokenizer(strip_handles=True, preserve_case=False)
mystopwords = set(stopwords.words('english'))

#Function to tokenize tweets, remove stopwords and numbers. 
#Keeping punctuations and emoticon symbols could be relevant for this task!
def preprocess_corpus(texts):
  def remove_stop_digits(tokens):
    return [token for token in tokens if token not in mystopwords and not token.isdigit()]
    #This return statement below uses the above function to process twitter tokenizer output further. 
  return [remove_stop_digits(tweeter.tokenize(content)) for content in texts]

In [8]:
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√']

def clean_text(text):
  text = str(text)
  for punc in puncts:
      if punc in text:
          text = text.replace(punc, ' ')
  return text

def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
    
df_subset['content'] = df_subset['content'].apply(lambda x: remove_emoji(x)) 
df_subset['content'] = df_subset['content'].apply(lambda x: clean_text(x)) 
df_subset['content'] = df_subset['content'].apply(lambda x: re.sub(r'http\S+','',x))
df_subset['content'] = df_subset['content'].apply(lambda x: re.sub("@[\w]*", '', x))
df_subset['content'] = df_subset['content'].apply(lambda x:' '.join(x.split()))
df_subset['content'] = df_subset['content'].apply(lambda x: remove_stopwords(x))

#df_subset contains only the three categories we choose
mydata = preprocess_corpus(df_subset['content'])
mycats = df_subset['sentiment']
print(len(mydata), len(mycats))


16759 16759


In [47]:
#Splitting data into train and test
X_train, X_test, y_train, y_test = train_test_split(mydata, mycats, random_state=2018)

#Preparing training data in Doc2Vec format:
train_doc2vec =  [TaggedDocument((d), tags=[str(i)]) for i, d in enumerate(X_train)]
#Train a doc2vec model to learn tweet representations. Use only training data!!
model = Doc2Vec(vector_size=50, alpha=0.025, min_count=5, dim=1, epochs=100)
model.build_vocab(train_doc2vec)
model.train(train_doc2vec, total_examples=model.corpus_count, epochs=model.epochs)
model.save('d2v.model')  

In [48]:
#Infer the feature representation for training and test data using the trained model
model = Doc2Vec.load('d2v.model')
#infer in multiple steps to get a stable representation
train_vectors = [model.infer_vector(list_of_tokens, steps=50)for list_of_tokens in X_train]
test_vectors =  [model.infer_vector(list_of_tokens, steps=50)for list_of_tokens in X_test]

#Use any classifier to train the model
from sklearn.linear_model import LogisticRegression

myclass = LogisticRegression(class_weight='balanced')
myclass.fit(train_vectors, y_train)
preds = myclass.predict(test_vectors)

from sklearn import metrics
print(metrics.classification_report(y_test, preds))


              precision    recall  f1-score   support

   happiness       0.34      0.54      0.42       799
     neutral       0.48      0.43      0.45      1551
       worry       0.58      0.49      0.53      1840

    accuracy                           0.48      4190
   macro avg       0.47      0.49      0.47      4190
weighted avg       0.50      0.48      0.48      4190



Now the performance of this models seems rather poor.  There could be couple of interertations for this poor results.

**1.** Unlike full news articles or even well-formed sentences, tweets contain very little data pper instance.

**2.** Further people write with a wide variety in spelling and syntax when they tweet. There are a lot of emoticons in different forms which our feature representation failed to capture.

