In [52]:
# import statements
import pandas as pd
from zipfile import ZipFile
from io import BytesIO
from urllib.request import urlopen
import os
import spacy
import re
import string
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.porter import * 

# Feature Engineering
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
# download the training data from Sentiment140
zipurl = 'http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip'
with urlopen(zipurl) as zipresp:
    with ZipFile(BytesIO(zipresp.read())) as zfile:
        zfile.extractall()

In [8]:
train = pd.read_csv('training.1600000.processed.noemoticon.csv', 
                    header = None,
                    names = ['score', 'id', 'date', 'query', 'user', 'tweet'],
                    encoding='latin-1')
print(train.head())

   score          id                          date     query             user  \
0      0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY  _TheSpecialOne_   
1      0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY    scotthamilton   
2      0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY         mattycus   
3      0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY          ElleCTF   
4      0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY           Karoli   

                                               tweet  
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1  is upset that he can't update his Facebook by ...  
2  @Kenichan I dived many times for the ball. Man...  
3    my whole body feels itchy and like its on fire   
4  @nationwideclass no, it's not behaving at all....  


In [9]:
# extract the information that we want
data_train = train[['score', 'tweet']]
print(data_train.head)

<bound method NDFrame.head of          score                                              tweet
0            0  @switchfoot http://twitpic.com/2y1zl - Awww, t...
1            0  is upset that he can't update his Facebook by ...
2            0  @Kenichan I dived many times for the ball. Man...
3            0    my whole body feels itchy and like its on fire 
4            0  @nationwideclass no, it's not behaving at all....
...        ...                                                ...
1599995      4  Just woke up. Having no school is the best fee...
1599996      4  TheWDB.com - Very cool to hear old Walt interv...
1599997      4  Are you ready for your MoJo Makeover? Ask me f...
1599998      4  Happy 38th Birthday to my boo of alll time!!! ...
1599999      4  happy #charitytuesday @theNSPCC @SparksCharity...

[1600000 rows x 2 columns]>


In [10]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
sentencizer = nlp.add_pipe("sentencizer")

def tweet_processing(comment):
    
    modComm = comment
    
    # Remove HTML special entities (e.g. &amp;)
    modComm = re.sub(r'\&\w*;', '', modComm)
    # Convert @username to AT_USER
    modComm = re.sub('@[^\s]+','',modComm)
    # Remove tickers
    modComm = re.sub(r'\$\w*', '', modComm)
    # To lowercase
    modComm = modComm.lower()
    # Remove hyperlinks
    modComm = re.sub(r'https?:\/\/.*\/\w*', '', modComm)
    # Remove hashtags
    modComm = re.sub(r'#\w*', '', modComm)
    # Remove whitespace (including new line characters)
    modComm = re.sub(r'\s\s+', ' ', modComm)
    # Remove single space remaining at the front of the tweet.
    modComm = modComm.lstrip(' ') 
    
    return modComm

def text_processing(tweet):
    
     # Check characters to see if they are in punctuation
    nopunc = [char for char in list(tweet) if char not in string.punctuation]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    
    # Now just remove any stopwords
    return [word for word in nopunc.lower().split() if word.lower() not in stopwords.words('english')]
    
# Lexicon normalisation with Stemming 
def stemming(tokens):
    """
    Takes in a string of text, then performs the following:
    1. Replace words for its root based on orter Stemmer rule.
    2. Returns normalised text
    """
    stemmer = PorterStemmer()
    x = [stemmer.stem(w) for w in tokens]
    
    return ' '.join(x)

In [28]:
mat = []
for i in range(0, len(data_train), 1):
    comment = stemming(text_processing(tweet_processing(data_train['tweet'][i])))
    mat.append([data_train['score'][i], comment])

In [33]:
print(len(mat))
print(len(mat[0]))
print(len(mat[1]))
print(mat[0:10])

1600000
2
2
[[0, 'awww that bummer shoulda got david carr third day'], [0, 'upset cant updat facebook text might cri result school today also blah'], [0, 'dive mani time ball manag save 50 rest go bound'], [0, 'whole bodi feel itchi like fire'], [0, 'behav im mad cant see'], [0, 'whole crew'], [0, 'need hug'], [0, 'hey long time see ye rain bit bit lol im fine thank how'], [0, 'nope didnt'], [0, 'que muera']]


In [36]:
mat = pd.DataFrame(mat, columns=['score', 'tokens'])

In [37]:
mat.to_csv('cleaned_train_data.csv', index=False)

In [39]:
mat.dropna(subset=['tokens'], inplace=True)

In [44]:
analyser = SentimentIntensityAnalyzer()

def polarity_scores_all(tweet):
  '''
  Takes string of text to:
  1. Gets sentiment metrics
  2. Returns negative, neutral, positive 
  and compound scores as lists.
  '''
  neg, neu, pos, compound = [], [], [], []
  analyser = SentimentIntensityAnalyzer()
  
  for text in tweet:
    dict_ = analyser.polarity_scores(text)
    neg.append(dict_['neg'])
    neu.append(dict_['neu'])
    pos.append(dict_['pos'])
    compound.append(dict_['compound'])
  
  return neg, neu, pos, compound

In [45]:
all_scores = polarity_scores_all(mat.tokens.values)
mat['neg_scores'] = all_scores[0]
mat['neu_scores'] = all_scores[1]
mat['pos_scores'] = all_scores[2]
mat['compound_scores'] = all_scores[3]

In [46]:
mat.head(4)

Unnamed: 0,score,tokens,neg_scores,neu_scores,pos_scores,compound_scores
0,0,awww that bummer shoulda got david carr third day,0.245,0.755,0.0,-0.3818
1,0,upset cant updat facebook text might cri resul...,0.286,0.714,0.0,-0.4588
2,0,dive mani time ball manag save 50 rest go bound,0.0,0.738,0.262,0.4939
3,0,whole bodi feel itchi like fire,0.27,0.449,0.281,0.0258


In [47]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/tony_niu/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [53]:
# To transform pos tags to readable tags
pos_family = {  
    'NOUN' : ['NN','NNS','NNP'], # Removed 'NNPS'
    'PRON' : ['PRP','PRP$','WP','WP$'],
    'VERB' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'ADJ' :  ['JJ','JJR','JJS'],
    'ADV' : ['RB','RBR','RBS','WRB']
}

def count_pos_tag(tweets):
    '''
    Takes string of text to:
    1. Processes text and attaches POS tags
    2. Input the dictionary of POS tags into a Counter.
    3. Returns list of POS tags with occurrence number.
    '''
    total_count = []
    for s in tweets:
        partial_count = {}
        s = s.split()
        count_pos = Counter(dict(nltk.pos_tag(s)).values())
        
        for item, value in count_pos.items():
            partial_count[item] = partial_count.get(item, 0) + 1
            
        total_count.append(partial_count)
        
    return total_count

In [54]:
# Retrieve POS tags with occurrence 
total_count = count_pos_tag(mat.tokens.values)

# As dataframe 
pos_df = pd.DataFrame(total_count)

# Remove unwanted characters
pos_df = pos_df.drop(['$', 'IN'], axis = 1) #drop '$' if needed

# Inspection
pos_df.columns

Index(['NN', 'VBD', 'JJ', 'MD', 'VB', 'RB', 'NNS', 'VBP', 'CD', 'WRB', 'VBZ',
       'VBN', 'UH', 'CC', 'DT', 'PRP', 'JJR', 'PRP$', 'RP', 'JJS', 'NNP',
       'VBG', 'EX', 'RBR', 'RBS', 'FW', 'TO', 'WP', 'WDT', 'SYM', 'PDT', 'WP$',
       '''', 'LS', 'POS', '``', 'NNPS'],
      dtype='object')

In [55]:
# Change tags to readable tags

pos_df['NOUN'] = pos_df[pos_family['NOUN']].sum(axis=1)
pos_df['PRON'] = pos_df[pos_family['PRON']].sum(axis=1)
pos_df['VERB'] = pos_df[pos_family['VERB']].sum(axis=1)
pos_df['ADJ'] = pos_df[pos_family['ADJ']].sum(axis=1)
pos_df['ADV'] = pos_df[pos_family['ADV']].sum(axis=1)

pos_df = pos_df[['NOUN', 'PRON', 'VERB', 'ADJ', 'ADV']]

In [57]:
# Add to end of original data set as new features 
mat = pd.concat([mat, pos_df], axis = 1)

# Deal with NaN
mat = mat.fillna(value=0.0)

#train = train.fillna(value=0.0)
mat.shape

# Remove duplicates 
mat.drop_duplicates(subset=['tokens'], inplace=True)

In [58]:
# Check new features
mat.info()

mat.head(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1476969 entries, 0 to 1599998
Data columns (total 11 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   score            1476969 non-null  int64  
 1   tokens           1476969 non-null  object 
 2   neg_scores       1476969 non-null  float64
 3   neu_scores       1476969 non-null  float64
 4   pos_scores       1476969 non-null  float64
 5   compound_scores  1476969 non-null  float64
 6   NOUN             1476969 non-null  float64
 7   PRON             1476969 non-null  float64
 8   VERB             1476969 non-null  float64
 9   ADJ              1476969 non-null  float64
 10  ADV              1476969 non-null  float64
dtypes: float64(9), int64(1), object(1)
memory usage: 135.2+ MB


Unnamed: 0,score,tokens,neg_scores,neu_scores,pos_scores,compound_scores,NOUN,PRON,VERB,ADJ,ADV
0,0,awww that bummer shoulda got david carr third day,0.245,0.755,0.0,-0.3818,1.0,0.0,1.0,1.0,0.0
1,0,upset cant updat facebook text might cri resul...,0.286,0.714,0.0,-0.4588,1.0,0.0,2.0,1.0,1.0
2,0,dive mani time ball manag save 50 rest go bound,0.0,0.738,0.262,0.4939,2.0,0.0,1.0,1.0,0.0
3,0,whole bodi feel itchi like fire,0.27,0.449,0.281,0.0258,2.0,0.0,1.0,1.0,0.0
4,0,behav im mad cant see,0.444,0.556,0.0,-0.4939,1.0,0.0,0.0,1.0,0.0


In [59]:
mat.to_csv('feat_eng_train_data.csv', index = False)

In [60]:
file_path = 'training.1600000.processed.noemoticon.csv'
os.remove(file_path)
file_path = 'testdata.manual.2009.06.14.csv'
os.remove(file_path)