In [10]:
import pandas as pd 
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import torch

START_TOKEN = '<START>'
END_TOKEN = '<END>'


### Data Preprocessing

In [29]:
data = pd.read_csv('sentiment_tweets3.csv')
data.head()

Unnamed: 0,Index,message to examine,label (depression result)
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat Need to send 'em to my accountant tomo...,0
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0


In [30]:
data['label (depression result)'].value_counts()

0    8000
1    2314
Name: label (depression result), dtype: int64

In [31]:
# WORDNET LEMMATIZER (with appropriate pos tags)
 
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet
 
lemmatizer = WordNetLemmatizer()
 
# Define function to lemmatize each word with its POS tag
 
# POS_TAGGER_FUNCTION : TYPE 1
def pos_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None
 
sentence = 'the cat is sitting with the bats on the striped mat under many badly flying geese'
def lemmatizer_with_pos_tagging(sentence):
    # tokenize the sentence and find the POS tag for each token
    pos_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    
    # print(pos_tagged)
    #>[('the', 'DT'), ('cat', 'NN'), ('is', 'VBZ'), ('sitting', 'VBG'), ('with', 'IN'), 
    # ('the', 'DT'), ('bats', 'NNS'), ('on', 'IN'), ('the', 'DT'), ('striped', 'JJ'), 
    # ('mat', 'NN'), ('under', 'IN'), ('many', 'JJ'), ('flying', 'VBG'), ('geese', 'JJ')]
    
    # As you may have noticed, the above pos tags are a little confusing.
    
    # we use our own pos_tagger function to make things simpler to understand.
    wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tagged))
    # print(wordnet_tagged)
    #>[('the', None), ('cat', 'n'), ('is', 'v'), ('sitting', 'v'), ('with', None), 
    # ('the', None), ('bats', 'n'), ('on', None), ('the', None), ('striped', 'a'), 
    # ('mat', 'n'), ('under', None), ('many', 'a'), ('flying', 'v'), ('geese', 'a')]
    
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            # if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            # else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    lemmatized_sentence = " ".join(lemmatized_sentence)
    return lemmatized_sentence


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/aditiganeshjoshi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [32]:
data['message to examine'][1]

'is reading manga  http://plurk.com/p/mzp1e'

In [33]:
lemmatizer_with_pos_tagging(data['message to examine'][1])

'be read manga http : //plurk.com/p/mzp1e'

In [34]:
data['lemmatized_message'] = data['message to examine'].apply(lambda x:lemmatizer_with_pos_tagging(x))

In [35]:
data['lemmatized_message'][:5]

0    just have a real good moment . i miss him so m...
1             be read manga http : //plurk.com/p/mzp1e
2    @ comeagainjen http : //twitpic.com/2y2lx - ht...
3    @ lapcat Need to send 'em to my accountant tom...
4      ADD ME ON MYSPACE ! ! ! myspace.com/LookThunder
Name: lemmatized_message, dtype: object

In [36]:
#case normalization
data['normalized_message'] = data['lemmatized_message'].apply(lambda x:x.lower())
data['normalized_message'][4]

'add me on myspace ! ! ! myspace.com/lookthunder'

In [37]:
def remove_accents(text):
    accents = re.compile(u"[\u0300-\u036F]|é|è")
    text = accents.sub(u"e", text)
    return text

In [38]:
# accent removal
data['accentless_message'] = data['normalized_message'].apply(lambda x:remove_accents(x))

In [39]:
#tokenization
data['tokenized_message'] = data['accentless_message'].apply(word_tokenize)
data['tokenized_message'][4]

['add', 'me', 'on', 'myspace', '!', '!', '!', 'myspace.com/lookthunder']

In [40]:

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('english'))
 
def remove_stop_words(word_tokens):
    filtered_sentence = []
    
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
    return filtered_sentence
 



In [41]:
data['tokenized_message_final'] = data['tokenized_message'].apply(lambda x:remove_stop_words(x))
data['tokenized_message_final'][4]

['add', 'myspace', '!', '!', '!', 'myspace.com/lookthunder']

In [42]:
data['tokenized_message_final'][0]

['real', 'good', 'moment', '.', 'miss', 'much', ',']

In [43]:
data.to_csv('processed_data.csv')

#### Feature Engineering

In [60]:
data_ = pd.read_csv('processed_data.csv')

In [61]:
data_.columns

Index(['Unnamed: 0', 'Index', 'message to examine',
       'label (depression result)', 'lemmatized_message', 'normalized_message',
       'accentless_message', 'tokenized_message', 'tokenized_message_final'],
      dtype='object')

In [12]:
data_['tokenized_message_final'] = data_['tokenized_message_final'].apply(lambda x: x[1:-1].split(','))
data_['tokenized_message_final'] 

0        ['real',  'good',  'moment',  '.',  'miss',  '...
1        ['read',  'manga',  'http',  ':',  '//plurk.co...
2        ['@',  'comeagainjen',  'http',  ':',  '//twit...
3        ['@',  'lapcat',  'need',  'send',  "'em",  'a...
4        ['add',  'myspace',  '!',  '!',  '!',  'myspac...
                               ...                        
10309    ['depression',  'g',  'herbo',  'mood',  ', ',...
10310    ['depression',  'succumb',  'brain',  'make', ...
10311    ['ketamine',  'nasal',  'spray',  'shows',  'p...
10312    ['dont',  'mistake',  'bad',  'day',  'depress...
10313                                                ['0']
Name: tokenized_message_final, Length: 10314, dtype: object

In [13]:
def read_corpus(tokenized_texts):
    """ 
        Params:
            tokenized_texts: pandas Series of lists of tokens
        Return:
            list of lists, with words from each of the processed texts
    """
    return [[START_TOKEN] + text + [END_TOKEN] for text in tokenized_texts]

In [14]:
def distinct_words(corpus):
    """ Determine a list of distinct words for the corpus.
        Params:
            corpus (list of list of strings): corpus of documents
        Return:
            corpus_words (list of strings): list of distinct words across the corpus, sorted (using python 'sorted' function)
            num_corpus_words (integer): number of distinct words across the corpus
    """
    corpus_words = []
    num_corpus_words = -1

    # ------------------
    # Write your implementation here.
    all_corpus_words = [y for x in corpus for y in x]
    corpus_words = sorted(set(all_corpus_words))
    num_corpus_words = len(corpus_words)
    # end of implementation

    return corpus_words, num_corpus_words

In [15]:
def compute_co_occurrence_matrix(corpus, window_size=4):
    """ Compute co-occurrence matrix for the given corpus and window_size (default of 4).

        Note: Each word in a document should be at the center of a window. Words near edges will have a smaller
              number of co-occurring words.

              For example, if we take the document "START All that glitters is not gold END" with window size of 4,
              "All" will co-occur with "START", "that", "glitters", "is", and "not".

        Params:
            corpus (list of list of strings): corpus of documents
            window_size (int): size of context window
        Return:
            M (numpy matrix of shape (number of corpus words, number of corpus words)):
                Co-occurence matrix of word counts.
                The ordering of the words in the rows/columns should be the same as the ordering of the words given by the distinct_words function.
            word2Ind (dict): dictionary that maps word to index (i.e. row/column number) for matrix M.
    """
    words, num_words = distinct_words(corpus)
    M = None
    word2Ind = {}

    # ------------------
    # Write your implementation here.
    for i in range(len(words)):
        word2Ind[words[i]] = i
    M = np.zeros((num_words,num_words))
    for doc in corpus:
      for i in range(len(doc)):
        ind1 = word2Ind[doc[i]]
        start = i - window_size
        end = i + window_size + 1
        if i < window_size:
          start = 0
        if i > len(doc) - window_size - 1:
          end = len(doc)
        for j in range(start, end):
          if j == i:
            continue
          ind2 = word2Ind[doc[j]]
          M[ind1, ind2] += 1

    # end of implementation

    return M, word2Ind

In [16]:
corpus = read_corpus(data_['tokenized_message_final'])
corpus

[['<START>',
  "'real'",
  " 'good'",
  " 'moment'",
  " '.'",
  " 'miss'",
  " 'much'",
  " '",
  "'",
  '<END>'],
 ['<START>',
  "'read'",
  " 'manga'",
  " 'http'",
  " ':'",
  " '//plurk.com/p/mzp1e'",
  '<END>'],
 ['<START>',
  "'@'",
  " 'comeagainjen'",
  " 'http'",
  " ':'",
  " '//twitpic.com/2y2lx'",
  " '-'",
  " 'http'",
  " ':'",
  " '//www.youtube.com/watch'",
  " '?'",
  " 'v=zogfqvh2me8'",
  '<END>'],
 ['<START>',
  "'@'",
  " 'lapcat'",
  " 'need'",
  " 'send'",
  ' "\'em"',
  " 'accountant'",
  " 'tomorrow'",
  " '.'",
  " 'oddly'",
  " '",
  "'",
  ' "n\'t"',
  " 'even'",
  " 'refer'",
  " 'tax'",
  " '.'",
  " 'support'",
  " 'evidence'",
  " '",
  "'",
  " 'though'",
  " '.'",
  '<END>'],
 ['<START>',
  "'add'",
  " 'myspace'",
  " '!'",
  " '!'",
  " '!'",
  " 'myspace.com/lookthunder'",
  '<END>'],
 ['<START>',
  "'sleepy'",
  " '.'",
  " 'good'",
  " 'time'",
  " 'tonight'",
  " 'though'",
  '<END>'],
 ['<START>',
  "'@'",
  " 'silkcharm'",
  " ':'",
  " '#'",
 

In [17]:
corpus_words, num_corpus_words = distinct_words(corpus)

num_corpus_words

23163

In [18]:
M, word2Ind = compute_co_occurrence_matrix(corpus, window_size=2)
M

array([[ 0.,  0.,  0., ...,  0.,  1.,  1.],
       [ 0.,  6.,  0., ...,  0., 31., 11.],
       [ 0.,  0.,  0., ...,  0.,  2.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 1., 31.,  2., ...,  0.,  0., 42.],
       [ 1., 11.,  0., ...,  0., 42.,  0.]])

In [19]:
type(M)

numpy.ndarray

In [20]:
M.shape

(23163, 23163)

In [21]:
def calculate_PPMI(m, kind, k=2):

   alpha = 0.75
   # print(m)
   sum_m = torch.sum(m)
   # print(sum_m)
   sum_m_weighted = torch.sum(torch.float_power(m,alpha))
   # print(sum_m_weighted)

   p_ij_ = {}
   for i in range(m.shape[0]):
         for j in range(m.shape[1]):
            if m[i,j]!=0:
               p_ij_[str(i)+str(j)] = m[i,j]/sum_m
   row_prob = {}
   if kind == 'weighted':
      
      col_prob_weighted = {}
      for i in range(m.shape[0]):
         row_prob[str(i)] = torch.sum(m[i,:])/sum_m
      for j in range(m.shape[1]):
         col_prob_weighted[str(j)] = torch.sum(torch.float_power(m[:,j],alpha))/sum_m_weighted
      for i in range(m.shape[0]):
         for j in range(m.shape[1]):
            if int(m[i,j])!=0 and row_prob[str(i)]!=0 and col_prob_weighted[str(j)]!=0:
               p_i = row_prob[str(i)]
               p_j = col_prob_weighted[str(j)]
               p_ij = p_ij_[str(i)+str(j)]
               m[i, j] = max(torch.log2(p_ij/(p_i*p_j)),0)


   if kind == 'add_k':
      
      col_prob = {}
      m += k
      
      for i in range(m.shape[0]):
         row_prob[str(i)] = torch.sum(m[i,:])/sum_m
      for j in range(m.shape[1]):
         col_prob[str(j)] = torch.sum(m[:,j])/sum_m

      for i in range(m.shape[0]):
         for j in range(m.shape[1]):
            if m[i,j] != k:
               p_ij_[str(i)+str(j)] = m[i,j]/sum_m
      for i in range(m.shape[0]):
         for j in range(m.shape[1]):
            p_i = row_prob[str(i)]
            p_j = col_prob[str(j)]
            if m[i,j] != k:
               p_ij = p_ij_[str(i)+str(j)]
            else:
               p_ij = k/sum_m
            m[i, j] = max(torch.log2(p_ij/(p_i*p_j)),0)

   return m

In [22]:
mat = torch.tensor([[1,2,3],[3,4,1],[2,3,4]])
mat

tensor([[1, 2, 3],
        [3, 4, 1],
        [2, 3, 4]])

In [23]:
calculate_PPMI(mat, 'add_k')

tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]])

In [24]:
M = torch.tensor(M)
M

tensor([[ 0.,  0.,  0.,  ...,  0.,  1.,  1.],
        [ 0.,  6.,  0.,  ...,  0., 31., 11.],
        [ 0.,  0.,  0.,  ...,  0.,  2.,  0.],
        ...,
        [ 0.,  0.,  0.,  ...,  0.,  0.,  0.],
        [ 1., 31.,  2.,  ...,  0.,  0., 42.],
        [ 1., 11.,  0.,  ...,  0., 42.,  0.]], dtype=torch.float64)

In [25]:
M.shape

torch.Size([23163, 23163])

In [19]:
M

tensor([[ 0.,  0.,  0.,  ...,  0.,  1.,  1.],
        [ 0.,  6.,  0.,  ...,  0., 31., 11.],
        [ 0.,  0.,  0.,  ...,  0.,  2.,  0.],
        ...,
        [ 0.,  0.,  0.,  ...,  0.,  0.,  0.],
        [ 1., 31.,  2.,  ...,  0.,  0., 42.],
        [ 1., 11.,  0.,  ...,  0., 42.,  0.]], dtype=torch.float64)

In [20]:
m = M[:100,:100]
m.shape

torch.Size([100, 100])

In [21]:
m

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 6., 0.,  ..., 0., 1., 1.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 2., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)

In [22]:
%%time
weighted_ppmi_matrix = calculate_PPMI(m, 'weighted')
weighted_ppmi_matrix

CPU times: user 95 ms, sys: 2.12 ms, total: 97.1 ms
Wall time: 96.2 ms


tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.4173],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 3.2400, 0.0000],
        [0.0000, 1.6546, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       dtype=torch.float64)

In [23]:
data_.head()

Unnamed: 0.1,Unnamed: 0,Index,message to examine,label (depression result),lemmatized_message,normalized_message,accentless_message,tokenized_message,tokenized_message_final
0,0,106,just had a real good moment. i missssssssss hi...,0,just have a real good moment . i miss him so m...,just have a real good moment . i miss him so m...,just have a real good moment . i miss him so m...,"['just', 'have', 'a', 'real', 'good', 'moment'...","['real', 'good', 'moment', '.', 'miss', '..."
1,1,217,is reading manga http://plurk.com/p/mzp1e,0,be read manga http : //plurk.com/p/mzp1e,be read manga http : //plurk.com/p/mzp1e,be read manga http : //plurk.com/p/mzp1e,"['be', 'read', 'manga', 'http', ':', '//plurk....","['read', 'manga', 'http', ':', '//plurk.co..."
2,2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0,@ comeagainjen http : //twitpic.com/2y2lx - ht...,@ comeagainjen http : //twitpic.com/2y2lx - ht...,@ comeagainjen http : //twitpic.com/2y2lx - ht...,"['@', 'comeagainjen', 'http', ':', '//twitpic....","['@', 'comeagainjen', 'http', ':', '//twit..."
3,3,288,@lapcat Need to send 'em to my accountant tomo...,0,@ lapcat Need to send 'em to my accountant tom...,@ lapcat need to send 'em to my accountant tom...,@ lapcat need to send 'em to my accountant tom...,"['@', 'lapcat', 'need', 'to', 'send', ""'em"", '...","['@', 'lapcat', 'need', 'send', ""'em"", 'a..."
4,4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0,ADD ME ON MYSPACE ! ! ! myspace.com/LookThunder,add me on myspace ! ! ! myspace.com/lookthunder,add me on myspace ! ! ! myspace.com/lookthunder,"['add', 'me', 'on', 'myspace', '!', '!', '!', ...","['add', 'myspace', '!', '!', '!', 'myspac..."


In [39]:
def create_features_from_ppmi(tokens, m, word2Ind):
    print(tokens)
    feat = np.zeros(m.shape[1])
    for token in tokens:
        ind = word2Ind[token]
        # print(ind)
        feat = np.add(feat, m[ind,:])
    return feat


In [38]:
weighted_ppmi_matrix = torch.load('weighted_ppmi_matrix.pt')
features = data_['tokenized_message_final'].apply(lambda x: create_features_from_ppmi(x, M, word2Ind))
print(type(features))

["'real'", " 'good'", " 'moment'", " '.'", " 'miss'", " 'much'", " '", "'"]
22566
8413
12816
218
12678
13059
153
21174
["'read'", " 'manga'", " 'http'", " ':'", " '//plurk.com/p/mzp1e'"]
22562
12019
9395
1871
710
["'@'", " 'comeagainjen'", " 'http'", " ':'", " '//twitpic.com/2y2lx'", " '-'", " 'http'", " ':'", " '//www.youtube.com/watch'", " '?'", " 'v=zogfqvh2me8'"]
21250
5092
9395
1871
849
169
9395
1871
1295
1886
19726
["'@'", " 'lapcat'", " 'need'", " 'send'", ' "\'em"', " 'accountant'", " 'tomorrow'", " '.'", " 'oddly'", " '", "'", ' "n\'t"', " 'even'", " 'refer'", " 'tax'", " '.'", " 'support'", " 'evidence'", " '", "'", " 'though'", " '.'"]
21250
11155
13338
16848
30
2049
19035
218
13809
153
21174
132
7192
15852
18448
218
18192
7218
153
21174
18823
218
["'add'", " 'myspace'", " '!'", " '!'", " '!'", " 'myspace.com/lookthunder'"]
21269
13173
154
154
154
13174
["'sleepy'", " '.'", " 'good'", " 'time'", " 'tonight'", " 'though'"]
22723
218
8413
18897
19048
18823
["'@'", " 'silkcharm

In [44]:
type(features[0])

torch.Tensor

In [43]:
features

0        [tensor(0., dtype=torch.float64), tensor(86., ...
1        [tensor(0., dtype=torch.float64), tensor(28., ...
2        [tensor(0., dtype=torch.float64), tensor(69., ...
3        [tensor(0., dtype=torch.float64), tensor(173.,...
4        [tensor(0., dtype=torch.float64), tensor(69., ...
                               ...                        
10309    [tensor(0., dtype=torch.float64), tensor(64., ...
10310    [tensor(0., dtype=torch.float64), tensor(12., ...
10311    [tensor(0., dtype=torch.float64), tensor(89., ...
10312    [tensor(0., dtype=torch.float64), tensor(53., ...
10313    [tensor(0., dtype=torch.float64), tensor(0., d...
Name: tokenized_message_final, Length: 10314, dtype: object

In [48]:
feat_tensor = torch.vstack(list(features))

In [49]:
feat_tensor

tensor([[0.0000e+00, 8.6000e+01, 6.0000e+00,  ..., 0.0000e+00, 2.0660e+03,
         3.5300e+02],
        [0.0000e+00, 2.8000e+01, 4.0000e+00,  ..., 0.0000e+00, 4.6800e+02,
         1.4200e+02],
        [0.0000e+00, 6.9000e+01, 8.0000e+00,  ..., 1.0000e+00, 1.5110e+03,
         4.8900e+03],
        ...,
        [0.0000e+00, 8.9000e+01, 4.0000e+00,  ..., 0.0000e+00, 1.1700e+03,
         5.3000e+02],
        [0.0000e+00, 5.3000e+01, 2.0000e+00,  ..., 0.0000e+00, 4.5150e+03,
         5.0700e+02],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 1.0000e+00,
         1.0000e+00]], dtype=torch.float64)

In [59]:
feat_np = feat_tensor.numpy()

In [62]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(feat_np, data_['label (depression result)'])

KeyboardInterrupt: 

In [None]:
# pred_y = clf.predict(feat_np)
clf.score(feat_np, data_['label (depression result)'])

In [54]:
feat_tensor.size()[1]

23163

In [50]:

# build custom module for logistic regression
class LogisticRegression(torch.nn.Module):    
    # build the constructor
    def __init__(self, n_inputs, n_outputs):
        super(LogisticRegression, self).__init__()
        self.linear = torch.nn.Linear(n_inputs, n_outputs)
    # make predictions
    def forward(self, x):
        y_pred = torch.sigmoid(self.linear(x))
        return y_pred

In [55]:
log_regr = LogisticRegression(feat_tensor.size()[1], 2)

In [57]:
feat_tensor.dtype

torch.float64

In [58]:
criterion = nn.CrossEntropyLoss()  

NameError: name 'nn' is not defined

In [None]:
learning_rate = 0.001

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)  

In [56]:
outputs = log_regr(feat_tensor)

RuntimeError: mat1 and mat2 must have the same dtype, but got Double and Float