In [1]:
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
import re
import string
import random
from nltk.tokenize import WordPunctTokenizer
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /home/zshan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/zshan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/zshan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/zshan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

Cleaning the data, do not run it all the time

In [3]:
def extract_csv():
    my_filtered_csv = pd.read_csv('./betterdata.csv', usecols=['SENTIMENT', 'TEXT'])
    return my_filtered_csv

def tokenize_tweets(my_csv):
    tweets = my_csv.TEXT.tolist()
    sentiments = my_csv.SENTIMENT.tolist()
    tokenizer = WordPunctTokenizer() 
    cleaned = []
    for i in range(0, len(tweets)):
        text = tweets[i]
        text = re.sub('^https?://.*[rn]*','', text)
        text = re.sub("(@[A-Za-z0-9_]+)","", text)
        text = re.sub("([^\w\s])", "", text)
        text = tokenizer.tokenize(text)
        element = [text, sentiments[i]]
        cleaned.append(element)
    return cleaned

def lemmatize_sentence(tweet_tokens, stop_words = ()):
    lemmatizer = WordNetLemmatizer()
    cleaned_tokens = []
    for token, tag in pos_tag(tweet_tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('V'):
            pos = 'v'
        else:
            pos = 'a'
        token = lemmatizer.lemmatize(token, pos)
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

def create_lemmatized_sent(words):
    cleaned = []
    stop_words = stopwords.words('english')
    for i in range(0, len(words)):
        sent = lemmatize_sentence(words[i][0], stop_words)
        if len(sent) > 0:
            element = [sent, words[i][1]]
            cleaned.append(element)
    return cleaned

def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

def write_sent(sent):
    cleaned = []
    for i in sent:
        s = ""
        for j in i[0]:
            j = str(j)
            j = j + " "
            s = s + j
        s = remove_emoji(s)
        element = [s, i[1]]
        cleaned.append(element)
    df = pd.DataFrame(cleaned)
    df.to_csv('cleaned_data.csv', index=False)

my_csv = extract_csv()
words = tokenize_tweets(my_csv)
sent = create_lemmatized_sent(words)
write_sent(sent)

In [6]:
df = pd.read_csv('cleaned_data.csv')

# Converting the the columns to usable lists in sklearn
text = df.TEXT.tolist()
sentiment = df.SENTIMENT.tolist()


Fitting and converting the text into vectors which will be later used for training and testing in different models

Using word2vec google

In [9]:
# Constants used:
import word2vec
vec_size = 100

In [10]:
# my_filtered_csv = pd.read_csv('./betterdata.csv', usecols=['SENTIMENT', 'TEXT'])
word2vec.word2phrase('cleaned_data.csv', 'cleaned-phrases', verbose=True)

Running command: word2phrase -train cleaned_data.csv -output cleaned-phrases -min-count 5 -threshold 100 -debug 2
Starting training using file cleaned_data.csv
Words processed: 600K     Vocab size: 387K  
Vocab size (unigrams + bigrams): 219917
Words in train file: 655354


In [11]:
word2vec.word2vec('cleaned-phrases', 'cleaned.bin', size=vec_size, binary=True, verbose=True)

Running command: word2vec -train cleaned-phrases -output cleaned.bin -size 100 -window 5 -sample 1e-3 -hs 0 -negative 5 -threads 12 -iter 5 -min-count 5 -alpha 0.025 -debug 2 -binary 1 -cbow 1
Starting training using file cleaned-phrases
Vocab size: 8275
Words in train file: 663553
Alpha: 0.000185  Progress: 99.56%  Words/thread/sec: 298.34k  

In [12]:
word2vec.word2clusters('cleaned_data.csv', 'cleaned-clusters.txt', 100, verbose=True)

Running command: word2vec -train cleaned_data.csv -output cleaned-clusters.txt -size 100 -window 5 -sample 1e-3 -hs 0 -negative 5 -threads 12 -iter 5 -min-count 5 -alpha 0.025 -debug 2 -binary 0 -cbow 1 -classes 100
Starting training using file cleaned_data.csv
Vocab size: 7971
Words in train file: 671900
Alpha: 0.000237  Progress: 99.53%  Words/thread/sec: 299.19k  

In [13]:
%load_ext autoreload
%autoreload 2

In [14]:
w2v_model = word2vec.load('cleaned.bin')

Averaging the vectors of all the words in the sentance to get a vector for the sentence

In [16]:
cleaned_values = []
cleaned_labels = [] 
for ind_1, sentences in enumerate(text):
    cur_sentence = [0] * vec_size
    num_words = 0
    for word in sentences.split(' '):
        word.strip(' ')
        if len(word) == 0:
            continue
        if word not in w2v_model.vocab:
            continue
        cur_sentence = [a + b for a, b in zip(cur_sentence, w2v_model[word])]
        num_words += 1
    if num_words == 0:
        continue
    for ind, val in enumerate(cur_sentence):
        cur_sentence[ind] = val / num_words
    cur_sentence.append(sentiment[ind_1])
    cleaned_values.append(cur_sentence)
    cleaned_labels.append(sentiment[ind_1])



Normalizing the columns

In [17]:
import pandas as pd
from sklearn.preprocessing import StandardScaler


In [18]:
df = pd.DataFrame(cleaned_values)

In [19]:
scaled_features = StandardScaler().fit_transform(df.values)
scaled_features_df = pd.DataFrame(scaled_features, index=df.index, columns=df.columns)

In [20]:
train_features =df.sample(frac=0.8,random_state=42)
test_features = df.drop(train_features.index)

In [23]:
X_train = train_features[range(0, vec_size)]
Y_train = train_features[vec_size]
X_val = test_features[range(0, vec_size)]
Y_val = test_features[vec_size]

Data has been cleaned and the sentences have been converted into vectors of 100 dimensions. Now we run different models on it

In [24]:
from sklearn.linear_model import LogisticRegression
LR_model = LogisticRegression()
LR_model.fit(X_train, Y_train)

LogisticRegression()

Creating a function which will take input sentence and then output the scale of depression from 0 to 4

In [45]:
def sentence_clean(sentence):
    tokenizer = WordPunctTokenizer() 
    cleaned = []
    sentence = re.sub('^https?://.*[rn]*','', sentence)
    sentence = re.sub("(@[A-Za-z0-9_]+)","", sentence)
    sentence = re.sub("([^\w\s])", "", sentence)
    sentence = tokenizer.tokenize(sentence)
    
    cleaned = []
    # stop_words = stopwords.words('english')
    sent = lemmatize_sentence(sentence)
    result = ''
    if len(sent) > 0:
        result = ' '.join(sent)
    return result



def depression_scale(sentence):
    # cleaning the sentence
    clean_sentence = sentence_clean(sentence)
    word_vector = [0] * vec_size
    num_words = 0
    for word in clean_sentence.split(' '):
        word.strip(' ')
        if len(word) == 0:
            continue
        if word in w2v_model.vocab:
            word_vector = [a + b for a, b in zip(word_vector, w2v_model[word])]
        else:
            continue
        num_words += 1
    if num_words == 0:
        return 2
    
    for ind, val in enumerate(word_vector):
        word_vector[ind] = val / num_words
    y_result_probs = 4 * LR_model.predict_proba([word_vector])[0][1]
    return y_result_probs
    

In [49]:
print(depression_scale("I am so happy that I want to die"))

0.5076293401651547
