In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import string
from nltk.corpus import stopwords
from emot.emo_unicode import UNICODE_EMO, EMOTICONS
from string import digits
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras import layers
from keras.layers import Activation, Dense , Dropout,Bidirectional, GlobalMaxPool1D , LSTM
from keras.layers import Embedding, LSTM, Dense, Dropout,Conv1D,GlobalMaxPooling1D
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.utils import to_categorical
import os
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from sklearn.metrics import roc_auc_score
from nltk.tokenize import word_tokenize
import gensim
from sklearn.metrics import classification_report,confusion_matrix

Using TensorFlow backend.


In [2]:
forumPosts = pd.read_excel('stanfordMOOCForumPostsSet.xlsx')

In [3]:
forumPosts[:5]

Unnamed: 0,Text,Opinion(1/0),Question(1/0),Answer(1/0),Sentiment(1-7),Confusion(1-7),Urgency(1-7),CourseType,forum_post_id,course_display_name,forum_uid,created_at,post_type,anonymous,anonymous_to_peers,up_count,comment_thread_id,reads
0,Interesting! How often we say those things to ...,1,0,0,6.5,2.0,1.5,Education,5225177f2c501f0a00000015,Education/EDUC115N/How_to_Learn_Math,30CADB93E6DE4711193D7BD05F2AE95C,2013-09-02 22:55:59,Comment,0.0,0.0,0.0,5221a8262cfae31200000001,41.0
1,"What is \Algebra as a Math Game\"" or are you j...",0,1,0,4.0,5.0,3.5,Education,5207d0e9935dfc0e0000005e,Education/EDUC115N/How_to_Learn_Math,37D8FAEE7D0B94B6CFC57D98FD3D0BA5,2013-08-11 17:59:05,Comment,0.0,0.0,0.0,520663839df35b0a00000043,55.0
2,I like the idea of my kids principal who says ...,1,0,0,5.5,3.0,2.5,Education,52052c82d01fec0a00000071,Education/EDUC115N/How_to_Learn_Math,CC11480215042B3EB6E5905EAB13B733,2013-08-09 17:53:06,Comment,0.0,0.0,0.0,51e59415e339d716000001a6,25.0
3,"From their responses, it seems the students re...",1,0,0,6.0,3.0,2.5,Education,5240a45e067ebf1200000008,Education/EDUC115N/How_to_Learn_Math,C717F838D10E8256D7C88B33C43623F1,2013-09-23 20:28:14,CommentThread,0.0,0.0,0.0,,0.0
4,"The boys loved math, because \there is freedom...",1,0,0,7.0,2.0,3.0,Education,5212c5e2dd10251500000062,Education/EDUC115N/How_to_Learn_Math,F83887D68EA48964687C6441782CDD0E,2013-08-20 01:26:58,CommentThread,0.0,0.0,0.0,,3.0


### The below Cell contains all the Functions used in the Notebook which help us in pre processing the Data.

* remove_punct : This function is used to remove all the punctuations from the text
* tokenization : This function is used to split longer strings of data into smaller strings
* load_data : In this fucntion, we are converting the data which we obtained into readable format, in this case into dataframes. We also remove jargon values from the text such as start, end and user.
* stemming : We use stemming to remove the affixes from a word and obtain the root word
* lemmatizer : We use lemmatization to capture canonical forms based on a word's lemma. Eg : better → good
* convert_emojis : We use the convert emojis fucntion to convert the emojis into the their meaning. eg : a sushi emoji will be changed to the word sushi.
* convert_emoticons : We use the convert emojis fucntion to convert the emojis into the their meaning. eg : a happy emoji will be changed to the text happy.
* preprocessing : The preprocessing function is used to preprocess the text. In this function we call the others functions too which will help us in preprocessing the data. We remove punctuations, we remove stop words, emojis and emoticons. We also sem and lemmatize the data. we change the data into lower case.

In [4]:
def remove_punct(text):
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    return text

def tokenization(text):
    text = re.split('\W+', text)
    return text

ps = nltk.PorterStemmer()

def stemming(text):
    text = [ps.stem(word) for word in text]
    return text

wn = nltk.WordNetLemmatizer()

def lemmatizer(text):
    text = [wn.lemmatize(word) for word in text]
    return text

# Converting emojis to words
def convert_emojis(text):
    for emot in UNICODE_EMO:
        text = text.replace(emot, "_".join(UNICODE_EMO[emot].replace(",","").replace(":","").split()))
    return text

# Converting emoticons to words    
def convert_emoticons(text):
    for emot in EMOTICONS:
        text = re.sub(u'('+emot+')', "_".join(EMOTICONS[emot].replace(",","").split()), text)
    return text

def preprocessing(df):    
    df= df.apply(lambda x: convert_emojis(x))
    df= df.apply(lambda x: convert_emoticons(x))
    df = df.apply(lambda x: " ".join(x.lower() for x in x.split()))
    stop = stopwords.words('english')
    df = df.apply(lambda x: " ".join(x for x in x.split() if x not in stop))
    df = df.apply(lambda x: remove_punct(x))
    df = df.apply(lambda x: tokenization(x.lower()))
    df = df.apply(lambda x: stemming(x))
    df = df.apply(lambda x: lemmatizer(x))
    
    for i in range(0, len(df)):
        processed_feature = re.sub(r'\W', ' ', str(df[i]))
        processed_feature = re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)
        processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)
        processed_feature = re.sub(r'^b\s+', '', processed_feature)
        df[i] = processed_feature.lower()
    return df


def int_to_string(sentiment):
    if sentiment in [1,1.5,2,2.5,3]:
        return 'Negative'
    elif sentiment in [3.5,4,4.5]:
        return 'Neutral'
    elif sentiment in [5,5.5,6,6.5,7]:
        return 'Positive'

#### While preprocessing the texts, we were facing issues because all the forumPosts['Text'] values were not in the string format and were in Int and Float format too. 

#### Hence, we converted all the values to the String format and then proceeded with preprocessing 

In [5]:
print("Index of Values which are not string type")
for i in range(0, len(forumPosts)):
    #print(type(forumPosts['Text'][i]))
    
    if type(forumPosts['Text'][i])!=str:
        print(i)

Index of Values which are not string type
11157
18312
19732
23525
24285
27323


In [6]:
for i in range(0, len(forumPosts)):
    if type(forumPosts['Text'][i])!=str:
        forumPosts['Text'][i] = str(forumPosts['Text'][i])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
print("Index of Values which are not string type")
for i in range(0, len(forumPosts)):
    #print(type(forumPosts['Text'][i]))
    if type(forumPosts['Text'][i])!=str:
        print(i)

Index of Values which are not string type


In [8]:
forumPosts["Text"] = preprocessing(forumPosts["Text"])

In [9]:
forumPosts["Sentiments"] = forumPosts["Sentiment(1-7)"].apply(int_to_string)

In [10]:
forumPosts["Sentiments"].unique()

array(['Positive', 'Neutral', 'Negative'], dtype=object)

In [11]:
X = forumPosts.drop('Sentiment(1-7)', axis=1)
y = forumPosts['Sentiments']

In [12]:
# 80% training and 20% test
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2,random_state=69) 

In [13]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(23683, 18)
(5921, 18)
(23683,)
(5921,)


# Vectorize using TFIDF

In [14]:
tfidf_vect = TfidfVectorizer(max_features=500,min_df=5, max_df=0.8,ngram_range=(1,3))
tfidf_vect.fit(X_train["Text"])
X_train_tfidf = tfidf_vect.transform(X_train["Text"])
X_test_tfidf = tfidf_vect.transform(X_test["Text"])
#y_train_dummy = to_categorical(y_train)
#y_test_dummy = to_categorical(y_test)
y_train_dummy = pd.get_dummies(y_train).values
y_test_dummy = pd.get_dummies(y_test).values

In [15]:
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)
print(y_train_dummy.shape)
print(y_test_dummy.shape)

(23683, 500)
(5921, 500)
(23683, 3)
(5921, 3)


# Building a CNN Model

* Dense is a standard layer type that is used in many cases for neural networks.
* Relu that is rectified linear activation function returns the value provided as input directly, when training a neural network.
* add function is used to add layers to our model.
* Sequential model is used as the layers are stacked sequentially that is input and output layer with their respective shapes.
* As the output layer is a multiclass classification problem "softmax" has been used as output layer.


In [16]:
vocab_size=5000
maxlen = X_train_tfidf.shape[1]

embedding_size=32
model=Sequential()
model.add(Embedding(vocab_size, embedding_size, input_length=maxlen))
model.add(Dropout(0.25))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(10, activation='relu'))
model.add(Dropout(0.25))

model.add(Dense(3, activation='softmax'))

In [17]:
model.compile(loss='categorical_crossentropy', 
             optimizer='adam', 
              metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 500, 32)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 496, 128)          20608     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                1290      
_________________________________________________________________
dropout_2 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                

In [18]:
history = model.fit(X_train_tfidf, y_train_dummy,
                    epochs=10,
                    verbose=True,
                    validation_data=(X_test_tfidf, y_test_dummy),
                    batch_size=256)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 23683 samples, validate on 5921 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [19]:
from sklearn.metrics import classification_report,confusion_matrix

y_pred_tfidf = model.predict(X_test_tfidf)
y_test_dummy = np.argmax(y_test_dummy, axis=1)
y_pred_tfidf = np.argmax(y_pred_tfidf, axis=1)
print(classification_report(y_test_dummy,y_pred_tfidf))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       354
           1       0.79      1.00      0.88      4662
           2       0.00      0.00      0.00       905

    accuracy                           0.79      5921
   macro avg       0.26      0.33      0.29      5921
weighted avg       0.62      0.79      0.69      5921



  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
pred_prob_tfidf = model.predict_proba(X_test_tfidf)
auc_score = roc_auc_score(y_test_dummy, pred_prob_tfidf, multi_class="ovo",
                                  average="macro")
print('ROC-AUC = %.2f'% auc_score)

ROC-AUC = 0.51


# Word2Vec features

In [21]:
text_lines = list()
texts = forumPosts['Text'].values.tolist()

texts[:5]

[' interest often say thing other without realli understand say must power experi excel ',
 ' algebra math game say creat game incorpor algebra ',
 ' like idea kid princip say smart mean easi smart mean work hard incorpor idea make mistak work hard ',
 ' respons seem student realli like power felt free solv math way want use academ languag like decompos number friendli number abl explain mean ',
 ' boy love math there freedom anyth great way see math number think student realiz math could taught differ thought that learn third grader need know go fourth grade thi inspir happyfaceorsmiley ']

In [22]:
for text in texts:
    tokens=word_tokenize(text)
    tokens=[w.lower() for w in tokens]
    table = str.maketrans('','',string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not  w in stop_words]
    text_lines.append(words)

In [23]:
len(text_lines)

29604

In [24]:
gensim_model = gensim.models.Word2Vec(sentences=text_lines,
                              min_count=1,
                     window=5,
                     size=100, 
                     workers=4)
words = list(gensim_model.wv.vocab)
print('Vocab Size: %d' % len(words))

Vocab Size: 32838


In [25]:
words[:10]

['interest',
 'often',
 'say',
 'thing',
 'without',
 'realli',
 'understand',
 'must',
 'power',
 'experi']

In [26]:
gensim_model.wv.most_similar('interest')

[('intrigu', 0.7789508104324341),
 ('amaz', 0.7689790725708008),
 ('love', 0.7665020227432251),
 ('wonder', 0.7622612714767456),
 ('fascin', 0.7580914497375488),
 ('surpris', 0.7451856136322021),
 ('excit', 0.7426307201385498),
 ('drudgeri', 0.7417196035385132),
 ('enjoy', 0.7308236360549927),
 ('great', 0.727332353591919)]

In [27]:
gensim_model['enjoy']

  """Entry point for launching an IPython kernel.


array([-0.03434306, -1.0740242 , -1.2602619 , -0.9687002 ,  1.0009254 ,
       -1.2584937 ,  1.1704097 , -0.2705046 ,  0.16458653, -1.5313052 ,
        0.6939286 ,  0.24645843, -0.04830007,  0.6405401 , -0.01228523,
       -0.03881168,  0.13040482, -0.00856589,  0.08288328, -0.14945918,
        1.4266909 ,  0.01216085, -0.18554226, -0.59417224,  0.30092084,
        0.5287132 , -0.7288978 , -0.3725744 , -0.20031245,  1.2804173 ,
       -0.3567039 , -0.7394505 ,  0.9295487 , -0.74761343,  0.32853973,
       -1.184182  , -0.99335784,  0.95719326, -0.80714023, -0.07597467,
        0.2040219 ,  0.31531575,  0.70652556,  1.4347022 ,  0.18501073,
        0.99081045,  0.03130284, -1.1582228 ,  0.20032392, -0.84947634,
        0.7981423 ,  0.9516411 ,  0.17432171,  0.70000494,  0.25374693,
        1.3696249 ,  0.7268027 ,  0.5897886 ,  0.43625543,  0.5602741 ,
       -2.3329325 ,  0.47140497,  0.06990305,  1.8073219 ,  0.20314911,
       -0.72885245, -0.43528402,  0.53740215,  1.0941894 , -0.35

## The parameters:

* min_count = int - Ignores all words with total absolute frequency lower than this - (2, 100)
* window = int - The maximum distance between the current and predicted word within a sentence. E.g. window words on the left and window words on the left of our target - (2, 10)
* size = int - Dimensionality of the feature vectors. - (50, 300)
* sample = float - The threshold for configuring which higher-frequency words are randomly downsampled. Highly influencial. - (0, 1e-5)
* alpha = float - The initial learning rate - (0.01, 0.05)
* min_alpha = float - Learning rate will linearly drop to min_alpha as training progresses. To set it: alpha - (min_alpha * epochs) ~ 0.00
* negative = int - If > 0, negative sampling will be used, the int for negative specifies how many "noise words" should be drown. If set to 0, no negative sampling is used. - (5, 20)
* workers = int - Use these many worker threads to train the model (=faster training with multicore machines)

In [28]:
filename = 'model.txt'
gensim_model.wv.save_word2vec_format(filename,binary=False)

In [29]:
embedding_index = {}

f = open(os.path.join('','model.txt'),encoding="utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embedding_index[word] = coefs
f.close

<function TextIOWrapper.close()>

In [30]:
tokenized_object = Tokenizer()
tokenized_object.fit_on_texts(text_lines)
seq = tokenized_object.texts_to_sequences(text_lines)

#pad sequence
word_index = tokenized_object.word_index
print('Unique token %d' % len(word_index))
text_pad = pad_sequences(seq,maxlen=268)
sentiment = forumPosts['Sentiments']
print(text_pad.shape)
print(sentiment.shape)

Unique token 32838
(29604, 268)
(29604,)


In [31]:
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words,100))

for word,i in word_index.items():
    if i > num_words:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


In [32]:
print(num_words)

32839


In [33]:
X_train_word2vec, X_test_word2vec, y_train_word2vec, y_test_word2vec = train_test_split(text_pad,sentiment, test_size=0.2,random_state=69) 

In [34]:
y_train_word2vec_dummy = pd.get_dummies(y_train_word2vec).values
y_test_word2vec_dummy = pd.get_dummies(y_test_word2vec).values

print(y_train_word2vec_dummy.shape)
print(y_test_word2vec_dummy.shape)

(23683, 3)
(5921, 3)


In [35]:
vocab_size=50000
maxlen = X_train_word2vec.shape[1]

embedding_size=64
model_w2v=Sequential()
model_w2v.add(Embedding(vocab_size, embedding_size, input_length=maxlen))
model_w2v.add(Dropout(0.25))
model_w2v.add(Conv1D(128, 5, activation='relu'))
model_w2v.add(GlobalMaxPooling1D())
model_w2v.add(Dense(10, activation='relu'))
model_w2v.add(Dropout(0.25))

model_w2v.add(Dense(3, activation='softmax'))


In [36]:
model_w2v.compile(loss='categorical_crossentropy', 
             optimizer='adam', 
              metrics=['accuracy'])
model_w2v.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 268, 64)           3200000   
_________________________________________________________________
dropout_3 (Dropout)          (None, 268, 64)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 264, 128)          41088     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 10)                1290      
_________________________________________________________________
dropout_4 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 3)                

In [37]:
history = model_w2v.fit(X_train_word2vec, y_train_word2vec_dummy,
                    epochs=10,
                    verbose=True,
                    validation_data=(X_test_word2vec, y_test_word2vec_dummy),
                    batch_size=512)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 23683 samples, validate on 5921 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [38]:
y_pred_word2vec = model_w2v.predict(X_test_word2vec)
y_test_word2vec_dummy = np.argmax(y_test_word2vec_dummy, axis=1)
y_pred_word2vec = np.argmax(y_pred_word2vec, axis=1)
print(classification_report(y_test_word2vec_dummy,y_pred_word2vec))

              precision    recall  f1-score   support

           0       0.48      0.23      0.31       354
           1       0.87      0.90      0.89      4662
           2       0.59      0.60      0.59       905

    accuracy                           0.82      5921
   macro avg       0.65      0.58      0.60      5921
weighted avg       0.81      0.82      0.81      5921



In [39]:
pred_prob_word2vec = model_w2v.predict_proba(X_test_word2vec)
auc_score_w2v = roc_auc_score(y_test_word2vec_dummy, pred_prob_word2vec, multi_class="ovo",
                                  average="macro")
print('ROC-AUC = %.2f'% auc_score_w2v)

ROC-AUC = 0.83


# From the above 2 models we can see that the model built using TFIDF(acc=0.79, roc-auc= 0.50) has accuracy and roc-auc values lower than the accuracy and roc-auc value of the model built with Word2Vec(acc=0.82, roc-auc= 0.83)