# Pre-processing in Natural Language Processing projects

In [1]:
import re
import nltk
import string
import contractions

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.classify import NaiveBayesClassifier

from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence


Using TensorFlow backend.


We start by loading the dataset and observing the format of the posts and the type. 

In [2]:
data = pd.read_csv("mbti_1.csv")
types = np.unique(np.array(data['type']))
data.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


We can now look at one sample in greater detail. Let's look at the case for user 4 and their post history in greater detail. 

In [3]:
data.posts[3]

"'Dear INTP,   I enjoyed our conversation the other day.  Esoteric gabbing about the nature of the universe and the idea that every rule and social code being arbitrary constructs created...|||Dear ENTJ sub,   Long time no see.  Sincerely, Alpha|||None of them. All other types hurt in deep existential ways that I want no part of.|||Probably a sliding scale that depends on individual preferences, like everything in humanity.|||Draco Malfoy also. I'd say he's either 358 or 368.|||I'm either 358 or 385, though in which stacking to me is a somewhat arbitrary distinction to make as I believe that the core indicates primary motivation and has a hand in every action. Therefore, a...|||I'm not particularly introverted or extraverted, personally. That said, I would say I'm somewhat unphased by either social interactions or being alone. What I'd say I crave more so than anything is...|||Dear Type 9 INFP,  Your absolute admiration of me is refreshing. You're a great girlfriend and I wish we both 

# Data Cleaning

We first begin to clean the data by removing the stop words from the data along with any information about the links that the users post. In a practical application, instead of using separate Lambda functions for each task, it would be more convenient to use one loop to perform all these tasks together. However, For this demonstration, we perform each task separately to demonstrate the results after every iteration. 

In [4]:
#We start off by removing all the hyperlinks 
data['posts'] = data['posts'].str.replace('http\S+|www.\S+', '', case=False)

In [5]:
#We then remove all the symbols that we don't need and which won't contribute to anything meaningful. However, we keep the ' to fix the contractions
data['posts'] = data['posts'].apply(lambda x: ' '.join([re.sub(r"[^A-Za-z0-9(),!?\'\`\.]", "", word) for word in x.split()]))

In [6]:
#Removing word contractions 
data['posts']= data['posts'].apply(lambda x: ' '.join([contractions.fix(word) for word in x.split()]))
data.posts[3]

" wouldear INTP, I enjoyed our conversation the other day. Esoteric gabbing about the nature of the universe and the idea that every rule and social code being arbitrary constructs created...Dear ENTJ sub, Long time no see. Sincerely, AlphaNone of them. All other types hurt in deep existential ways that I want no part of.Probably a sliding scale that depends on individual preferences, like everything in humanity.Draco Malfoy also. I would say he is either 358 or 368.I am either 358 or 385, though in which stacking to me is a somewhat arbitrary distinction to make as I believe that the core indicates primary motivation and has a hand in every action. Therefore, a...I am not particularly introverted or extraverted, personally. That said, I would say I am somewhat unphased by either social interactions or being alone. What I would say I crave more so than anything is...Dear Type 9 INFP, Your absolute admiration of me is refreshing. you are a great girlfriend and I wish we both did not hav

In [7]:
#Changing all the words to lower-case 
data['posts'] = data['posts'].apply(lambda x: ' '.join([word.lower() for word in x.split()]))

In [8]:
#Removing all the stop-words (As described in the Readme)
stop = stopwords.words('english')
data['posts'] = data['posts'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [9]:
#Changing all the words into the root-word
lemmatizer = nltk.stem.WordNetLemmatizer()
data['posts'] = data['posts'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word, 'v') for word in x.split()]))
data.posts[3] 

"wouldear intp, enjoy conversation day. esoteric gabbing nature universe idea every rule social code arbitrary construct created...dear entj sub, long time see. sincerely, alphanone them. type hurt deep existential ways want part of.probably slide scale depend individual preferences, like everything humanity.draco malfoy also. would say either 358 368.i either 358 385, though stack somewhat arbitrary distinction make believe core indicate primary motivation hand every action. therefore, a...i particularly introvert extraverted, personally. said, would say somewhat unphased either social interactions alone. would say crave anything is...dear type 9 infp, absolute admiration refreshing. great girlfriend wish busy schedule could around one another often. keep...2 still mean 150 people. probably see 12 others today. never understand fascination virtue rarity.so, esfj train also, right?i toy idea op extrovert also awhile now, actually. many conversations him, however disincline believe due 

In [10]:
data.to_csv('preprocessed.csv')

# Data Preprocessing

This step involves tokenizing the words and padding 

In [11]:
df = pd.read_csv("preprocessed.csv")

In [19]:
Tokenize = df['posts']
Tokenize = Tokenize.str.replace('\d+','')


In [14]:
max_features = 3000
max_review_length = 500

tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(Tokenize.values)
x_train = tokenizer.texts_to_sequences(Tokenize.values)
X_train = sequence.pad_sequences(x_train, maxlen=max_review_length)

In [17]:
X_train[3]

array([ 583, 1091,  173,  133,  672,  217, 1832, 2742,  500,  247, 2091,
         99,   13,   16, 2015, 2659,  959,   97,   18,  385,  473,  476,
         15,  148,  165,   82, 1638,  286,  926, 2044,    1,  141, 1678,
         29,    3,   12,  191,  191,   46, 2016,  741,   11,  121, 1191,
       2045, 1521, 1421,  369,  133,  616, 1161,  123,  755,  266, 1280,
        310,  339,    3,   12,  741,  191,  217, 1833,  261,    3,   12,
       2097,   93,   87,  500,   18,   59, 1564,  132,  907,  256, 1065,
       1751,   33,   84,    7,  163,  147,  124,   66,   45,    5,   82,
         16,  119,  283,   42,   73,  143,  336, 1046,   29,   55, 2479,
        173,  700, 1055,   29, 1486,  140,   52,   71,  831,  275,  210,
        121,  550,  700,   20,  213,  204,   66,  336,  839, 2713,  336,
        201,  412,   78, 1271,  433,  462, 1056,  316,   10,  935,  543,
       1861,    1, 1928,  194, 1129,   49,   20, 1391,   57,  120,   75,
          4,  324,   86,  537,   85, 2430, 1296,   