# Text Preprocessing

In [1]:
import pandas as pd

In [3]:
# Read the training set; the CSV is decoded as ISO-8859-1 rather than the
# pandas default encoding.
train = pd.read_csv(
    'train.csv',
    encoding='iso-8859-1',
)

In [8]:
# Drop the 'ItemID' column before inspecting the data.
# `train` is reassigned here, so without errors='ignore' a second run of this
# cell (after the column is already gone) would raise a KeyError.
train = train.drop(columns='ItemID', errors='ignore')
train.head()

Unnamed: 0,Sentiment,SentimentText
0,0,is so sad for my APL frie...
1,0,I missed the New Moon trail...
2,1,omg its already 7:30 :O
3,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,0,i think mi bf is cheating on me!!! ...


# Data Preparation

### HTML Decoding

In [15]:
from bs4 import BeautifulSoup

# Parse one tweet with the lxml parser, then print the raw text followed by
# the parsed text so the decoded HTML entities can be compared line by line.
example1 = BeautifulSoup(train.SentimentText[100], 'lxml')
print(train.SentimentText[100], example1.get_text(), sep='\n')

  no pavel tonight &lt;Tigersfan &gt;
no pavel tonight <Tigersfan >


### Remove '@' mentions

In [16]:
# Sample tweet that contains an '@' mention.
train['SentimentText'][200]

" @georgediaz #Magic ..thinking less than 50 % chance Hedo stays in Orlando. He's gonna go for the $$. They all do. Can't blame him though."

In [19]:
import re

# Strip '@' mentions from the tweet.
# The underscore is part of the character class because Twitter handles may
# contain it; without it '@some_user' would be cut at '@some' and leave
# '_user' behind in the text.
re.sub(r'@[A-Za-z0-9_]+', '', train.SentimentText[200])

"  #Magic ..thinking less than 50 % chance Hedo stays in Orlando. He's gonna go for the $$. They all do. Can't blame him though."

### Remove URL links

In [37]:
# Sample tweet that ends with a URL.
train['SentimentText'][16]

"    awhhe man.... I'm completely useless rt now. Funny, all I can do is twitter. http://myloc.me/27HX"

In [38]:
# Strip http/https URLs from the tweet.
# Everything up to the next whitespace is consumed, so URLs containing
# characters such as '-', '_', '?', '=', '%' or '#' are removed completely
# instead of being cut short and leaving a fragment in the text.
# Note: punctuation glued to the end of a URL is removed together with it.
re.sub(r'https?://\S+', '', train.SentimentText[16])

"    awhhe man.... I'm completely useless rt now. Funny, all I can do is twitter. "

### Remove hashtags / numbers / punctuation

In [46]:
# Sample tweet that contains a hashtag, a digit and punctuation.
train['SentimentText'][132]

" #asylm J2 panel is over. Guess it's back to normal life."

In [47]:
# Replace every character that is not an ASCII letter with a single space
# (this drops '#', digits, punctuation and apostrophes alike).
non_letters = re.compile("[^a-zA-Z]")
non_letters.sub(" ", train.SentimentText[132])

'  asylm J  panel is over  Guess it s back to normal life '

### Remove stopwords

In [49]:
from nltk import word_tokenize
stop = set(stopwords.words('english'))
sentence = "this is a foo bar sentence"
print([i for i in sentence.lower().split() if i not in stop])
print([i for i in word_tokenize(sentence.lower()) if i not in stop])

['foo', 'bar', 'sentence']
['foo', 'bar', 'sentence']


### Stemming and Lemmatization

In [50]:
sent = "cats running ran cactus cactuses cacti community communities"
from nltk.stem import PorterStemmer, WordNetLemmatizer
port = PorterStemmer()
wnl = WordNetLemmatizer()
print(" ".join([port.stem(i) for i in sent.split()]))
print(" ".join([wnl.lemmatize(i) for i in sent.split()]))

cat run ran cactu cactus cacti commun commun
cat running ran cactus cactus cactus community community


### Spelling Correction

In [68]:
from textblob import TextBlob
sent = "I fastings in schoole"
str(TextBlob(sent).correct())

'I fasting in school'

# Feature Extraction

### TF-IDF

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level unigram TF-IDF, lowercased, English stop words removed,
# vocabulary capped at 1000 features.
tfidf = TfidfVectorizer(
    max_features=1000,
    lowercase=True,
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 1),
)

# NOTE(review): fitted on the SentimentText column as loaded; the cleaning
# steps demonstrated in the cells above are not written back to `train` in the
# visible notebook.
train_vect = tfidf.fit_transform(train.SentimentText)

### CountVectorizer

In [66]:
from sklearn.feature_extraction.text import CountVectorizer

# Word-level unigram bag-of-words counts, lowercased, vocabulary capped at
# 1000 features. No stop-word list is passed here.
bow = CountVectorizer(
    max_features=1000,
    lowercase=True,
    ngram_range=(1, 1),
    analyzer="word",
)
train_bow = bow.fit_transform(train.SentimentText)

### Word2Vec (pre-trained GloVe vectors)

In [70]:
from gensim.scripts.glove2word2vec import glove2word2vec

# Convert the GloVe text file into a word2vec-format text file on disk.
# The call's return value is shown as the cell output.
# NOTE(review): newer gensim releases may be able to load GloVe files directly
# (load_word2vec_format with no_header=True) -- confirm against the installed
# version before relying on this conversion step.
glove_input_file = 'glove.twitter.27B.100d.txt'
word2vec_output_file = 'glove.twitter.27B.100d.txt.word2vec'
glove2word2vec(
    glove_input_file=glove_input_file,
    word2vec_output_file=word2vec_output_file,
)

(1193514, 100)

In [72]:
from gensim.models import KeyedVectors

# Load the Stanford GloVe model from the converted word2vec file.
# binary=False is passed because that file is read as text.
filename = 'glove.twitter.27B.100d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(fname=filename, binary=False)

In [74]:
# Look up the embedding stored for the token 'wait'; the cell displays it as
# an array of floats.
model['wait']

array([-3.9898e-02,  9.0736e-01,  3.2747e-01,  1.9906e-01, -4.2242e-01,
       -1.1457e-01,  1.0035e+00,  3.0591e-01,  7.1547e-02,  4.0001e-01,
       -9.6573e-02, -2.1317e-01, -4.3520e+00,  3.3282e-01, -4.0430e-01,
       -3.3951e-01,  4.8600e-03,  5.2763e-01, -3.2438e-02,  1.4857e-01,
       -3.8666e-01,  5.0101e-02,  6.3554e-02,  6.7353e-02,  1.5601e-01,
       -1.9567e-01,  1.1034e-01, -4.4742e-01,  1.7990e-01, -6.2470e-01,
       -2.4815e-01,  3.3326e-01, -2.4696e-01,  1.3638e-01, -2.5248e-01,
        5.6508e-01,  2.7245e-01, -4.3211e-03,  5.3012e-01,  4.3394e-01,
       -1.1828e+00, -2.4079e-01,  7.1431e-01, -1.0715e-03,  2.4606e-01,
        3.0575e-01,  1.7527e-01, -7.6920e-01, -1.4829e-01,  7.6699e-01,
       -5.2685e-01,  2.5166e-01,  6.1851e-01,  3.2307e-01,  1.7608e-01,
       -4.4362e-01,  3.4042e-01,  3.7783e-01,  1.8740e-01,  1.8865e-01,
        7.2610e-01,  7.3078e-02,  4.9100e-01, -4.6518e-01,  1.2580e-01,
       -1.5894e-01, -3.1325e-01, -9.4356e-02, -2.1148e-01,  1.29