# Main Code


In [1]:
import os
import re

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from gensim.models import KeyedVectors,Word2Vec
from nltk.stem.snowball import SnowballStemmer

from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
# Shared NLP resources used by the text pre-processing helpers.
STOPWORDS = stopwords.words("english")
STEMMER = SnowballStemmer("english")

# Location of the raw tweet CSV; the file is read without a header row, so
# column names are supplied explicitly.
DATA_DIR_PATH = 'data'
DATA_FILE_PATH = os.path.join(
    DATA_DIR_PATH,
    'training.1600000.processed.noemoticon.csv',
)
# NOTE(review): the third column is labelled "data" — possibly meant to be
# "date"; confirm before renaming, other cells may rely on the current name.
DATA = pd.read_csv(
    DATA_FILE_PATH,
    encoding="ISO-8859-1",
    names=["target", "ids", "data", "flag", "user", "text"],
)

# Pre-trained word vectors stored in binary word2vec format.
WORD2VEC_PATH = os.path.join(
    DATA_DIR_PATH,
    'GoogleNews-vectors-negative300.bin.gz',
)
word2vec = KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

In [3]:
# Maps the numeric codes of the "target" column to short label names.
target_encoding = {
    0: "neg",
    2: "neu",
    4: "pos",
}

# Unprocessed inputs (tweet text) and labels (target codes) taken from DATA.
x_raw = DATA["text"]
y_raw = DATA["target"]

In [4]:
# Preview the first rows of the raw label column.
y_raw.head()

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64

In [None]:
# Preview the first rows of the raw tweet text.
x_raw.head()

0    @switchfoot http://twitpic.com/2y1zl - Awww, t...
1    is upset that he can't update his Facebook by ...
2    @Kenichan I dived many times for the ball. Man...
3      my whole body feels itchy and like its on fire 
4    @nationwideclass no, it's not behaving at all....
Name: text, dtype: object

In [None]:
def cleaning_sentence(text):
    """Normalise a raw tweet to lowercase ASCII letters/digits separated by spaces.

    The input is converted with ``str()`` and lowercased, then:

    1. ``@mentions`` and ``http:``/``https:`` URLs are removed entirely;
    2. every remaining run of characters other than ASCII letters and digits
       is collapsed to a single space;
    3. leading/trailing whitespace is stripped.

    Parameters
    ----------
    text : object
        Raw text; any non-string value is stringified first.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    lowered = str(text).lower()

    # Mentions and URLs must be stripped in their own pass. When they share a
    # single alternation with the "non-alphanumeric" class, that class greedily
    # consumes the '@' of any mention preceded by a space or punctuation, and
    # the user name is left behind in the output.
    without_refs = re.sub(r'@\S+|https?:\S+', ' ', lowered)

    # Everything that is not an ASCII letter or digit becomes a separator.
    return re.sub(r'[^A-Za-z0-9]+', ' ', without_refs).strip()


def removing_stop_words(text):
    """Remove stop words from a whitespace-separated string.

    Parameters
    ----------
    text : str
        Text whose tokens are separated by whitespace.

    Returns
    -------
    str
        The tokens of ``text`` that are not in ``STOPWORDS``, in their
        original order, joined by single spaces.
    """
    # Build a set once per call so each token costs a hash lookup instead of a
    # linear scan over the whole stop-word collection.
    stop_words = set(STOPWORDS)
    return ' '.join(word for word in text.split() if word not in stop_words)


def stemming_words(text):
    """Stem every whitespace-separated token of ``text`` with ``STEMMER``.

    Returns the stemmed tokens, in order, joined by single spaces.
    """
    return ' '.join(STEMMER.stem(token) for token in text.split())


def text_pre_process(text):
    """Run the full pre-processing pipeline on one piece of text.

    Stages, applied in order: ``cleaning_sentence`` -> ``removing_stop_words``
    -> ``stemming_words``. Returns the output of the last stage.
    """
    result = text
    for stage in (cleaning_sentence, removing_stop_words, stemming_words):
        result = stage(result)
    return result


def clean(x):
    """Apply ``text_pre_process`` to each element of ``x`` via ``x.apply``.

    Returns whatever ``x.apply`` produces (one processed value per element).
    """
    return x.apply(text_pre_process)

In [None]:
# Run the pre-processing pipeline (clean -> stop words -> stem) on every raw tweet.
X_MAIN = clean(x_raw)

In [None]:
# Labels are used as-is: Y_MAIN is just another name for y_raw (no copy, no re-encoding).
Y_MAIN = y_raw

In [None]:
# Preview the first rows of the pre-processed text.
X_MAIN.head()

0         awww bummer shoulda got david carr third day
1    upset updat facebook text might cri result sch...
2      dive mani time ball manag save 50 rest go bound
3                      whole bodi feel itchi like fire
4                                        behav mad see
Name: text, dtype: object

In [None]:
# Preview the first rows of the labels.
Y_MAIN.head()

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64

In [None]:
%%time
documents = [_text.split() for _text in X_MAIN]
x_train, x_test, y_train, y_test = train_test_split(X_MAIN, Y_MAIN, test_size = 0.3, random_state = 42)

CPU times: user 1.17 s, sys: 164 ms, total: 1.33 s
Wall time: 1.33 s


In [None]:
# Every encoded sequence is padded/truncated to this many positions.
len_of_seq = 300

# The vocabulary is learned from the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)

# Encode both splits as padded integer sequences.
# Note: this rebinds x_train / x_test, so re-running the cell without first
# re-running the split would hand the tokenizer the encoded arrays, not text.
x_train, x_test = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=len_of_seq)
    for texts in (x_train, x_test)
)

In [None]:
# Reshape the training labels into a single column, shape (n, 1).
# np.asarray accepts both the original Series and an already-reshaped array,
# so the cell can be re-run safely (``.values`` would fail on an ndarray).
y_train = np.asarray(y_train).reshape(-1, 1)
y_train.shape

In [None]:
# Reshape the test labels into a single column, shape (n, 1).
# np.asarray accepts both the original Series and an already-reshaped array,
# so the cell can be re-run safely (``.values`` would fail on an ndarray).
y_test = np.asarray(y_test).reshape(-1, 1)
y_test.shape

In [None]:
# Show the first ten training labels.
y_train[:10]

In [None]:
# Show the first ten test labels.
y_test[:10]