In [169]:
import pandas as pd
import numpy as np
import re
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from string import punctuation
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import nltk
import warnings
warnings.filterwarnings("ignore")

In [170]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [171]:
train.head()

Unnamed: 0,tweet_id,tweet,sentiment
0,1701,#sxswnui #sxsw #apple defining language of tou...,1
1,1851,Learning ab Google doodles! All doodles should...,1
2,2689,one of the most in-your-face ex. of stealing t...,2
3,4525,This iPhone #SXSW app would b pretty awesome i...,0
4,3604,Line outside the Apple store in Austin waiting...,1


In [172]:
test.head()

Unnamed: 0,tweet_id,tweet
0,7506,Audience Q: What prototyping tools do you use?...
1,7992,At SXSW? Send Your Best Photos &amp; Videos to...
2,247,@mention and here's a pic of you winning your...
3,7688,Google Marissa Mayer: mobile phone as a cursor...
4,3294,#SXSW Google maps is even cooler than I thought


In [173]:
def remove_pattern(input_txt, pattern):
#     print(input_txt)
#     print(pattern)
    r = re.findall(pattern, input_txt)
    for x in r:
        input_txt = re.sub(x, '', input_txt)
    return input_txt

def data_cleaning(table):
    #Put everything in lower case
    table['tweet'] = table['tweet'].str.lower()
    #Replace @<some_user>
    table['tweet'] = table['tweet'].apply(lambda row:remove_pattern(row, "@[\w]*"))
    #replace '#'
    table['tweet'] = table['tweet'].apply(lambda row:remove_pattern(row, "#[\w]*"))
    #Replace RT:rt
    table['tweet'] = table['tweet'].apply(lambda row:remove_pattern(row, "rt"))
    #Remove URL's
    table['tweet'] = table['tweet'].apply(lambda row:remove_pattern(row, "r'^https?:\/\/.*[\r\n]*'"))
    #Remove Numbers
    table['tweet'] = table['tweet'].apply(lambda row:remove_pattern(row, "r'[0-9]+'"))
    #Remove Special Characters & Punctuantions
    table['tweet'] = table['tweet'].apply(lambda row:remove_pattern(row, "r'[!$%&()*+,-./:;<=>?@[\]^_`{|}~]'"))
    table['tweet'] = table['tweet'].apply(lambda row:remove_pattern(row, "r'#([^\s]+)', r'\1'"))
    return table


In [174]:
import re
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import (
    wordnet,
    stopwords
)

[nltk_data] Downloading package wordnet to C:\Users\Savio
[nltk_data]     Coelho\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Savio
[nltk_data]     Coelho\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Savio
[nltk_data]     Coelho\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Savio Coelho\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [175]:
#remove Null rows if any
def remove_na(table):
    table = table.dropna()
    return table

In [176]:
#Tokenize the data
def data_tokenize(table):
    tokenizer = RegexpTokenizer(r'\w+')
    table['tweet_tokenize'] = table['tweet'].apply(lambda row:tokenizer.tokenize(row))
    return table

In [177]:
from nltk.corpus import stopwords
#Removing Stop Words
def remove_stop_words(table):
    en_stop = set(stopwords.words('english'))
    table['tweet_tokenize'] = table['tweet_tokenize'].apply(lambda x:[word for word in x if not word in en_stop] )
    return table

In [178]:
#Lemmatizing the words
def lemmatize_words(table):
    lemma = WordNetLemmatizer()
    table['tweet_tokenize'] = table['tweet_tokenize'].apply(lambda x: [lemma.lemmatize(word) for word in x])
    return table

In [179]:
def in_dict(word):
    if wordnet.synsets(word):
        #if the word is in the dictionary, we'll return True
        return True

def replace_elongated_word(word):
    regex = r'(\w*)(\w+)\2(\w*)'
    repl = r'\1\2\3'    
    if in_dict(word):
        return word
    new_word = re.sub(regex, repl, word)
    if new_word != word:
        return replace_elongated_word(new_word)
    else:
        return new_word

def detect_elongated_words(row):
    regexrep = r'(\w*)(\w+)(\2)(\w*)'
    words = [''.join(i) for i in re.findall(regexrep, row)]
    for word in words:
        if not in_dict(word):
            row = re.sub(word, replace_elongated_word(word), row)
    return row

In [181]:
train = remove_na(train)
train = data_cleaning(train)
train['tweet'] = train['tweet'].apply(lambda x: detect_elongated_words(x))

In [182]:
train = data_tokenize(train)
train = remove_stop_words(train)
train = lemmatize_words(train)

In [183]:
train.head()

Unnamed: 0,tweet_id,tweet,sentiment,tweet_tokenize
0,1701,defining language of touch with different d...,1,"[defining, language, touch, different, dialect..."
1,1851,learning ab google doodles! all doodles should...,1,"[learning, ab, google, doodle, doodle, light, ..."
2,2689,one of the most in-your-face ex. of stealing t...,2,"[one, face, ex, stealing, show, yr, quot, appl..."
3,4525,this iphone ap would b pretty awesome if it d...,0,"[iphone, ap, would, b, pretty, awesome, crash,..."
4,3604,line outside the apple store in austin waiting...,1,"[line, outside, apple, store, austin, waiting,..."
