In [1]:
### Using GingerIt

In [2]:
import numpy as np
import pandas as pd

In [3]:
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import sent_tokenize
from nltk.stem import LancasterStemmer, PorterStemmer
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [5]:
import re

In [6]:
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
import plotly
import plotly.tools as tls   
import plotly.graph_objs as go

In [8]:
from gingerit.gingerit import GingerIt


In [9]:
import spacy
from spacy import displacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

In [10]:
import os
import sys

# Directory expected to hold the Hackathon3GeneralFunctions module that the
# notebook imports right after this cell. The original absolute path is kept
# as the default so the existing setup keeps working unchanged; on any other
# machine set the HACK3_GEN_FUNCTIONS_DIR environment variable instead of
# editing the notebook.
GEN_FUNCTIONS_DIR = os.environ.get(
    "HACK3_GEN_FUNCTIONS_DIR",
    "/Users/Aniket/MyStuff/Study/GreyAtom/Hackathon#3/Hack3_gen_functions",
)

# Guard the append so re-running this cell does not stack duplicate entries
# on sys.path.
if GEN_FUNCTIONS_DIR not in sys.path:
    sys.path.append(GEN_FUNCTIONS_DIR)


In [11]:
import Hackathon3GeneralFunctions as genf

In [12]:
def map_iphone4(text):
    """Collapse "iphone 4" (any case, one or more spaces) into 'iPhone4'.

    Only the "iphone<spaces>4" part is rewritten, so any suffix that follows
    the digit (e.g. the "s" in "iphone 4s") is left attached to the result.
    An already-joined "iphone4" does not match and is returned unchanged.
    """
    spaced_iphone4 = re.compile(r"iphone[ ]+4", flags=re.I)
    return spaced_iphone4.sub('iPhone4', text)

In [13]:
def map_ipads(text):
    """Rewrite every "ipads" (matched case-insensitively) as lowercase 'ipad'."""
    #1
    plural_ipad = re.compile(r"ipads", flags=re.I)
    return plural_ipad.sub('ipad', text)

In [14]:
def map_ipad2(text):
    """Collapse "ipad 2" / "ipad 2s" (any case, one or more spaces) into 'ipad2'.

    An optional trailing "s" directly after the digit is swallowed by the
    match. Text where "ipad" and "2" are already joined does not match and is
    returned unchanged.
    """
    #2
    spaced_ipad2 = re.compile(r"ipad[ ]+2[s]?", flags=re.I)
    return spaced_ipad2.sub('ipad2', text)

In [15]:
def map_brand(text):
    """Normalise the casing of brand words to 'iPhone', 'iPad' and 'Apple'.

    Each brand is matched case-insensitively anywhere in the text, including
    inside longer words, and replaced with its canonical spelling. The
    substitutions run in the order listed below.
    """
    #EDA
    canonical_spellings = (
        (r"iphone", 'iPhone'),
        (r"ipad", 'iPad'),
        (r"apple", 'Apple'),
    )
    result = text
    for pattern, brand in canonical_spellings:
        result = re.sub(pattern, brand, result, flags=re.I)
    return result

In [16]:
def replace_amp(text):
    """Turn each HTML-escaped ampersand ("&amp;", any case) into a plain '&'.

    The text is scanned once, so a doubly escaped "&amp;amp;" comes back as
    "&amp;" rather than "&".
    """
    escaped_amp = re.compile(r"&amp;", flags=re.I)
    return escaped_amp.sub('&', text)

In [17]:
def remove_amp(text):
    """Drop free-standing ampersands (" & ") while keeping the words apart.

    Only an ampersand with a space on both sides is removed, so tokens such
    as "AT&T" are left intact. The match is replaced with a single space
    rather than an empty string: replacing with '' would glue the
    neighbouring words together ("cats & dogs" -> "catsdogs") and corrupt
    the tokens seen by later cleaning steps.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The text with every " & " replaced by a single space.
    """
    pat_amp_single = r" & "
    # The pattern contains no letters, so no case-insensitive flag is needed.
    return re.sub(pat_amp_single, ' ', text)
    

In [18]:
def remove_amp_op(text):
    """Drop ampersands that touch at least one space, keeping words apart.

    Matches " & ", "& " and " &" (tried in that order). An ampersand with no
    adjacent space, as in "AT&T", is left alone. Each match is replaced with
    a single space rather than an empty string: replacing with '' would glue
    the neighbouring words together ("cats & dogs" -> "catsdogs").

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The text with every matched ampersand replaced by a single space.
    """
    pat_amp_single = r" & |& | &"
    # The pattern contains no letters, so no case-insensitive flag is needed.
    return re.sub(pat_amp_single, ' ', text)

In [19]:
def lemma(nlp, text):
    """Return the lemmas of `text`, joined by single spaces.

    `nlp` is called on `text`; the result is iterated and the `lemma_`
    attribute of every item is collected in order.
    """
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

In [20]:
def lemmatize_text(text):
    """Lemmatize `text` with spaCy's 'en_core_web_sm' pipeline.

    The pipeline is loaded with the 'parser' and 'ner' components disabled.
    Loading a spaCy model is expensive, so the pipeline is loaded on the
    first call only and cached on the function object; later calls (for
    example when this function is mapped over a whole column) reuse it
    instead of reloading the model every time.

    Parameters
    ----------
    text : str
        Text to lemmatize.

    Returns
    -------
    str
        The token lemmas joined by single spaces, as produced by `lemma`.
    """
    lnlp = getattr(lemmatize_text, "_cached_nlp", None)
    if lnlp is None:
        lnlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
        # Re-running the defining cell creates a new function object and
        # therefore starts with an empty cache again.
        lemmatize_text._cached_nlp = lnlp
    return lemma(lnlp, text)

In [21]:
def remove_html_syn(text):
    """Blank out HTML-escaped tags such as "&lt;title&gt;" or "&lt;/title&gt;".

    A tag is "&lt;", then one or more letters or "/" characters, then "&gt;"
    (matched case-insensitively). Every tag is replaced by a single space; a
    bare "&lt;" that is not part of such a tag is left untouched.
    """
    escaped_tag = re.compile(r"&lt;[/a-z]+&gt;", flags=re.I)
    return escaped_tag.sub(' ', text)

In [22]:
def replace_text_emoji(text):
    """Replace text emoticons with a space-padded word describing them.

    Handled emoticons (with an optional "-" nose where shown):
    "&lt;3" -> ' love ', ":)" -> ' smiley ', ":(" -> ' sad ',
    ":p" / ":P" -> ' playful ', ";)" -> ' wink ', ":|" -> ' straightface '.
    Repeated final characters (e.g. ":)))" or "&lt;333") count as one emoticon.
    """
    # (pattern, replacement, regex flags), applied strictly in this order.
    # Only the playful face is matched case-insensitively.
    emoticon_table = (
        (r"&lt;3+", ' love ', 0),
        (r":[-]?[\)]+", ' smiley ', 0),
        (r":[-]?[\(]+", ' sad ', 0),
        (r":[-]?[p]+", ' playful ', re.I),
        (r";[-]?[\)]+", ' wink ', 0),
        (r":[-]?[\|]+", ' straightface ', 0),
    )

    #replace text emojis
    converted = text
    for pattern, word, flags in emoticon_table:
        converted = re.sub(pattern, word, converted, flags=flags)
    return converted

In [23]:
def refine_before_cleaning(text):
    """Run the pre-cleaning normalisation steps over `text`, in a fixed order.

    Order: join "iphone 4", singularise "ipads", join "ipad 2", blank out
    escaped HTML tags, unescape "&amp;", remove free-standing ampersands,
    then turn text emoticons into words. Each step receives the output of
    the previous one.
    """
    ordered_steps = (
        map_iphone4,
        map_ipads,
        map_ipad2,
        remove_html_syn,
        replace_amp,
        remove_amp,
        replace_text_emoji,
    )
    refined = text
    for step in ordered_steps:
        refined = step(refined)
    return refined

In [24]:
# parser = GingerIt()
# line= "I loooveee Apple's new iPad ipad 2 with AT&T/GSM&D deal #ipad2"
# tweet=parser.parse(line)
# tweet

In [25]:
# Demo of spaCy lemmatization on a single sentence.
# Load the 'en_core_web_sm' pipeline with the 'parser' and 'ner' components
# disabled; this cell only reads token lemmas.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

sentence = "The striped bats are hanging on their feet for best"

# Run the loaded pipeline `nlp` over the sentence
doc = nlp(sentence)

# Take the lemma of every token and join them with single spaces; as the last
# expression of the cell, the joined string is what the notebook displays.
" ".join([token.lemma_ for token in doc])
#> 'the stripe bat be hang on -PRON- foot for good'

'the stripe bat be hang on -PRON- foot for good'

In [26]:
# Check lemmatize_text on the demo sentence extended with a hashtag ("#apple")
# and a capitalised brand name ("Apple").
lemmatize_text("The striped bats are hanging on their feet for best #apple Apple")

'the stripe bat be hang on -PRON- foot for good # apple Apple'

In [27]:
# Check remove_html_syn on an escaped opening and closing tag pair; each tag
# is expected to be replaced by a single space.
remove_html_syn("&lt;title&gt;This is a title&lt;/title&gt;")

' This is a title '

In [28]:
# Check the whole pre-cleaning chain on one string containing a heart emoticon,
# "ipad 2", "iphone 4s", a bare "&lt;", an escaped HTML tag and a hashtag.
refine_before_cleaning("&lt;3 ipad 2 iphone 4s &lt; &lt;title&gt; #ipad2")

' love  ipad2 iPhone4s &lt;   #ipad2'

In [29]:
# Read the training set as UTF-8. The path is relative, so it is resolved
# against the notebook's current working directory.
train = pd.read_csv('data/train.csv', encoding='utf-8')
# Work on a copy so the columns added below do not end up on `train`.
data_gin = train.copy()

In [30]:
# data.loc[0:10,'tweet'].map(refine_before_cleaning)

In [31]:
# Cast the tweet column to str, presumably so the regex helpers never receive
# a non-string value.
# NOTE(review): astype(str) is expected to turn missing values into the literal
# text 'nan' — confirm whether the data has missing tweets and whether that
# token is acceptable downstream.
data_gin['tweet'] = data_gin['tweet'].astype(str)

In [32]:
# Apply the pre-cleaning chain (refine_before_cleaning) to every tweet and keep
# the result in a new column, leaving the 'tweet' column as it was.
data_gin['tweet_cleaned'] = data_gin['tweet'].map(refine_before_cleaning)
# Preview the first rows of the new column.
data_gin['tweet_cleaned'].head()

0    #sxswnui #sxsw #apple defining language of tou...
1    Learning ab Google doodles! All doodles should...
2    one of the most in-your-face ex. of stealing t...
3    This iPhone #SXSW app would b pretty awesome i...
4    Line outside the Apple store in Austin waiting...
Name: tweet_cleaned, dtype: object

In [33]:
# Run genf.clean_text (from Hackathon3GeneralFunctions, not shown in this
# notebook) on every pre-cleaned tweet with lower=False.
# The column is overwritten with its own transformed values, so re-running this
# cell applies clean_text to already-cleaned text.
data_gin['tweet_cleaned'] = data_gin['tweet_cleaned'].map(lambda x: genf.clean_text(x,lower=False))

In [34]:
# Preview the first rows of the column after genf.clean_text has been applied.
data_gin['tweet_cleaned'].head()

0     sxswnui  sxsw  apple defining language of tou...
1    Learning ab Google doodles  All doodles should...
2    one of the most in your face ex  of stealing t...
3    This iPhone  SXSW app would b pretty awesome i...
4    Line outside the Apple store in Austin waiting...
Name: tweet_cleaned, dtype: object

In [35]:
# Remove any remaining ampersands that touch a space (remove_amp_op).
# The column is overwritten with its own transformed values.
data_gin['tweet_cleaned'] = data_gin['tweet_cleaned'].map(remove_amp_op)

In [None]:
# Create one GingerIt parser and reuse it for every row.
parser = GingerIt()
# Single-sentence example kept for manual experiments:
# line= "I loooveee Apple's new iPad ipad 2 with AT&T/GSM&D deal #ipad2"
# tweet=parser.parse(line)

# Call parser.parse once per cleaned tweet and keep the 'result' entry of each
# returned mapping in a new column.
# NOTE(review): this cell has no recorded execution count; parser.parse
# presumably contacts an external service on every call — confirm, since one
# failing row would abort the whole map and nothing would be stored.
data_gin['tweet_cleaned_gin'] = data_gin['tweet_cleaned'].map(lambda x: parser.parse(x)['result'])