In [1]:
import numpy as np
import pandas as pd
import string
import xgboost as xgb
import io
import nltk
import matplotlib
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
nltk.download('stopwords')
stopwords = stopwords.words('english')
stemmer = SnowballStemmer('english')

from textblob import TextBlob
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\franc\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\franc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\franc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\franc\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:

sia = SentimentIntensityAnalyzer()
def return_sia_compound_values(text):
    return sia.polarity_scores(text)['compound']

In [3]:
def remove_stopword(text):
    new_text = []
    for e in text:
        if e not in stopwords and e.isalpha():
            new_text.append(e)
    text = new_text
    return " ".join(new_text)

def stemm(text):
    text = [stemmer.stem(word) for word in text.split()]
    return " ".join(text)

def contains_punctuation(text):
    punctuation = set(string.punctuation)
    for character in text:
        if character in punctuation:
            return True
    return False

def amount_of_punctuation(text):
    punctuation = set(string.punctuation)
    amount = 0
    for character in text:
        if character in punctuation: amount += 1
    return amount

def get_adjectives(text):
    blob = TextBlob(text)
    return len([word for (word,tag) in blob.tags if tag.startswith("JJ")])

def get_nouns(text):
    blob = TextBlob(text)
    return len([word for (word,tag) in blob.tags if tag.startswith("NN")])

def get_verbs(text):
    blob = TextBlob(text)
    return len([word for (word,tag) in blob.tags if tag.startswith("VB")])

def get_adverbs(text):
    blob = TextBlob(text)
    return len([word for (word,tag) in blob.tags if tag.startswith("RB")])

In [4]:
tweets = pd.read_csv("train.csv", usecols=['id','text', 'target'])
test = pd.read_csv("test.csv")

In [5]:
tweets.drop_duplicates(subset = 'text', keep = False, inplace = True)
tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7434 entries, 0 to 7612
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      7434 non-null   int64 
 1   text    7434 non-null   object
 2   target  7434 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 232.3+ KB


### Fichur Inginierin


In [6]:
tweets_metrics = tweets[['id','text','target']]
tweets_metrics['text_without_stopwords'] = tweets_metrics['text'].str.split()
tweets_metrics['text_without_stopwords'] = tweets_metrics['text_without_stopwords'].apply(remove_stopword)

tweets_metrics['length'] = tweets_metrics['text'].apply(lambda x: len(x))
tweets_metrics['avg_word_length'] = tweets_metrics['text'].str.split().apply(lambda x: [len(y) for y in x]).transform(lambda x: np.mean(x))
tweets_metrics['amount_of_words'] = tweets_metrics['text'].str.split().transform(lambda x: len(x))
unique_words_by_tweet = tweets_metrics['text'].transform(lambda x: x.split()).transform(lambda x: pd.Series(x).unique()).transform(lambda x: len(x))
tweets_metrics['amount_of_unique_words'] = unique_words_by_tweet
tweets_metrics['sentiment'] = tweets_metrics['text'].apply(lambda x: return_sia_compound_values(x))
tweets_metrics['stopwords_count'] = tweets_metrics['text'].apply(lambda x: len([word for word in str(x).lower().split() if word in stopwords]))
tweets_metrics['punctuation_count'] = tweets_metrics['text'].apply(lambda x: amount_of_punctuation(x))
mentions = tweets_metrics['text'].str.findall(r'@.\S*?(?=\s|[:]|$)').to_frame()
tweets_metrics['mentions_count'] = mentions['text'].apply(lambda x: len(x))
hashtags = tweets_metrics['text'].str.findall(r'#[^?\s].*?(?=\s|$)')
tweets_metrics['hashtags_count'] = hashtags.apply(lambda x: len(x))
tweets_metrics['longest_word_length_without_stopwords'] = tweets_metrics['text_without_stopwords'].apply(lambda x: ([len(word) for word in str(x).lower().split() if not word.startswith('http')])).apply(lambda x: max(x) if len(x) > 0 else 0)
tweets_metrics['stopword_word_ratio'] = tweets_metrics['stopwords_count'] / tweets_metrics['amount_of_words']

tweets_metrics['adjectives_count'] = tweets_metrics['text'].apply(get_adjectives)
tweets_metrics['nouns_count'] = tweets_metrics['text'].apply(get_nouns)
tweets_metrics['verbs_count'] = tweets_metrics['text'].apply(get_verbs)
tweets_metrics['adverbs_count'] = tweets_metrics['text'].apply(get_adverbs)

tweets_metrics.head()

Unnamed: 0,id,text,target,text_without_stopwords,length,avg_word_length,amount_of_words,amount_of_unique_words,sentiment,stopwords_count,punctuation_count,mentions_count,hashtags_count,longest_word_length_without_stopwords,stopword_word_ratio,adjectives_count,nouns_count,verbs_count,adverbs_count
0,1,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds Reason May ALLAH Forgive us,69,4.384615,13,13,0.2732,6,1,0,1,7,0.461538,0,6,1,0
1,4,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Canada,38,4.571429,7,7,-0.34,0,1,0,0,6,0.0,0,6,0,0
2,5,All residents asked to 'shelter in place' are ...,1,All residents asked notified No evacuation she...,133,5.090909,22,20,-0.296,11,3,0,0,10,0.5,1,7,7,0
3,6,"13,000 people receive #wildfires evacuation or...",1,people receive evacuation orders California,65,7.125,8,8,0.0,1,2,0,1,10,0.125,1,4,1,0
4,7,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent photo Ruby smoke pours school,88,4.5,16,15,0.0,7,2,0,2,6,0.4375,0,6,3,1


In [7]:
test_metrics = test[['id','text']]
test_metrics['text_without_stopwords'] = test_metrics['text'].str.split()
test_metrics['text_without_stopwords'] = test_metrics['text_without_stopwords'].apply(remove_stopword)

test_metrics['length'] = test['text'].apply(lambda x: len(x))
test_metrics['avg_word_length'] = test_metrics['text'].str.split().apply(lambda x: [len(y) for y in x]).transform(lambda x: np.mean(x))
test_metrics['amount_of_words'] = test_metrics['text'].str.split().transform(lambda x: len(x))
unique_words_by_tweet = test_metrics['text'].transform(lambda x: x.split()).transform(lambda x: pd.Series(x).unique()).transform(lambda x: len(x))
test_metrics['amount_of_unique_words'] = unique_words_by_tweet
test_metrics['sentiment'] = test_metrics['text'].apply(lambda x: return_sia_compound_values(x))
test_metrics['stopwords_count'] = test_metrics['text'].apply(lambda x: len([word for word in str(x).lower().split() if word in stopwords]))
test_metrics['punctuation_count'] = test_metrics['text'].apply(lambda x: amount_of_punctuation(x))
mentions = test_metrics['text'].str.findall(r'@.\S*?(?=\s|[:]|$)').to_frame()
test_metrics['mentions_count'] = mentions['text'].apply(lambda x: len(x))
hashtags = test_metrics['text'].str.findall(r'#[^?\s].*?(?=\s|$)')
test_metrics['hashtags_count'] = hashtags.apply(lambda x: len(x))
test_metrics['longest_word_length_without_stopwords'] = test_metrics['text_without_stopwords'].apply(lambda x: ([len(word) for word in str(x).lower().split() if not word.startswith('http')])).apply(lambda x: max(x) if len(x) > 0 else 0)
test_metrics['stopword_word_ratio'] = test_metrics['stopwords_count'] / test_metrics['amount_of_words']

test_metrics['adjectives_count'] = test_metrics['text'].apply(get_adjectives)
test_metrics['nouns_count'] = test_metrics['text'].apply(get_nouns)
test_metrics['verbs_count'] = test_metrics['text'].apply(get_verbs)
test_metrics['adverbs_count'] = test_metrics['text'].apply(get_adverbs)

test_metrics.head()

Unnamed: 0,id,text,text_without_stopwords,length,avg_word_length,amount_of_words,amount_of_unique_words,sentiment,stopwords_count,punctuation_count,mentions_count,hashtags_count,longest_word_length_without_stopwords,stopword_word_ratio,adjectives_count,nouns_count,verbs_count,adverbs_count
0,0,Just happened a terrible car crash,Just happened terrible car crash,34,4.833333,6,6,-0.7003,2,0,0,0,8,0.333333,1,2,1,1
1,2,"Heard about #earthquake is different cities, s...",Heard different stay safe,64,6.222222,9,9,0.4404,2,3,0,1,9,0.222222,2,4,2,0
2,3,"there is a forest fire at spot pond, geese are...",forest fire spot geese fleeing across I cannot...,96,4.105263,19,19,-0.6159,9,2,0,0,7,0.473684,2,4,4,1
3,9,Apocalypse lighting. #Spokane #wildfires,Apocalypse,40,9.25,4,4,0.0,0,3,0,2,10,0.0,0,4,0,0
4,11,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills China Taiwan,45,4.75,8,8,-0.5423,2,0,0,0,8,0.25,0,4,1,0


## LSTM

In [8]:
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

stopwords = stopwords.words('english')
stemmer = SnowballStemmer('english')

def remove_stopword(text):
    new_text = []
    for e in text:
        if e not in stopwords and e.isalpha():
            new_text.append(e)
    text = new_text
    return " ".join(new_text)

def stemm(text):
    text = [stemmer.stem(word) for word in text.split()]
    return " ".join(text)

In [9]:
tweets_metrics['text'] = tweets_metrics['text'].apply(lambda x: x.translate({ord(i): ' ' for i in string.punctuation}))
tweets_metrics['text'] = tweets_metrics['text'].apply(lambda x: x.lower())
tweets_metrics['text'] = tweets_metrics['text'].str.split()
tweets_metrics['text'] = tweets_metrics['text'].apply(remove_stopword)
tweets_metrics['text'] = tweets_metrics['text'].apply(stemm)
tweets_metrics.head()

Unnamed: 0,id,text,target,text_without_stopwords,length,avg_word_length,amount_of_words,amount_of_unique_words,sentiment,stopwords_count,punctuation_count,mentions_count,hashtags_count,longest_word_length_without_stopwords,stopword_word_ratio,adjectives_count,nouns_count,verbs_count,adverbs_count
0,1,deed reason earthquak may allah forgiv us,1,Our Deeds Reason May ALLAH Forgive us,69,4.384615,13,13,0.2732,6,1,0,1,7,0.461538,0,6,1,0
1,4,forest fire near la rong sask canada,1,Forest fire near La Ronge Canada,38,4.571429,7,7,-0.34,0,1,0,0,6,0.0,0,6,0,0
2,5,resid ask shelter place notifi offic evacu she...,1,All residents asked notified No evacuation she...,133,5.090909,22,20,-0.296,11,3,0,0,10,0.5,1,7,7,0
3,6,peopl receiv wildfir evacu order california,1,people receive evacuation orders California,65,7.125,8,8,0.0,1,2,0,1,10,0.125,1,4,1,0
4,7,got sent photo rubi alaska smoke wildfir pour ...,1,Just got sent photo Ruby smoke pours school,88,4.5,16,15,0.0,7,2,0,2,6,0.4375,0,6,3,1


In [54]:
tweets_metrics.iloc[:,4:]

Unnamed: 0,length,avg_word_length,amount_of_words,amount_of_unique_words,sentiment,stopwords_count,punctuation_count,mentions_count,hashtags_count,longest_word_length_without_stopwords,stopword_word_ratio,adjectives_count,nouns_count,verbs_count,adverbs_count
0,69,4.384615,13,13,0.2732,6,1,0,1,7,0.461538,0,6,1,0
1,38,4.571429,7,7,-0.3400,0,1,0,0,6,0.000000,0,6,0,0
2,133,5.090909,22,20,-0.2960,11,3,0,0,10,0.500000,1,7,7,0
3,65,7.125000,8,8,0.0000,1,2,0,1,10,0.125000,1,4,1,0
4,88,4.500000,16,15,0.0000,7,2,0,2,6,0.437500,0,6,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7604,136,6.210526,19,19,-0.6841,6,12,0,1,10,0.315789,0,13,3,0
7605,114,3.423077,26,25,-0.4939,16,1,0,0,8,0.615385,2,4,5,3
7606,121,5.100000,20,18,-0.7650,1,11,0,0,8,0.050000,0,14,0,0
7608,83,6.636364,11,11,-0.4939,2,5,0,0,8,0.181818,2,6,1,0


In [24]:
from sklearn.preprocessing import LabelEncoder
X_train = tweets_metrics.text
Y_train = tweets_metrics.target
le = LabelEncoder()
Y_train = le.fit_transform(Y_train)
Y_train = Y_train.reshape(-1,1)

In [25]:
#Comentar para generar submit
X_train,X_test,Y_train,Y_test = train_test_split(X_train,Y_train,test_size=0.25)

In [99]:
max_words = 10000
max_len = 100

In [26]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

In [27]:
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, Bidirectional, Concatenate, Flatten
from keras.models import Model,Sequential
from keras.callbacks import EarlyStopping

In [133]:
#layer = LSTM(256,return_sequences=True)(layer)
def RNN():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words,50,input_length=max_len)(inputs)
    layer = Bidirectional(LSTM(16,return_sequences=True))(layer)
    layer = Bidirectional(LSTM(4))(layer)
    layer = Dense(64)(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.2)(layer)
    layer = Dense(1)(layer)
    layer = Activation('sigmoid')(layer)

    model = Model(inputs=inputs,outputs=layer)
    return model

In [127]:
model = RNN()
model.summary()

Model: "model_34"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          [(None, 100)]             0         
_________________________________________________________________
embedding_34 (Embedding)     (None, 100, 50)           500000    
_________________________________________________________________
bidirectional_56 (Bidirectio (None, 100, 1024)         2306048   
_________________________________________________________________
dense_66 (Dense)             (None, 100, 64)           65600     
_________________________________________________________________
activation_62 (Activation)   (None, 100, 64)           0         
_________________________________________________________________
dropout_30 (Dropout)         (None, 100, 64)           0         
_________________________________________________________________
dense_67 (Dense)             (None, 100, 1)            65 

In [None]:
#Sin features
model.compile(loss='binary_crossentropy',optimizer="adam",metrics=['accuracy'])
model.fit(sequences_matrix,Y_train,batch_size=71,epochs=10,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss')])

#Comentar para generar submit - Sin features
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)
accr = model.evaluate(test_sequences_matrix,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

In [134]:
#Multiples parametros
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence


X_train = tweets_metrics.iloc[:,4:]
X_train["text"] = tweets_metrics["text"]

Y_train = tweets_metrics.target
le = LabelEncoder()
Y_train = le.fit_transform(Y_train)
Y_train = Y_train.reshape(-1,1)

X_train,X_test,Y_train,Y_test = train_test_split(X_train,Y_train,test_size=0.25)


tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train["text"])

sequences = tok.texts_to_sequences(X_train["text"])
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

test_sequences = tok.texts_to_sequences(X_test["text"])
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)


features = StandardScaler()
X_train_features = features.fit_transform(X_train.iloc[:,:-1])
X_test_features = features.transform(X_test.iloc[:,:-1])
model = RNN()
model.compile(loss='binary_crossentropy',optimizer="adam",metrics=['accuracy'])
model.fit([sequences_matrix,X_train_features],Y_train,batch_size=24,epochs=10,validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss')],verbose=1)


accr = model.evaluate([test_sequences_matrix,X_test_features],Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Epoch 1/10
Epoch 2/10
Test set
  Loss: 0.449
  Accuracy: 0.808


## LSTM - TEST.csv (no tiene features)

In [None]:
test_metrics['text'] = test_metrics['text'].apply(lambda x: x.translate({ord(i): ' ' for i in string.punctuation}))
test_metrics['text'] = test_metrics['text'].apply(lambda x: x.lower())
test_metrics['text'] = test_metrics['text'].str.split()
test_metrics['text'] = test_metrics['text'].apply(remove_stopword)
test_metrics['text'] = test_metrics['text'].apply(stemm)
test_metrics.head()

In [None]:
X_test = tweets_test.text
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

In [None]:
submission = pd.DataFrame()
submission1['id'] = tweets_test['id']
submission1['prob'] = model.predict(test_sequences_matrix)
submission1['target'] = submission1['prob'].apply(lambda x: 0 if x < .5 else 1)
del submission1["prob"]
submission1.head(10)

In [None]:
submission1.to_csv("submit_prueba_7.csv", index=False)

## Logistic Regresion

In [None]:
#BASE PARA TODOS LOS METODOS
from sklearn.linear_model import LogisticRegression
x_train = tweets_metrics.text
y_train = tweets_metrics.target
x_train,x_test,y_train,y_test = train_test_split(x_train,y_train,test_size=0.25)
model = LogisticRegression()

### CountVectorizer

In [None]:
from sklearn.linear_model import LogisticRegression
x_train = tweets_metrics.text
y_train = tweets_metrics.target
x_train,x_test,y_train,y_test = train_test_split(x_train,y_train,test_size=0.25)
model = LogisticRegression()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer='word')
vectorizer.fit(x_train)
x_train = vectorizer.transform(x_train)
x_test  = vectorizer.transform(x_test)

model.fit(x_train, y_train)
score = model.score(x_test, y_test)

print("Presicion:", score)

### TF-IDF

In [None]:
from sklearn.linear_model import LogisticRegression
x_train = tweets_metrics.text
y_train = tweets_metrics.target
x_train,x_test,y_train,y_test = train_test_split(x_train,y_train,test_size=0.25)
model = LogisticRegression()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

model = LogisticRegression()    
vectorizer = TfidfVectorizer(analyzer="word", smooth_idf = True)
vectorizer.fit(x_train)
x_train = vectorizer.transform(x_train)
x_test  = vectorizer.transform(x_test)

model.fit(x_train, y_train)
score = model.score(x_test, y_test)

print("Presicion:", score)

### Hashing Vectorizer

In [None]:
from sklearn.linear_model import LogisticRegression
x_train = tweets_metrics.text
y_train = tweets_metrics.target
x_train,x_test,y_train,y_test = train_test_split(x_train,y_train,test_size=0.25)
model = LogisticRegression()

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer

vectorizer = HashingVectorizer(analyzer="word")
vectorizer.fit(x_train)
x_train = vectorizer.transform(x_train)
x_test  = vectorizer.transform(x_test)

model.fit(x_train, y_train)
score = model.score(x_test, y_test)

print("Presicion:", score)