In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
# Raise the limit on how many columns pandas will display to 1000.
pd.set_option('display.max_columns', 1000)

# Read the training and test sets from CSV files under the relative data/ folder.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [2]:
import nltk
import re
from string import punctuation

In [3]:
# NLTK Treebank word tokenizer; a later cell uses it to split each tweet into tokens.
tokenizer = nltk.tokenize.TreebankWordTokenizer()
# Bare expression: render the training frame as this cell's output.
train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [4]:
def _scrub_tweet(raw):
    """Return one tweet with handles, hash signs, digits, link debris and stray symbols deleted.

    The substitutions run in a fixed order, because later patterns operate on
    what earlier ones left behind (for example 'http' is deleted before the
    '//t.co/' pattern is tried).
    """
    # Handles and hash signs are removed first, then the text is lower-cased.
    text = re.sub(r'@[a-zA-Z_]+', '', raw)
    text = re.sub('#', '', text)
    text = text.lower()

    # Every remaining pattern is replaced by the empty string, in this order.
    for pattern in (
        r'\d+',              # runs of digits
        'http',              # the literal text 'http'
        '//t.co/.*',         # NOTE(review): `.*` also deletes everything after the link on that line
        '/+',                # runs of slashes
        '[;.,–-]',           # semicolon, period, comma, en dash, hyphen
        '[.+]',              # character class: a period or a plus sign
        chr(0x89) + '.*',    # NOTE(review): from a 0x89 character through the end of that line
        r'\*+',              # runs of asterisks
        r'\|+',              # runs of pipes
        r'_',                # underscores
        r'å¤',               # this exact two-character sequence
        r'ìü',               # this exact two-character sequence
    ):
        text = re.sub(pattern, '', text)
    return text


# Single pass over the column; the cleaned text overwrites train['text'].
train['text'] = train['text'].apply(_scrub_tweet)

In [5]:
# Copy the 'text' column of `train` into its own Series (already cleaned when cells run in order).
cleared_train = train['text'].copy()
# Bare expression: show the copied Series as this cell's output.
cleared_train

0       our deeds are the reason of this earthquake ma...
1                   forest fire near la ronge sask canada
2       all residents asked to 'shelter in place' are ...
3        people receive wildfires evacuation orders in...
4       just got sent this photo from ruby alaska as s...
                              ...                        
7608    two giant cranes holding a bridge collapse int...
7609      the out of control wild fires in california ...
7610                   m [: utc]?km s of volcano hawaii :
7611    police investigating after an ebike collided w...
7612    the latest: more homes razed by northern calif...
Name: text, Length: 7613, dtype: object

In [6]:
# Run the tokenizer over every cleaned tweet; each element of the Series is a list of tokens.
toknized_train = pd.Series(list(map(tokenizer.tokenize, cleared_train)))

In [7]:
# For every tweet keep only the tokens that are 2 to 15 characters long and
# that do not occur inside the `punctuation` string.
# NOTE(review): `punctuation` is a str, so `not in` is a substring test here,
# not a per-character membership test.
new_tokenized_train = [
    [token for token in tokens
     if 1 < len(token) < 16 and token not in punctuation]
    for tokens in toknized_train
]

In [21]:
# Glue each filtered token list back into one space-separated string.
clear_data = [' '.join(tokens) for tokens in new_tokenized_train]

# Pair the rebuilt text with the 'target' column of `train`, exposed as 'labels'.
data = pd.DataFrame(dict(text=clear_data, labels=train['target']))

In [25]:
# Bare expression: render the text/labels frame as this cell's output.
data

Unnamed: 0,text,labels
0,our deeds are the reason of this earthquake ma...,1
1,forest fire near la ronge sask canada,1
2,all residents asked to 'shelter in place are b...,1
3,people receive wildfires evacuation orders in ...,1
4,just got sent this photo from ruby alaska as s...,1
...,...,...
7608,two giant cranes holding bridge collapse into ...,1
7609,the out of control wild fires in california ev...,1
7610,utc km of volcano hawaii,1
7611,police investigating after an ebike collided w...,1


In [26]:
from nltk.corpus import stopwords

# English stop-word list from NLTK; handed to CountVectorizer in the modelling cells.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded already — confirm.
stop_words = stopwords.words('english')


In [27]:

# Hold out part of the labelled data for a final check. The split is seeded so
# that Restart & Run All reproduces the same partition, and therefore the same
# cross-validation and hold-out scores, on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'], data['labels'], random_state=1
)

# Token counts -> TF-IDF weighting -> support vector classifier.
pipe = Pipeline([('vect', CountVectorizer(min_df = 3, stop_words=stop_words)),
                ('tfidf', TfidfTransformer()),
                ('clf', SVC(random_state=1))])

# Search space: 2 n-gram ranges x 2 IDF settings x 7 values of C = 28 candidates.
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
             'tfidf__use_idf': (True, False),
             'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# Cross-validated grid search; n_jobs=-1 uses every available core.
grid_s = GridSearchCV(pipe, parameters, n_jobs=-1, verbose=1)

# fit() hands back the search object, so grid_s_fit refers to the fitted grid_s.
grid_s_fit = grid_s.fit(X_train, y_train)


Fitting 5 folds for each of 28 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   22.5s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:  1.3min finished


In [28]:
from sklearn.metrics import accuracy_score
# Predict the held-out texts with the fitted grid search object.
predicted = grid_s_fit.predict(X_test)

# Fraction of held-out labels predicted correctly; shown as this cell's output.
accuracy_score(y_test, predicted)

0.7883403361344538

In [31]:
from sklearn.naive_bayes import MultinomialNB

# Token counts -> TF-IDF weighting -> multinomial naive Bayes.
new_pipe = Pipeline(steps=[
    ('vect', CountVectorizer(min_df=3, stop_words=stop_words)),
    ('tfidf', TfidfTransformer()),
    ('NB', MultinomialNB()),
])

# Search space: 2 n-gram ranges x 2 IDF settings x 7 smoothing values = 28 candidates.
new_parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    NB__alpha=[0.001, 0.01, 0.1, 1, 10, 100, 1000],
)

# Cross-validated grid search on the training split, using every available core.
new_grid_s = GridSearchCV(estimator=new_pipe, param_grid=new_parameters,
                          n_jobs=-1, verbose=1)
new_grid_s_fit = new_grid_s.fit(X_train, y_train)

# Hold-out accuracy of the fitted search object; shown as this cell's output.
new_predicted = new_grid_s_fit.predict(X_test)
accuracy_score(y_test, new_predicted)

Fitting 5 folds for each of 28 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:    6.2s finished


0.7830882352941176

In [32]:
# Bare expression: render the text/labels frame as this cell's output.
data

Unnamed: 0,text,labels
0,our deeds are the reason of this earthquake ma...,1
1,forest fire near la ronge sask canada,1
2,all residents asked to 'shelter in place are b...,1
3,people receive wildfires evacuation orders in ...,1
4,just got sent this photo from ruby alaska as s...,1
...,...,...
7608,two giant cranes holding bridge collapse into ...,1
7609,the out of control wild fires in california ev...,1
7610,utc km of volcano hawaii,1
7611,police investigating after an ebike collided w...,1


In [33]:
def _prepare_like_train(raw):
    """Turn one raw tweet into the cleaned, filtered, re-joined form the model was fitted on.

    The classifier was trained on text that had been regex-cleaned, tokenised
    with `tokenizer`, filtered by token length / punctuation and joined back
    with spaces. Predicting on untouched text would feed it a different token
    distribution, so the same steps are applied here in the same order.
    Keep this in step with the training-side cleaning if that ever changes.
    """
    # Regex cleaning: handles and hash signs first, then lower-casing, then the rest.
    text = re.sub(r'@[a-zA-Z_]+', '', raw)
    text = re.sub('#', '', text)
    text = text.lower()
    for pattern in (r'\d+', 'http', '//t.co/.*', '/+', '[;.,–-]', '[.+]',
                    chr(0x89) + '.*', r'\*+', r'\|+', r'_', r'å¤', r'ìü'):
        text = re.sub(pattern, '', text)

    # Token filter: 2 to 15 characters and not a substring of `punctuation`.
    kept = [token for token in tokenizer.tokenize(text)
            if token not in punctuation and 1 < len(token) < 16]
    return ' '.join(kept)


# Clean into a new Series so the raw `test` frame stays untouched.
test_text = test['text'].apply(_prepare_like_train)

# Predict on text that has gone through the same preprocessing as the training text.
target_test = grid_s_fit.predict(test_text)

In [34]:
# Two-column frame: the test ids alongside the predicted targets.
resulted_data = test[['id']].assign(target=target_test)

In [35]:
# Bare expression: render the id/target frame as this cell's output.
resulted_data

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [38]:
# Write the id/target frame to result.csv in the working directory, without the index column.
resulted_data.to_csv('result.csv', index=False)