In [71]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
pd.set_option('display.max_columns', 1000)

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [72]:
import nltk
import re
from string import punctuation

In [73]:
tokenizer = nltk.tokenize.TreebankWordTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
stemmer = nltk.stem.PorterStemmer()
train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [74]:
train['text'] = train['text'].apply(lambda x: re.sub(r'@[a-zA-Z_]+', '', x))
train['text'] = train['text'].apply(lambda x: re.sub('#', '', x))
train['text'] = train['text'].apply(lambda x: x.lower())
train['text'] = train['text'].apply(lambda x: re.sub(r'\d+', '', x))
train['text'] = train['text'].apply(lambda x: re.sub('http', '', x))
train['text'] = train['text'].apply(lambda x: re.sub('//t.co/.*', '', x))
train['text'] = train['text'].apply(lambda x: re.sub('/+', '', x))
train['text'] = train['text'].apply(lambda x: re.sub('[;.,–-]', '', x))
train['text'] = train['text'].apply(lambda x: re.sub('[.+]', '', x))
train['text'] = train['text'].apply(lambda x: re.sub(chr(0x89) + '.*', '', x))
train['text'] = train['text'].apply(lambda x: re.sub(r'\*+', '', x))
train['text'] = train['text'].apply(lambda x: re.sub(r'\|+', '', x))
train['text'] = train['text'].apply(lambda x: re.sub(r'_', '', x))
train['text'] = train['text'].apply(lambda x: re.sub(r'å¤', '', x))
train['text'] = train['text'].apply(lambda x: re.sub(r'ìü', '', x))

In [75]:
cleared_train = train['text'].copy()
cleared_train

0       our deeds are the reason of this earthquake ma...
1                   forest fire near la ronge sask canada
2       all residents asked to 'shelter in place' are ...
3        people receive wildfires evacuation orders in...
4       just got sent this photo from ruby alaska as s...
                              ...                        
7608    two giant cranes holding a bridge collapse int...
7609      the out of control wild fires in california ...
7610                   m [: utc]?km s of volcano hawaii :
7611    police investigating after an ebike collided w...
7612    the latest: more homes razed by northern calif...
Name: text, Length: 7613, dtype: object

In [76]:
o_toknized_train = pd.Series([tokenizer.tokenize(sent) for sent in cleared_train])

toknized_train = []

for item in o_toknized_train:
    temp = []
    for word in item:
        temp.append(stemmer.stem(word))
        
    toknized_train.append(temp)

toknized_train

[['our',
  'deed',
  'are',
  'the',
  'reason',
  'of',
  'thi',
  'earthquak',
  'may',
  'allah',
  'forgiv',
  'us',
  'all'],
 ['forest', 'fire', 'near', 'la', 'rong', 'sask', 'canada'],
 ['all',
  'resid',
  'ask',
  'to',
  "'shelter",
  'in',
  'place',
  "'",
  'are',
  'be',
  'notifi',
  'by',
  'offic',
  'no',
  'other',
  'evacu',
  'or',
  'shelter',
  'in',
  'place',
  'order',
  'are',
  'expect'],
 ['peopl', 'receiv', 'wildfir', 'evacu', 'order', 'in', 'california'],
 ['just',
  'got',
  'sent',
  'thi',
  'photo',
  'from',
  'rubi',
  'alaska',
  'as',
  'smoke',
  'from',
  'wildfir',
  'pour',
  'into',
  'a',
  'school'],
 ['rockyfir',
  'updat',
  '=',
  '>',
  'california',
  'hwi',
  'close',
  'in',
  'both',
  'direct',
  'due',
  'to',
  'lake',
  'counti',
  'fire',
  'cafir',
  'wildfir'],
 ['flood',
  'disast',
  'heavi',
  'rain',
  'caus',
  'flash',
  'flood',
  'of',
  'street',
  'in',
  'manit',
  'colorado',
  'spring',
  'area'],
 ['i',
  "'m",


In [77]:
new_tokenized_train = []

for sent in toknized_train:
    temp = []
    for word in sent:
        if word not in punctuation and len(word) < 16 and len(word) > 1:
            temp.append(word)
            
    new_tokenized_train.append(temp)

In [78]:
clear_data = []

for item in new_tokenized_train:
    clear_data.append(' '.join(item))
    
data = pd.DataFrame({'text' : clear_data, 'labels' : train['target']})

In [79]:
data

Unnamed: 0,text,labels
0,our deed are the reason of thi earthquak may a...,1
1,forest fire near la rong sask canada,1
2,all resid ask to 'shelter in place are be noti...,1
3,peopl receiv wildfir evacu order in california,1
4,just got sent thi photo from rubi alaska as sm...,1
...,...,...
7608,two giant crane hold bridg collaps into nearbi...,1
7609,the out of control wild fire in california eve...,1
7610,utc km of volcano hawaii,1
7611,polic investig after an ebik collid with car i...,1


In [80]:
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

test['text'] = test['text'].apply(lambda x: re.sub(r'@[a-zA-Z_]+', '', x))
test['text'] = test['text'].apply(lambda x: re.sub('#', '', x))
test['text'] = test['text'].apply(lambda x: x.lower())
test['text'] = test['text'].apply(lambda x: re.sub(r'\d+', '', x))
test['text'] = test['text'].apply(lambda x: re.sub('http', '', x))
test['text'] = test['text'].apply(lambda x: re.sub('//t.co/.*', '', x))
test['text'] = test['text'].apply(lambda x: re.sub('/+', '', x))
test['text'] = test['text'].apply(lambda x: re.sub('[;.,–-]', '', x))
test['text'] = test['text'].apply(lambda x: re.sub('[.+]', '', x))
test['text'] = test['text'].apply(lambda x: re.sub(chr(0x89) + '.*', '', x))
test['text'] = test['text'].apply(lambda x: re.sub(r'\*+', '', x))
test['text'] = test['text'].apply(lambda x: re.sub(r'\|+', '', x))
test['text'] = test['text'].apply(lambda x: re.sub(r'_', '', x))
test['text'] = test['text'].apply(lambda x: re.sub(r'å¤', '', x))
test['text'] = test['text'].apply(lambda x: re.sub(r'ìü', '', x))

cleared_test = test['text'].copy()

o_toknized_test = pd.Series([tokenizer.tokenize(sent) for sent in cleared_test])

toknized_test = []

for item in o_toknized_test:
    temp = []
    for word in item:
        temp.append(stemmer.stem(word))

    toknized_test.append(temp)


new_tokenized_test = []

for sent in toknized_test:
    temp = []
    for word in sent:
        if word not in punctuation and len(word) < 16 and len(word) > 1:
            temp.append(word)

    new_tokenized_test.append(temp)

clear_data_test = []

for item in new_tokenized_test:
    clear_data_test.append(' '.join(item))

clear_data_test

In [81]:

X_train, X_test, y_train, y_test = train_test_split(data['text'], data['labels'])

pipe = Pipeline([('vect', CountVectorizer(min_df = 3, stop_words=stop_words)),
                ('tfidf', TfidfTransformer()),
                ('clf', SVC(random_state=1))])

parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
             'tfidf__use_idf': (True, False),
             'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

grid_s = GridSearchCV(pipe, parameters, n_jobs=-1, verbose=1)

grid_s_fit = grid_s.fit(X_train, y_train)


Fitting 5 folds for each of 28 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   22.4s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:  1.2min finished


In [82]:
from sklearn.metrics import accuracy_score
predicted = grid_s_fit.predict(X_test)


accuracy_score(y_test, predicted)

0.8040966386554622

In [64]:
from sklearn.linear_model import LogisticRegression

new_pipe = Pipeline([('vect', CountVectorizer(min_df = 3, stop_words=stop_words)),
                ('tfidf', TfidfTransformer()),
                ('NB', LogisticRegression())])

new_parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
             'tfidf__use_idf': (True, False),
             'NB__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

new_grid_s = GridSearchCV(new_pipe, new_parameters, n_jobs=-1, verbose=1)

new_grid_s_fit = new_grid_s.fit(X_train, y_train)

new_predicted = new_grid_s_fit.predict(X_test)

accuracy_score(y_test, new_predicted)

Fitting 5 folds for each of 28 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:    9.4s finished


0.803046218487395

In [65]:
data

Unnamed: 0,text,labels
0,our deed are the reason of thi earthquak may a...,1
1,forest fire near la rong sask canada,1
2,all resid ask to 'shelter in place are be noti...,1
3,peopl receiv wildfir evacu order in california,1
4,just got sent thi photo from rubi alaska as sm...,1
...,...,...
7608,two giant crane hold bridg collaps into nearbi...,1
7609,the out of control wild fire in california eve...,1
7610,utc km of volcano hawaii,1
7611,polic investig after an ebik collid with car i...,1


In [83]:
target_test = grid_s_fit.predict(clear_data_test)

In [84]:
resulted_data = pd.DataFrame({'id': test['id'], 'target' : target_test})

In [85]:
resulted_data

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [86]:
resulted_data.to_csv('result.csv', index=False)