In [15]:
# importacion general de librerias y de visualizacion (matplotlib y seaborn)
import pandas as pd
import numpy as np
import random
import re

pd.options.display.float_format = '{:20,.2f}'.format # suprimimos la notacion cientifica en los outputs

import warnings
warnings.filterwarnings('ignore')

In [2]:
twt_data = pd.read_csv('~/Documents/Datos/DataSets/TP2/train.csv')

In [5]:
twt_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
pattern = re.compile("(?P<url>https?://[^\s]+)")
def remove_link(twt):
    return pattern.sub("r ", twt)

In [None]:
twt_data['text'] = twt_data['text'].map(remove_link)

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [25]:
train, test = train_test_split(twt_data, test_size=0.2)

In [18]:
count_vect = CountVectorizer(stop_words='english')
X_train_counts = count_vect.fit_transform(twt_data.text)
X_train_counts.shape

(7613, 16763)

In [19]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(7613, 16763)

In [20]:
count_vect.get_feature_names()

['00',
 '000',
 '0000',
 '00end',
 '00pm',
 '01',
 '02',
 '0215',
 '02pm',
 '03',
 '030',
 '033',
 '034',
 '039',
 '04',
 '05',
 '05th',
 '06',
 '060',
 '061',
 '06jst',
 '07',
 '08',
 '0840728',
 '0853',
 '087809233445',
 '0880',
 '09',
 '0l',
 '0npzp',
 '10',
 '100',
 '1000',
 '10000',
 '100000',
 '1000s',
 '1008pla',
 '1008planet',
 '100bn',
 '100mb',
 '100nd',
 '100s',
 '101',
 '1017',
 '1023',
 '1028',
 '103',
 '105',
 '106',
 '1061thetwister',
 '107',
 '109',
 '10am',
 '10k',
 '10km',
 '10m',
 '10news',
 '10pm',
 '10th',
 '10w',
 '10x',
 '11',
 '11000',
 '11000000',
 '111020',
 '114',
 '1141',
 '1145',
 '115',
 '115film',
 '119000',
 '11am',
 '12',
 '1200',
 '12000',
 '120000',
 '1200000',
 '1236',
 '124',
 '125',
 '129',
 '12am',
 '12hr',
 '12jst',
 '12m',
 '12mm',
 '12news',
 '12pm',
 '12th',
 '12u',
 '12v',
 '13',
 '130',
 '13000',
 '133',
 '138',
 '139055',
 '13pm',
 '14',
 '140',
 '14000',
 '14028',
 '141',
 '143',
 '148',
 '149',
 '14th',
 '15',
 '150',
 '1500',
 '150002703

In [26]:
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                    ])

In [27]:
text_clf = text_clf.fit(train.text, train.target)

In [29]:
predicted = text_clf.predict(test.text)
np.mean(predicted == test.target)

0.8003939592908733

In [32]:
text_clf_svm = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',
                                                   alpha=1e-3, random_state=42)),
                        ])

In [33]:
text_clf_svm = text_clf_svm.fit(train.text, train.target)

In [34]:
predicted_svm = text_clf_svm.predict(test.text)
np.mean(predicted_svm == test.target)

0.7787261982928431

In [37]:
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
             }
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(train.text, train.target)

In [40]:
gs_clf.best_score_

0.7776683087027915

In [41]:
gs_clf.best_params_

{'clf__alpha': 0.01, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}

In [74]:
text_clf_improved = Pipeline([('vect', CountVectorizer()),
                              ('tfidf', TfidfTransformer()),
                              ('clf', MultinomialNB()),
                             ])

In [75]:
text_clf = text_clf_improved.fit(train.text, train.target)

In [76]:
predicted = text_clf_improved.predict(test.text)
np.mean(predicted == test.target)

0.8017071569271176