In [357]:
import pandas as pd
import numpy as np
from scipy import sparse

from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from sklearn import datasets
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn import metrics

In [358]:
train = pd.read_csv("csv_lio/train_clean.csv")
test  = pd.read_csv("csv_lio/test_clean.csv")
train = train.drop(columns = 'Unnamed: 0')
test  = test.drop(columns = 'Unnamed: 0')

## Split of the train set

In [377]:
X = train.text.values.astype('U')
y = train.target.values.astype('U')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2020)

## Extracting features from text files

In [378]:
# BOW is created (key:value) - word occurrance
# "key" is a specific word
# The index value of a word in the vocabulary is linked to its frequency in the whole training corpus.
count_vect = CountVectorizer(analyzer='word',binary=True)
X_train_counts = count_vect.fit_transform(X_train) #.values.astype('U'))
X_train_counts.shape

(6090, 16309)

In [379]:
# TF-IDF
tfidf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(6090, 16309)

## Training a classifier - no pipeline

In [380]:
# Multinomial Naive Bayes
# y_train :  es la columna target
# X_train_tfidf : es el campo "text" entrenado
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [381]:
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

predicted = clf.predict(X_test_tfidf)


In [382]:
print(len(predicted))

1523


In [383]:
# imprimimos las predicciones para el X_test
for tweet, category in zip(X_test, predicted):
    print('%r => %s' % (tweet, y_train))

'   fouseytube -PRON- ambulance hahahah zsberqnn' => ['0' '0' '0' ... '1' '1' '0']
'   jusstdoitgirl -PRON- shit tryna trouble wbu homie' => ['0' '0' '0' ... '1' '1' '0']
'   bovine rescue smuggler nab fn ruu' => ['0' '0' '0' ... '1' '1' '0']
' storm fuck -PRON- -PRON- cool' => ['0' '0' '0' ... '1' '1' '0']
'   abc parent colorado theater shoot victim fear copycat massacre lvlh awo antioch viwxy xdyk' => ['0' '0' '0' ... '1' '1' '0']
' join providence health team -PRON- nursing hzemldpu torrance hire' => ['0' '0' '0' ... '1' '1' '0']
' -PRON- engulf image quiz cimubweyib fst bd' => ['0' '0' '0' ... '1' '1' '0']
' loner diarie pattern sand blow photo choke flame ekfaz wvbz' => ['0' '0' '0' ... '1' '1' '0']
' drive dust storm sruj zljgl' => ['0' '0' '0' ... '1' '1' '0']
' emergency unit simulate chemical explosion nu evanston kfyebhb' => ['0' '0' '0' ... '1' '1' '0']
' fire truck parking' => ['0' '0' '0' ... '1' '1' '0']
' ra message condolence vietnam natural disaster behalf cuban gover

'   friggin wreck destiel suck vine description mkx ux ozt' => ['0' '0' '0' ... '1' '1' '0']
'   reason -PRON- love woman military lulgzimbestpict iapvtqxlht wamkre ckd' => ['0' '0' '0' ... '1' '1' '0']
'   fatality mileena -PRON- sad' => ['0' '0' '0' ... '1' '1' '0']
'   action ferguson upheaval tbqsqtmqv' => ['0' '0' '0' ... '1' '1' '0']
' -PRON- mlg food collide ijustine salmon alaskaseafood askforalaska snyghaivs' => ['0' '0' '0' ... '1' '1' '0']
' providence health emergency supervisor emergency kodiak ak aqcsusqbdy healthcare' => ['0' '0' '0' ... '1' '1' '0']
'   fedex transport bioterror germ wake anthrax lab mishap sicmyrmh' => ['0' '0' '0' ... '1' '1' '0']
' dq cotton candy blizzard' => ['0' '0' '0' ... '1' '1' '0']
' cancel -PRON- cat doctor appointment -PRON- decide curfew' => ['0' '0' '0' ... '1' '1' '0']
' deputies shoot brighton ablaze gwnrhmso' => ['0' '0' '0' ... '1' '1' '0']
' nigeria rainstorm destroy house yobe daily trust damaturu bbqnk qus' => ['0' '0' '0' ... '1' 

' dress meme officially explode internet drsmxw cr' => ['0' '0' '0' ... '1' '1' '0']
'   dylanmcclure zumiez zw jp location' => ['0' '0' '0' ... '1' '1' '0']
' obama declare disaster typhoon devastate saipan president barack obama declare commonwealth olzv bv' => ['0' '0' '0' ... '1' '1' '0']
' zouma absolutely flatten' => ['0' '0' '0' ... '1' '1' '0']
' -PRON- engulfed image quiz yzdmouxqbo pexfgawrg' => ['0' '0' '0' ... '1' '1' '0']
' walmart step child safe hot vehicle innovative seat nevglufm' => ['0' '0' '0' ... '1' '1' '0']
'   zourryart forget add burn building scream baby' => ['0' '0' '0' ... '1' '1' '0']
'   xlfi ovhfj -PRON- pick body water rescuer search raq zpdvke' => ['0' '0' '0' ... '1' '1' '0']
'   originalfunko spencer thunder buddys thunder buddys' => ['0' '0' '0' ... '1' '1' '0']
' poor jack' => ['0' '0' '0' ... '1' '1' '0']
' yup sink spvp' => ['0' '0' '0' ... '1' '1' '0']
' -PRON- careless match forest fire -PRON- box campfire' => ['0' '0' '0' ... '1' '1' '0']
' pak

' islamic egypt threaten kill croat hostage nzifztcugl' => ['0' '0' '0' ... '1' '1' '0']
' crush -PRON- ewnunp hdo' => ['0' '0' '0' ... '1' '1' '0']
' survive market lnzi oil invest money trade forex gold silver business tmpfwjpi' => ['0' '0' '0' ... '1' '1' '0']
' coastal german shepherd rescue oc share link qwnglkos animalrescue idc ubj' => ['0' '0' '0' ... '1' '1' '0']
'   gas explode scream street smell gas' => ['0' '0' '0' ... '1' '1' '0']
' drunk meal cook -PRON- totally obliterated qvs bg' => ['0' '0' '0' ... '1' '1' '0']
' migrant drown sea boat capsize libya pv nroov pseylyzck' => ['0' '0' '0' ... '1' '1' '0']
' credit pfannebecker inspire -PRON- rediscover fantabulous tbt wmhy xkil' => ['0' '0' '0' ... '1' '1' '0']
'   irishspy -PRON- ally suck casualty' => ['0' '0' '0' ... '1' '1' '0']
'   bigburgerboi flat football -PRON- flatten spartan crush -PRON- hail' => ['0' '0' '0' ... '1' '1' '0']
' wreckage conclusively confirm mh malaysia investigator family ebpybfh' => ['0' '0' '

## Same, but using Pipeline

In [384]:
# Many fitters can be passed, but the last parameter MUST BE a classifier
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
    ('vect', CountVectorizer(analyzer='word',binary=True)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(alpha=1)),
])

In [385]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=True,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1, class_prior=None, fit_prior=True))],
         verbose=False)

## PERFORMANCE

### With MultinomialNaiveBayes

In [386]:
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [389]:
docs_test = X_test
predicted_MNB = text_clf.predict(docs_test)
metrics.accuracy_score(y_test,predicted_MNB)

0.8102429415627052

### With SDG Classifier

In [398]:
# SDG Classifier, no TF-IDF
# GridSearch states that TF-IDF=False is better
text_clf = Pipeline([
    ('vect', CountVectorizer(analyzer='word',binary=True)),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=0.01, random_state=42,
                          max_iter=5, tol=None)),
])

text_clf.fit(X_train, y_train)
docs_test = X_test
predicted_SGDC = text_clf.predict(docs_test)
metrics.accuracy_score(y_test,predicted_SGDC)

0.7741300065659882

### With Logistic Regression

In [417]:
# LR Classifier, no TF-IDF
# GridSearch states that TF-IDF=False is better
text_clf = Pipeline([
    ('vect', CountVectorizer(analyzer='word',binary=True)),
    ('clf', LogisticRegression()),
])

text_clf.fit(X_train, y_train)
docs_test = X_test
predicted_LR = text_clf.predict(docs_test)
metrics.accuracy_score(y_test,predicted_LR)

0.8187787261982928

## ALL Classifiers: Scores

In [418]:
def runClassifiers(X_train, y_train, X_test):
    classifiers =[LogisticRegression(), MultinomialNB(),DecisionTreeClassifier(),
        KNeighborsClassifier(),RandomForestClassifier(), SGDClassifier()]
    classifier_names = ['Logistic Regression','MultinomialNB','Decision-Tree Classfier','K-Neighbors Classifier',
                       'Random-Forest Classifier','SGDC Classifier']
    i=-1
    for classifier in classifiers:
        i = i + 1
        text_clf = Pipeline([
        ('vect', CountVectorizer(analyzer='word',binary=True)),#('tfidf', TfidfTransformer()),
        ('actualClassifier',classifier )])
        text_clf.fit(X_train, y_train)
        docs_test = X_test
        predicted = text_clf.predict(docs_test)
        print(classifier_names[i],np.mean(predicted == y_test))

In [419]:
runClassifiers(X_train, y_train, X_test)

Logistic Regression 0.8187787261982928
MultinomialNB 0.8049901510177282
Decision-Tree Classfier 0.7636244254760342
K-Neighbors Classifier 0.6913985554826001
Random-Forest Classifier 0.7898883782009193
SGDC Classifier 0.7964543663821405


# GRID SEARCH

In [297]:
from sklearn.model_selection import GridSearchCV
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2),(1, 3)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

### SGDClassifier

In [298]:
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),
])

In [293]:
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)


In [294]:
gs_clf = gs_clf.fit(X_train, y_train)

In [299]:
print('Best Score: ',gs_clf.best_score_)

Best Score:  0.7689655172413794


In [302]:
print('SGDC Classifier')
for param_name in sorted(parameters.keys()):
    print("   %s: %r" % (param_name, gs_clf.best_params_[param_name]))

SGDC Classifier
   clf__alpha: 0.01
   tfidf__use_idf: False
   vect__ngram_range: (1, 2)


# LOGISTIC REGRESSION - ON TOTAL TRAIN AND REAL TEST

In [421]:
X = train.text.values.astype('U')
y = train.target.values.astype('U')

In [422]:
# LR Classifier, no TF-IDF
# GridSearch states that TF-IDF=False is better
text_clf = Pipeline([
    ('vect', CountVectorizer(analyzer='word',binary=True)),
    ('clf', LogisticRegression()),
])

In [423]:
text_clf.fit(X, y)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=True,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                           

In [425]:
test  = pd.read_csv("csv_lio/test_clean.csv")
test  = test.drop(columns = 'Unnamed: 0')

In [426]:
predicted_LR = text_clf.predict(test.text.values.astype('U'))
predicted_LR

array(['0', '1', '1', ..., '1', '1', '0'], dtype='<U21')

In [427]:
sample_submission = pd.read_csv('original_data/sample_submission.csv')

In [428]:
sample_submission.target = predicted_LR
sample_submission = sample_submission.set_index('id')

In [430]:
sample_submission.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,0
2,1
3,1
9,1
11,1


In [431]:
sample_submission.to_csv('pepito_cibrian_reloaded.csv')