In [None]:
# General-purpose imports: data handling (pandas, numpy), randomness and regular expressions
import pandas as pd
import numpy as np
import random
import re

pd.options.display.float_format = '{:20,.2f}'.format # suppress scientific notation in displayed outputs

# Silence every warning for the rest of the session (this also hides deprecation notices).
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the training CSV into a DataFrame and preview its first rows.
twt_data = pd.read_csv(
    filepath_or_buffer='~/Documents/Datos/DataSets/TP2/train_super_featured.csv',
)
twt_data.head()

In [None]:
# Matches http:// and https:// URLs up to the next whitespace character.
# Written as a raw string so that `\s` reaches the regex engine instead of being
# parsed as an (invalid) string escape.
pattern = re.compile(r"(?P<url>https?://[^\s]+)")


def remove_link(twt):
    """Replace every URL in a tweet with a single space.

    Parameters
    ----------
    twt : str
        Tweet text, possibly containing http(s) links.

    Returns
    -------
    str
        ``twt`` with each matched URL substituted by ``" "``.
    """
    # The previous replacement was the literal "r " (a misplaced raw-string
    # prefix), which injected a stray "r" token wherever a link was removed.
    return pattern.sub(" ", twt)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text  import TfidfVectorizer

In [None]:
# Hold out 20% of the rows for evaluation. The split is seeded so the accuracy
# figures computed from `train`/`test` are reproducible across kernel restarts
# (same seed value the SGDClassifier in this notebook uses).
train, test = train_test_split(twt_data, test_size=0.2, random_state=42)

In [None]:
# Bag-of-words term counts, dropping scikit-learn's built-in English stop words.
# NOTE(review): the vectorizer is fitted on the full `twt_data`, not on `train`,
# despite the `X_train_*` naming — confirm this is intended.
count_vect = CountVectorizer(stop_words='english')
X_train_counts = count_vect.fit_transform(raw_documents=twt_data.text_super_cleaned)
X_train_counts.shape

In [None]:
# Re-weight the raw term counts with TF-IDF (learn the IDF weights, then apply them).
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit(X_train_counts).transform(X_train_counts)
X_train_tfidf.shape

In [None]:
# Vocabulary learned by `count_vect`. `get_feature_names()` was removed in
# scikit-learn 1.2, so prefer `get_feature_names_out()` and fall back to the
# legacy method only on installations that predate it.
(count_vect.get_feature_names_out()
 if hasattr(count_vect, 'get_feature_names_out')
 else count_vect.get_feature_names())

In [None]:
# Baseline classifier: token counts -> TF-IDF weighting -> multinomial Naive Bayes.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [None]:
# Train the Naive Bayes pipeline on the training split.
# NOTE(review): this cell reads `text_super_cleaned`/`target`, whereas later cells
# read `clean_text`/`target_label` — confirm both column pairs exist in the CSV.
text_clf = text_clf.fit(X=train['text_super_cleaned'], y=train['target'])

In [None]:
# Accuracy on the held-out split: fraction of predictions equal to the true label.
predicted = text_clf.predict(test['text_super_cleaned'])
(predicted == test['target']).mean()

In [None]:
# Same count -> TF-IDF features, but classified by an SGD-trained linear model
# with hinge loss and an L2 penalty.
text_clf_svm = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
    )),
])

In [None]:
# Train the hinge-loss pipeline on the training split.
text_clf_svm = text_clf_svm.fit(X=train['clean_text'], y=train['target_label'])

In [None]:
# Held-out accuracy of the hinge-loss pipeline.
predicted_svm = text_clf_svm.predict(test['clean_text'])
(predicted_svm == test['target_label']).mean()

In [None]:
# Grid search over the Naive Bayes pipeline; keys follow the
# `<step name>__<parameter>` convention of scikit-learn pipelines.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],  # unigrams only vs. unigrams + bigrams
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}
# n_jobs=-1 lets the search use every available CPU core.
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters, n_jobs=-1)
gs_clf = gs_clf.fit(train['clean_text'], train['target_label'])

In [None]:
# Mean cross-validated score of the best parameter combination found by the grid search.
gs_clf.best_score_

In [None]:
# Parameter combination that achieved the best cross-validated score.
gs_clf.best_params_

In [None]:
# Simplified pipeline: raw token counts fed straight into Naive Bayes (no TF-IDF step).
text_clf_improved = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

In [None]:
# Train the TF-IDF-free pipeline. `text_clf` is rebound to the fitted object, so
# from here on `text_clf` and `text_clf_improved` refer to the same pipeline.
text_clf = text_clf_improved.fit(X=train['clean_text'], y=train['target_label'])

In [None]:
# Held-out accuracy of the TF-IDF-free pipeline.
predicted = text_clf_improved.predict(test['clean_text'])
(predicted == test['target_label']).mean()

In [None]:
# Dense TF-IDF table for the training split: one row per document, one column
# per vocabulary term.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(train.clean_text)
tf_idf_train = pd.DataFrame(
    data=X.toarray(),
    # get_feature_names() was removed in scikit-learn 1.2; use the replacement
    # when available and fall back to the legacy method on older installations.
    columns=(vectorizer.get_feature_names_out()
             if hasattr(vectorizer, 'get_feature_names_out')
             else vectorizer.get_feature_names()),
)
tf_idf_train

In [None]:
# Read the submission (unlabelled) CSV into a DataFrame and preview its first rows.
test_data = pd.read_csv(
    filepath_or_buffer='~/Documents/Datos/DataSets/TP2/test_super_featured.csv',
)
test_data.head()

In [None]:
# Refit the current `text_clf` pipeline on the complete labelled dataset
# (train + held-out rows) before predicting the submission set.
text_clf = text_clf.fit(X=twt_data['clean_text'], y=twt_data['target_label'])

In [None]:
# Predict labels for the submission set. Missing texts are replaced by a single
# space at call time: CountVectorizer raises on NaN documents, and in the
# notebook's cell order the NaN handling only runs after this prediction (and on
# a different column). `fillna` returns a new Series, so `test_data` is untouched.
predicted = text_clf.predict(test_data['clean_text'].fillna(" "))
predicted

In [None]:
# Rows whose `clean_text` is missing: a NaN entry compares unequal to itself.
test_data.loc[test_data['clean_text'].ne(test_data['clean_text'])]

In [None]:
# Replace missing texts with a single space. The result is assigned back because
# `inplace=True` on a column selection acts on a temporary object and does not
# reliably update `test_data` (it is a no-op under pandas copy-on-write).
# NOTE(review): the neighbouring cells inspect `clean_text`, but this fills
# `text_super_cleaned` — confirm which column was meant.
test_data['text_super_cleaned'] = test_data['text_super_cleaned'].fillna(" ")

In [None]:
# Rows whose `clean_text` is still missing (a NaN entry compares unequal to itself).
test_data.loc[test_data['clean_text'].ne(test_data['clean_text'])]

In [None]:
# Attach the predictions, then export the two submission columns with
# `id_original` renamed to `id` (row index omitted from the file).
test_data['target'] = predicted
(
    test_data[['id_original', 'target']]
    .rename(columns={'id_original': 'id'})
    .to_csv('~/Documents/Datos/DataSets/TP2/res_NB_1.csv', index=False)
)