In [1]:
# importacion general de librerias y de visualizacion (matplotlib y seaborn)
import pandas as pd
import numpy as np
import random
import re

pd.options.display.float_format = '{:20,.2f}'.format # suprimimos la notacion cientifica en los outputs

import warnings
warnings.filterwarnings('ignore')

In [2]:
train_data = pd.read_csv('~/Documents/Datos/DataSets/TP2/train_featured.csv')
test_data = pd.read_csv('~/Documents/Datos/DataSets/TP2/test_featured.csv')
test_data['clean_text'].fillna("", inplace=True)

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text  import TfidfVectorizer
from sklearn.metrics import accuracy_score
import xgboost as xgb

In [4]:
train, train_test = train_test_split(train_data, test_size=0.8)

In [5]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                    ])

In [6]:
text_clf = text_clf.fit(train.clean_text, train.target_label)

In [7]:
train_predicted = text_clf.predict(train_test.clean_text)
train_predicted_proba = text_clf.predict_proba(train_test.clean_text)
np.mean(train_predicted == train_test.target_label)

0.7716302741750123

In [8]:
test_predicted = text_clf.predict(test_data.clean_text)
test_predicted_proba = text_clf.predict_proba(test_data.clean_text)

In [9]:
train_test['NB_target_proba_not'] = train_predicted_proba.T[0]
test_data['NB_target_proba_not'] = test_predicted_proba.T[0]

In [10]:
text_clf_svm = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='log', penalty='l2',
                                                   alpha=1e-3, random_state=42)),
                        ])

In [11]:
text_clf_svm = text_clf_svm.fit(train.clean_text, train.target_label)

In [12]:
train_predicted = text_clf_svm.predict(train_test.clean_text)
train_predicted_proba = text_clf_svm.predict_proba(train_test.clean_text)
np.mean(train_predicted == train_test.target_label)

0.7391232966672139

In [13]:
test_predicted = text_clf_svm.predict(test_data.clean_text)
test_predicted_proba = text_clf_svm.predict_proba(test_data.clean_text)

In [14]:
train_test['SVM_target_proba_not'] = train_predicted_proba.T[0]
test_data['SVM_target_proba_not'] = test_predicted_proba.T[0]

In [15]:
train_test.drop(["id_original","keyword_original","location_original","text_original","hashtags","labels","clean_text"], axis=1, inplace=True)
real_test_data = test_data.drop(["id_original","keyword_original","location_original","text_original","hashtags","labels","clean_text"], axis=1)

In [16]:
train, test = train_test_split(train_test, test_size=0.2)

In [17]:
model_xgb = xgb.XGBClassifier(n_estimators=300, colsample_bytree=0.5, learning_rate=0.1, max_depth=11)
model_xgb.fit(train.drop(['target_label'], axis=1), train.target_label)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=11,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=300, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [18]:
test_prediction = model_xgb.predict(test.drop(['target_label'], axis=1))
print("Accuracy score: %f" % (accuracy_score(test.target_label, test_prediction)))

Accuracy score: 0.747334


In [19]:
pd.DataFrame(model_xgb.feature_importances_, index=train.drop(['target_label'], axis=1).columns, columns=["importancia"]).\
        sort_values(by="importancia",ascending=False)

Unnamed: 0,importancia
NB_target_proba_not,0.2
SVM_target_proba_not,0.11
links_count,0.11
labels_count,0.08
text_length,0.06
special_chars_count,0.06
mean_word_length,0.06
words_count,0.06
short_words_count,0.06
num_chars_count,0.06


In [None]:
model_xgb = xgb.XGBClassifier(n_estimators=300, colsample_bytree=0.5, learning_rate=0.1, max_depth=11)
model_xgb.fit(train_test.drop(['target_label'], axis=1), train_test.target_label)

In [None]:
real_test_prediction = model_xgb.predict(real_test_data)
test_data['target'] = real_test_prediction
test_data[['id_original', 'target']].rename(columns={'id_original': 'id'}).to_csv('~/Documents/Datos/DataSets/TP2/res_XGB_1.csv', index=False)