In [1]:
# General-purpose library imports.
import pandas as pd
import numpy as np
import random
import re

pd.options.display.float_format = '{:20,.2f}'.format # suppress scientific notation in displayed outputs

import warnings
# Silences every warning for the rest of the session, including deprecation
# notices emitted by the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
# Load the pre-computed feature tables for the labelled and unlabelled tweets.
train_data = pd.read_csv('~/Documents/Datos/DataSets/TP2/train_featured.csv')
test_data = pd.read_csv('~/Documents/Datos/DataSets/TP2/test_featured.csv')

# Replace missing `clean_text` values with an empty string so the text
# vectorizers further down receive only strings.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form operates on a temporary object
# under pandas Copy-on-Write and leaves the frame unchanged.
train_data['clean_text'] = train_data['clean_text'].fillna("")
test_data['clean_text'] = test_data['clean_text'].fillna("")

In [3]:
# Tally how many times each whitespace-separated hashtag token occurs in the
# training set's `hashtags` column.
hashtag_dic = {}
# Rows with a missing `hashtags` value come back from str.split() as NaN;
# drop them so only token lists are iterated.
for tags in train_data.hashtags.str.split().dropna():
    for tag in tags:
        hashtag_dic[tag] = hashtag_dic.get(tag, 0) + 1

In [4]:
# Keep only hashtags seen at least 10 times, ordered from most to least
# frequent (ties keep their original insertion order because sorted() is stable).
hashtag_dic = {
    tag: count
    for tag, count in sorted(hashtag_dic.items(), key=lambda pair: pair[1], reverse=True)
    if count >= 10
}
hashtag_dic

{'#news': 65,
 '#hot': 30,
 '#prebreak': 30,
 '#best': 30,
 '#???': 23,
 '#nowplaying': 21,
 '#hiroshima': 21,
 '#earthquake': 19,
 '#??': 19,
 '#gbbo': 17,
 '#jobs': 14,
 '#world': 11,
 '#islam': 11,
 '#japan': 10,
 '#job': 10,
 '#india': 10,
 '#sismo': 10}

In [5]:
# Add one indicator column per retained hashtag to both frames, initialised to 0.
for frame in (train_data, test_data):
    for tag in hashtag_dic:
        frame[tag] = 0

In [6]:
def one_hot_hashtags(row):
    """Set the indicator entry to 1 for every hashtag of `row` that is also a key of `row`.

    `row.hashtags` is expected to be a whitespace-separated string of hashtag
    tokens. When it has no `split` method (e.g. a NaN float for a missing
    value) the row is returned untouched. Tokens that are not keys of `row`
    are ignored.

    Returns the same `row` object, modified in place.
    """
    try:
        tags = row.hashtags.split()
    except AttributeError:
        # No usable hashtag string on this row: nothing to flag.
        return row

    for tag in tags:
        if tag not in row:
            continue
        row[tag] = 1
    return row

In [7]:
# Apply the hashtag flagging row by row over the training frame, then preview it.
train_data = train_data.apply(one_hot_hashtags, axis="columns")
train_data.head()

Unnamed: 0,id_original,keyword_original,location_original,text_original,target_label,special_chars_count,hashtags,labels,hashtags_count,labels_count,...,#earthquake,#??,#gbbo,#jobs,#world,#islam,#japan,#job,#india,#sismo
0,1,,,our deeds are the reason of this earthquake ma...,1,1,#earthquake,,1,0,...,1,0,0,0,0,0,0,0,0,0
1,4,,,forest fire near la ronge sask canada,1,1,,,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5,,,all residents asked to shelter in place are be...,1,3,,,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6,,,people receive wildfires evacuation orders in ...,1,2,#wildfires,,1,0,...,0,0,0,0,0,0,0,0,0,0
4,7,,,just got sent this photo from ruby alaska as s...,1,2,#alaska #wildfires,,2,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Apply the hashtag flagging row by row over the test frame, then preview it.
test_data = test_data.apply(one_hot_hashtags, axis="columns")
test_data.head()

Unnamed: 0,id_original,keyword_original,location_original,text_original,special_chars_count,hashtags,labels,hashtags_count,labels_count,num_chars_count,...,#earthquake,#??,#gbbo,#jobs,#world,#islam,#japan,#job,#india,#sismo
0,0,,,just happened terrible car crash,0,,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,,,heard about earthquake is different cities sta...,3,#earthquake,,1,0,0,...,1,0,0,0,0,0,0,0,0,0
2,3,,,there is forest fire at spot pond geese are f...,2,,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,,,apocalypse lighting spokane wildfires,3,#spokane #wildfires,,2,0,0,...,0,0,0,0,0,0,0,0,0,0
4,11,,,typhoon soudelor kills in china and taiwan,0,,,0,0,2,...,0,0,0,0,0,0,0,0,0,0


In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text  import TfidfVectorizer
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder

In [10]:
# Split the labelled data in half: `train` is used to fit the text pipelines,
# `train_test` is the held-out half they are scored on and whose predicted
# probabilities are stored as new columns further down.
# A fixed random_state makes the split, and every score derived from it,
# reproducible across kernel restarts.
train, train_test = train_test_split(train_data, test_size=0.5, random_state=42)

In [11]:
# Text pipeline: token counts -> TF-IDF weighting -> multinomial Naive Bayes.
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)

In [12]:
# Fit the Naive Bayes pipeline on the first half of the labelled data.
text_clf = text_clf.fit(train["clean_text"], train["target_label"])

In [13]:
# Naive Bayes labels and class probabilities for the held-out half.
train_predicted = text_clf.predict(train_test["clean_text"])
train_predicted_proba = text_clf.predict_proba(train_test["clean_text"])
# Accuracy: fraction of held-out rows whose predicted label equals the true one.
(train_predicted == train_test["target_label"]).mean()

0.789598108747045

In [14]:
# Naive Bayes labels and class probabilities for the unlabelled test set.
test_predicted = text_clf.predict(test_data["clean_text"])
test_predicted_proba = text_clf.predict_proba(test_data["clean_text"])

In [15]:
# Store the first predict_proba column of the Naive Bayes model as a feature.
# NOTE(review): column 0 is the probability of the first entry in the
# classifier's classes_, presumably target_label == 0 - confirm.
train_test['NB_target_proba_not'] = train_predicted_proba[:, 0]
test_data['NB_target_proba_not'] = test_predicted_proba[:, 0]

In [16]:
# Text pipeline: token counts -> TF-IDF weighting -> linear model trained with SGD.
# NOTE(review): despite the "svm" naming, loss='log' selects a logistic loss
# (that is what provides predict_proba). Newer scikit-learn releases spell this
# option 'log_loss' - confirm against the installed version.
text_clf_svm = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        (
            'clf-svm',
            SGDClassifier(loss='log', penalty='l2', alpha=1e-3, random_state=42),
        ),
    ]
)

In [17]:
# Fit the SGD pipeline on the same half of the labelled data as the Naive Bayes one.
text_clf_svm = text_clf_svm.fit(train["clean_text"], train["target_label"])

In [18]:
# SGD-model labels and class probabilities for the held-out half
# (these overwrite the Naive Bayes results held in the same variables).
train_predicted = text_clf_svm.predict(train_test["clean_text"])
train_predicted_proba = text_clf_svm.predict_proba(train_test["clean_text"])
# Accuracy: fraction of held-out rows whose predicted label equals the true one.
(train_predicted == train_test["target_label"]).mean()

0.7617546624638823

In [19]:
# SGD-model labels and class probabilities for the unlabelled test set
# (these overwrite the Naive Bayes results held in the same variables).
test_predicted = text_clf_svm.predict(test_data["clean_text"])
test_predicted_proba = text_clf_svm.predict_proba(test_data["clean_text"])

In [20]:
# Store the first predict_proba column of the SGD model as a feature.
# NOTE(review): column 0 is the probability of the first entry in the
# classifier's classes_, presumably target_label == 0 - confirm.
train_test['SVM_target_proba_not'] = train_predicted_proba[:, 0]
test_data['SVM_target_proba_not'] = test_predicted_proba[:, 0]

In [21]:
# Columns removed before fitting the tree model (identifier / raw-text fields,
# judging by their names). The same list is used for both frames.
_dropped_cols = [
    "id_original",
    "keyword_original",
    "location_original",
    "text_original",
    "hashtags",
    "labels",
    "clean_text",
]
# The held-out frame is trimmed in place; the test frame keeps its columns
# (id_original is needed for the submission) and a trimmed copy is created.
train_test.drop(columns=_dropped_cols, inplace=True)
real_test_data = test_data.drop(columns=_dropped_cols)

In [22]:
# Hold out 20% of the stacked feature table to score the XGBoost model.
# A fixed random_state keeps the reported accuracy reproducible.
# NOTE: this rebinds `train`, which earlier cells use for the text-model split,
# so those cells cannot be re-run after this one without re-running the first split.
train, test = train_test_split(train_test, test_size=0.2, random_state=42)

In [23]:
# Gradient-boosted trees on the stacked features; `target_label` is the target
# and every remaining column is used as an input.
model_xgb = xgb.XGBClassifier(
    n_estimators=300,
    colsample_bytree=0.5,
    learning_rate=0.1,
    max_depth=11,
)
model_xgb.fit(train.drop(columns=['target_label']), train['target_label'])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=11,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=300, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [24]:
# Score the fitted model on the 20% validation slice.
test_prediction = model_xgb.predict(test.drop(columns=['target_label']))
print("Accuracy score: %f" % accuracy_score(test['target_label'], test_prediction))

Accuracy score: 0.774278


In [25]:
# Feature importances of the fitted model, labelled with the input column
# names and listed from most to least important.
pd.DataFrame(
    {"importancia": model_xgb.feature_importances_},
    index=train.drop(columns=['target_label']).columns,
).sort_values(by="importancia", ascending=False)

Unnamed: 0,importancia
#jobs,0.14
SVM_target_proba_not,0.12
NB_target_proba_not,0.11
labels_count,0.05
links_count,0.05
mean_word_length,0.05
#hiroshima,0.05
#best,0.05
vowels_count,0.04
words_count,0.04


In [26]:
# Refit a fresh model with the same hyperparameters on the whole stacked table
# (`train_test`), i.e. including the rows previously held out for validation.
model_xgb = xgb.XGBClassifier(
    n_estimators=300,
    colsample_bytree=0.5,
    learning_rate=0.1,
    max_depth=11,
)
model_xgb.fit(train_test.drop(columns=['target_label']), train_test['target_label'])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=11,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=300, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [None]:
# Predict the unlabelled test set and write the (id, target) pairs to CSV.
real_test_prediction = model_xgb.predict(real_test_data)
test_data['target'] = real_test_prediction
(
    test_data[['id_original', 'target']]
    .rename(columns={'id_original': 'id'})
    .to_csv('~/Documents/Datos/DataSets/TP2/res_XGB_1.csv', index=False)
)