In [35]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import lightgbm as lgb
import re
import unidecode
import nltk
from nltk.tokenize import TweetTokenizer
import string

stopwords = nltk.corpus.stopwords.words('english')

import warnings
warnings.filterwarnings('ignore')

In [17]:
train = pd.read_csv("../data/train_binary.csv")
test = pd.read_csv("../data/test_binary.csv")

In [18]:
train.head()

Unnamed: 0,id_original,keyword_original,location_original,text_original,target_label,special_chars_count,hashtags,labels,hashtags_count,labels_count,...,words_count,keyword_original_0,keyword_original_1,keyword_original_2,keyword_original_3,keyword_original_4,keyword_original_5,keyword_original_6,keyword_original_7,keyword_original_8
0,1,,,our deeds are the reason of this earthquake ma...,1,1,#earthquake,,1,0,...,13,0,0,0,0,0,0,0,0,1
1,4,,,forest fire near la ronge sask canada,1,1,,,0,0,...,7,0,0,0,0,0,0,0,0,1
2,5,,,all residents asked to shelter in place are be...,1,3,,,0,0,...,22,0,0,0,0,0,0,0,0,1
3,6,,,people receive wildfires evacuation orders in ...,1,2,#wildfires,,1,0,...,7,0,0,0,0,0,0,0,0,1
4,7,,,just got sent this photo from ruby alaska as s...,1,2,#alaska #wildfires,,2,0,...,16,0,0,0,0,0,0,0,0,1


In [19]:
test.head()

Unnamed: 0,id_original,keyword_original,location_original,text_original,special_chars_count,hashtags,labels,hashtags_count,labels_count,num_chars_count,...,words_count,keyword_original_0,keyword_original_1,keyword_original_2,keyword_original_3,keyword_original_4,keyword_original_5,keyword_original_6,keyword_original_7,keyword_original_8
0,0,,,just happened terrible car crash,0,,,0,0,0,...,6,0,0,0,0,0,0,0,0,1
1,2,,,heard about earthquake is different cities sta...,3,#earthquake,,1,0,0,...,9,0,0,0,0,0,0,0,0,1
2,3,,,there is forest fire at spot pond geese are f...,2,,,0,0,0,...,19,0,0,0,0,0,0,0,0,1
3,9,,,apocalypse lighting spokane wildfires,3,#spokane #wildfires,,2,0,0,...,4,0,0,0,0,0,0,0,0,1
4,11,,,typhoon soudelor kills in china and taiwan,0,,,0,0,2,...,7,0,0,0,0,0,0,0,0,1


In [11]:
test.loc[:,["text_original","clean_text"]]

Unnamed: 0,text_original,clean_text
0,just happened terrible car crash,happened terrible car crash
1,heard about earthquake is different cities sta...,heard earthquake different cities stay safe ev...
2,there is forest fire at spot pond geese are f...,forest fire spot pond geese fleeing across str...
3,apocalypse lighting spokane wildfires,apocalypse lighting spokane wildfires
4,typhoon soudelor kills in china and taiwan,typhoon soudelor kills china taiwan
...,...,...
3258,earthquake safety los angeles uo safety fasten...,earthquake safety los angeles uo safety fasten...
3259,storm in ri worse than last hurricane my city ...,storm ri worse last hurricane city others hard...
3260,green line derailment in chicago,green line derailment chicago
3261,meg issues hazardous weather outlook hwo,meg issues hazardous weather outlook hwo


In [21]:
test.info() #quedo un nan en clean_text

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 27 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id_original          3263 non-null   int64  
 1   keyword_original     3237 non-null   object 
 2   location_original    2158 non-null   object 
 3   text_original        3263 non-null   object 
 4   special_chars_count  3263 non-null   int64  
 5   hashtags             773 non-null    object 
 6   labels               904 non-null    object 
 7   hashtags_count       3263 non-null   int64  
 8   labels_count         3263 non-null   int64  
 9   num_chars_count      3263 non-null   int64  
 10  links_count          3263 non-null   int64  
 11  clean_text           3262 non-null   object 
 12  text_length          3263 non-null   int64  
 13  mean_word_length     3263 non-null   float64
 14  vowels_count         3263 non-null   int64  
 15  short_words_count    3263 non-null   i

In [23]:
def remove_punctuation(word):        
    clean_word = ''.join([char for char in word if char not in string.punctuation])
    return clean_word

def cleaning_text(text):
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    text_tokenize = tokenizer.tokenize(text)
    wordlist = []
    for word in text_tokenize:
        word = word.lower()
        word = re.sub('(?P<url>https?://[^\s]+)', ' ', word)
        word = remove_punctuation(word)
        word = re.sub(r'[^\w]', ' ', word)
        word = unidecode.unidecode(word)
        word = re.sub(r'[0-9]','', word)
        if((word != '')&(word != ' ')&(word not in stopwords)):
            wordlist.append(word)
    clean_text = ' '.join(wordlist)
    return clean_text

test["clean_text"] = test["text_original"].apply(lambda x: cleaning_text(x))

In [8]:
# feature selection
def select_features(X_train, y_train, X_test, function, k):
    fs = SelectKBest(score_func=function, k=k)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs

### Bag of Words

In [54]:
vectorizer = CountVectorizer(stop_words='english') #con binary=True me dió 0.726386
vectorizer.fit(train["clean_text"])

vec_train = vectorizer.fit_transform(train["clean_text"])
feature_words = vectorizer.get_feature_names()
print(len(feature_words))
df_bow_train = pd.DataFrame(vec_train.toarray(), columns=feature_words)
df_bow_train.head()

14190


Unnamed: 0,aa,aaa,aaaand,aaalll,aaarrrgghhh,aaemiddleaged,aal,aan,aannnd,aar,...,zones,zonewolf,zoom,zotar,zouma,zrnf,zss,zumiez,zurich,zzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
vec_test = vectorizer.transform(test["clean_text"])
df_bow_test = pd.DataFrame(vec_test.toarray(), columns=feature_words)
df_bow_test.head()

Unnamed: 0,aa,aaa,aaaand,aaalll,aaarrrgghhh,aaemiddleaged,aal,aan,aannnd,aar,...,zones,zonewolf,zoom,zotar,zouma,zrnf,zss,zumiez,zurich,zzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
filter_bow_train = df_bow_train.loc[:,(df_bow_train.sum()>1)]
filter_bow_train.head()

Unnamed: 0,aa,aba,abandon,abandoned,abbott,abbswinston,abc,abcnews,abe,abia,...,zero,zimbabwe,zionism,zionist,zippednews,zombie,zone,zones,zouma,zss
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
filter_list = filter_bow_train.columns.tolist()
filter_bow_test = df_bow_test.filter(items=filter_list)
filter_bow_test.shape

(3263, 6049)

In [58]:
train_final_1 = pd.concat([train,filter_bow_train], axis="columns")
print(train_final_1.shape)
train_final_2 = pd.concat([train,df_bow_train], axis="columns")
print(train_final_2.shape)
test_final_1 = pd.concat([test,filter_bow_test], axis="columns")
print(test_final_1.shape)
test_final_2 = pd.concat([test,df_bow_test], axis="columns")
print(test_final_2.shape)

(7613, 6077)
(7613, 14218)
(3263, 6076)
(3263, 14217)


#### Con las palabras con frecuencia mayor a 1

In [59]:
X = train_final_1.drop(["id_original","keyword_original","location_original","text_original","target_label",
                      "clean_text", "hashtags", "labels"], axis=1)
y = train_final_1["target_label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

print(X_train.shape)
print(X_test.shape)

(5709, 6069)
(1904, 6069)


In [60]:
model = lgb.LGBMClassifier(n_estimators=350, num_leaves=14, max_depth=7, colsample_bytree=0.7, learning_rate=0.1, subsample=1, min_child_samples=10)
model.fit(X_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
               importance_type='split', learning_rate=0.1, max_depth=7,
               min_child_samples=10, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=350, n_jobs=-1, num_leaves=14, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1, subsample_for_bin=200000, subsample_freq=0)

In [61]:
# con binary=true 0.802521
y_test_hat = model.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.807773


In [62]:
#b=true 0.780870
kfold = KFold(n_splits=4)
resultados = cross_val_score(model, X_train, y_train, cv=kfold)
print("Accuracy: %f" % (resultados.mean()))

Accuracy: 0.782971


In [37]:
kfold = KFold(n_splits=6)
resultados = cross_val_score(model, X_train, y_train, cv=kfold)
print("Accuracy: %f" % (resultados.mean()))

Accuracy: 0.784199


In [63]:
#reduciendo a 3000 features
df_feat_importances = pd.DataFrame(model.feature_importances_, index=X_train.columns, columns=["importancia"]).\
        sort_values(by="importancia",ascending=False)
df_feat_importances.head()

Unnamed: 0,importancia
mean_word_length,269
text_length,202
special_chars_count,168
vowels_count,137
num_chars_count,135


In [64]:
list_fi = df_feat_importances.index[:3000].tolist()
X_fi = X.filter(items=list_fi)
print(X_fi.shape)
print(X_fi.columns)

(7613, 3000)
Index(['mean_word_length', 'text_length', 'special_chars_count',
       'vowels_count', 'num_chars_count', 'words_count', 'stopwords_count',
       'short_words_count', 'keyword_original_7', 'labels_count',
       ...
       'skinny', 'skin', 'skills', 'skill', 'skies', 'sketch', 'skanndtyagi',
       'sj', 'size', 'skirt'],
      dtype='object', length=3000)


In [65]:
X_train, X_test, y_train, y_test = train_test_split(X_fi, y, test_size=0.25, random_state=100)

In [66]:
model = lgb.LGBMClassifier(n_estimators=500, num_leaves=31, max_depth=5, colsample_bytree=0.7, learning_rate=0.1, subsample=1, min_child_samples=10,
                           min_child_weight=0.001)
model.fit(X_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
               importance_type='split', learning_rate=0.1, max_depth=5,
               min_child_samples=10, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=500, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1, subsample_for_bin=200000, subsample_freq=0)

In [67]:
# b=true 0.806197
y_test_hat = model.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.803571


In [46]:
kfold = KFold(n_splits=4)
resultados = cross_val_score(model, X_train, y_train, cv=kfold)
print("Accuracy: %f" % (resultados.mean()))

Accuracy: 0.787351


In [47]:
kfold = KFold(n_splits=6)
resultados = cross_val_score(model, X_train, y_train, cv=kfold)
print("Accuracy: %f" % (resultados.mean()))

Accuracy: 0.788227


In [68]:
X_train = train_final_1.drop(["id_original","keyword_original","location_original","text_original","target_label","hashtags","labels","clean_text"], axis=1)
X_test = test_final_1.drop(["id_original","keyword_original","location_original","text_original","hashtags","labels","clean_text"], axis=1)
y_train = train_final_1["target_label"]

In [69]:
X_train_fi = X_train.filter(items=list_fi)
X_test_fi = X_test.filter(items=list_fi)

In [70]:
print(X_train_fi.shape)
print(X_test_fi.shape)

(7613, 3000)
(3263, 3000)


In [71]:
model = lgb.LGBMClassifier(n_estimators=500, num_leaves=31, max_depth=5, colsample_bytree=0.7, learning_rate=0.1, subsample=1, min_child_samples=10,
                           min_child_weight=0.001)
model.fit(X_train_fi, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
               importance_type='split', learning_rate=0.1, max_depth=5,
               min_child_samples=10, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=500, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1, subsample_for_bin=200000, subsample_freq=0)

In [72]:
# con binary=true 0.726386
kfold = KFold(n_splits=4)
resultados = cross_val_score(model, X_train_fi, y_train, cv=kfold)
print("Accuracy: %f" % (resultados.mean()))

Accuracy: 0.722445


In [74]:
#reduciendo aun mas las features
df_feat_importances2 = pd.DataFrame(model.feature_importances_, index=X_train_fi.columns, columns=["importancia"]).\
        sort_values(by="importancia",ascending=False)
df_feat_importances2.head()

Unnamed: 0,importancia
mean_word_length,466
text_length,338
special_chars_count,296
num_chars_count,282
vowels_count,241


In [75]:
df_feat_importances2["importancia"].value_counts()

0      2545
1        65
3        58
4        50
2        48
6        33
7        29
8        27
5        22
9        19
10       15
12       14
11       13
13        8
14        6
20        6
17        4
15        3
19        3
30        2
27        2
29        2
49        2
16        2
18        1
22        1
24        1
26        1
28        1
241       1
32        1
42        1
46        1
56        1
192       1
282       1
223       1
338       1
466       1
23        1
25        1
43        1
57        1
85        1
155       1
296       1
Name: importancia, dtype: int64

In [77]:
#nos quedamos con 455
list_fi = df_feat_importances2.index[:455].tolist()
X_fi = X.filter(items=list_fi)
print(X_fi.shape)
print(X_fi.columns)

(7613, 455)
Index(['mean_word_length', 'text_length', 'special_chars_count',
       'vowels_count', 'num_chars_count', 'words_count', 'stopwords_count',
       'short_words_count', 'keyword_original_7', 'labels_count',
       ...
       'island', 'sick', 'signs', 'offensive', 'costlier', 'water', 'escape',
       'looking', 'makes', 'version'],
      dtype='object', length=455)


In [78]:
X_train, X_test, y_train, y_test = train_test_split(X_fi, y, test_size=0.25, random_state=100)

In [79]:
model = lgb.LGBMClassifier(n_estimators=500, num_leaves=31, max_depth=5, colsample_bytree=0.7, learning_rate=0.1, 
                           subsample=1, min_child_samples=10)
model.fit(X_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
               importance_type='split', learning_rate=0.1, max_depth=5,
               min_child_samples=10, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=500, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1, subsample_for_bin=200000, subsample_freq=0)

In [80]:
y_test_hat = model.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.805672


In [81]:
hiper_parametros = {"n_estimators":[200,300,500],
                   "max_depth":[5,7,9,11],
                   "colsample_bytree":[0.5,0.7],
                   "num_leaves":[7,14,31],
                   "learning_rate":[0.1],
                   "min_child_samples":[10],
                    "n_jobs": [1]}

In [82]:
clasif = GridSearchCV(model, hiper_parametros, cv=4, scoring='accuracy')
clasif.fit(X_train, y_train)

GridSearchCV(cv=4, error_score=nan,
             estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                      colsample_bytree=0.7,
                                      importance_type='split',
                                      learning_rate=0.1, max_depth=5,
                                      min_child_samples=10,
                                      min_child_weight=0.001,
                                      min_split_gain=0.0, n_estimators=500,
                                      n_jobs=-1, num_leaves=31, objective=None,
                                      random_state=None, reg_alpha=0.0,
                                      reg_lambda=0.0, silent=True, subsample=1,
                                      subsample_for_bin=200000,
                                      subsample_freq=0),
             iid='deprecated', n_jobs=None,
             param_grid={'colsample_bytree': [0.5, 0.7], 'learning_rate': [0.1],
                

In [83]:
print(clasif.best_estimator_)
print(clasif.best_score_)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
               importance_type='split', learning_rate=0.1, max_depth=11,
               min_child_samples=10, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=500, n_jobs=1, num_leaves=14, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1, subsample_for_bin=200000, subsample_freq=0)
0.7899801301038986


In [84]:
df_feat_importances.shape

(6069, 1)

In [85]:
df_feat_importances["importancia"].value_counts()

0      5606
1        75
2        60
5        51
4        50
3        47
7        27
6        25
8        18
10       17
9        14
11       14
14        9
12        8
13        7
17        4
21        3
25        3
20        3
19        3
27        3
24        2
39        1
43        1
168       1
124       1
44        1
28        1
269       1
97        1
137       1
35        1
47        1
18        1
22        1
30        1
42        1
118       1
202       1
15        1
23        1
135       1
Name: importancia, dtype: int64

In [88]:
list_fi = df_feat_importances.index[:463].tolist()
X_fi = X.filter(items=list_fi)
print(X_fi.shape)
print(X_fi.columns)

(7613, 463)
Index(['mean_word_length', 'text_length', 'special_chars_count',
       'vowels_count', 'num_chars_count', 'words_count', 'stopwords_count',
       'short_words_count', 'keyword_original_7', 'labels_count',
       ...
       'makes', 'version', 'likely', 'rescuers', 'emmerdale', 'engulfed',
       'reuters', 'right', 'level', 'rly'],
      dtype='object', length=463)


In [89]:
X_train, X_test, y_train, y_test = train_test_split(X_fi, y, test_size=0.25, random_state=100)

In [90]:
model = lgb.LGBMClassifier(n_estimators=500, num_leaves=31, max_depth=5, colsample_bytree=0.7, learning_rate=0.1, 
                           subsample=1, min_child_samples=10)
model.fit(X_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
               importance_type='split', learning_rate=0.1, max_depth=5,
               min_child_samples=10, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=500, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1, subsample_for_bin=200000, subsample_freq=0)

In [91]:
y_test_hat = model.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.803046


In [92]:
hiper_parametros = {"n_estimators":[300,400,500],
                   "max_depth":[5,7,9,11],
                   "colsample_bytree":[0.5,0.7,0.8],
                   "num_leaves":[7,14,31],
                   "learning_rate":[0.1],
                   "min_child_samples":[10],
                    "n_jobs": [1]}

In [93]:
clasif = GridSearchCV(model, hiper_parametros, cv=4, scoring='accuracy')
clasif.fit(X_train, y_train)

GridSearchCV(cv=4, error_score=nan,
             estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                      colsample_bytree=0.7,
                                      importance_type='split',
                                      learning_rate=0.1, max_depth=5,
                                      min_child_samples=10,
                                      min_child_weight=0.001,
                                      min_split_gain=0.0, n_estimators=500,
                                      n_jobs=-1, num_leaves=31, objective=None,
                                      random_state=None, reg_alpha=0.0,
                                      reg_lambda=0.0, silent=True...ample=1,
                                      subsample_for_bin=200000,
                                      subsample_freq=0),
             iid='deprecated', n_jobs=None,
             param_grid={'colsample_bytree': [0.5, 0.7, 0.8],
                         'learning_rat

In [94]:
print(clasif.best_estimator_)
print(clasif.best_score_)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.5,
               importance_type='split', learning_rate=0.1, max_depth=11,
               min_child_samples=10, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=400, n_jobs=1, num_leaves=14, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1, subsample_for_bin=200000, subsample_freq=0)
0.7926067939439265


In [96]:
X_train = train_final_1.drop(["id_original","keyword_original","location_original","text_original","target_label","hashtags","labels","clean_text"], axis=1)
X_test = test_final_1.drop(["id_original","keyword_original","location_original","text_original","hashtags","labels","clean_text"], axis=1)
y_train = train_final_1["target_label"]

In [97]:
list_fi = df_feat_importances.index[:463].tolist()
X_train_fi = X_train.filter(items=list_fi)
X_test_fi = X_test.filter(items=list_fi)

In [98]:
print(X_train_fi.shape)
print(X_test_fi.shape)

(7613, 463)
(3263, 463)


In [101]:
model = lgb.LGBMClassifier(n_estimators=400, num_leaves=14, max_depth=11, colsample_bytree=0.5, learning_rate=0.1, 
                           subsample=1, min_child_samples=10)
model.fit(X_train_fi, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.5,
               importance_type='split', learning_rate=0.1, max_depth=11,
               min_child_samples=10, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=400, n_jobs=-1, num_leaves=14, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1, subsample_for_bin=200000, subsample_freq=0)

In [102]:
y_pred = model.predict(X_test_fi)
y_pred

array([0, 1, 1, ..., 1, 1, 0], dtype=int64)

In [103]:
test["target"] = y_pred
test[["id_original","target"]]

Unnamed: 0,id_original,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [104]:
test[["id_original","target"]].rename(columns={"id_original":"id"}).to_csv("../data/pred14_LGBM_bowbin_fi", index=False)

#### Con todas las palabras

In [105]:
X = train_final_2.drop(["id_original","keyword_original","location_original","text_original","target_label",
                      "clean_text", "hashtags", "labels"], axis=1)
y = train_final_2["target_label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

print(X_train.shape)
print(X_test.shape)

(5709, 14209)
(1904, 14209)


In [114]:
model = lgb.LGBMClassifier(n_estimators=450, num_leaves=14, max_depth=7, colsample_bytree=0.7, learning_rate=0.1, subsample=1, min_child_samples=10)
model.fit(X_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
               importance_type='split', learning_rate=0.1, max_depth=7,
               min_child_samples=10, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=450, n_jobs=-1, num_leaves=14, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1, subsample_for_bin=200000, subsample_freq=0)

In [115]:
y_test_hat = model.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.808298


In [116]:
df_feat_importances = pd.DataFrame(model.feature_importances_, index=X_train.columns, columns=["importancia"]).\
        sort_values(by="importancia",ascending=False)
df_feat_importances.head()

Unnamed: 0,importancia
mean_word_length,344
text_length,258
special_chars_count,208
num_chars_count,175
vowels_count,167


In [117]:
df_feat_importances["importancia"].value_counts()

0      13667
1         95
2         63
6         48
4         46
3         44
5         42
7         36
8         31
9         18
10        17
13        13
11        13
14        11
12        10
16         7
15         6
30         3
17         3
20         3
19         3
22         3
25         2
49         2
29         2
21         2
26         2
36         2
24         2
56         1
344        1
144        1
127        1
208        1
46         1
54         1
33         1
154        1
258        1
167        1
59         1
175        1
Name: importancia, dtype: int64

In [119]:
# nos quedamos con las features con un valor de importancia mayor a 0
list_fi = df_feat_importances.index[:542].tolist()
X_fi = X.filter(items=list_fi)
print(X_fi.shape)
print(X_fi.columns)

(7613, 542)
Index(['mean_word_length', 'text_length', 'special_chars_count',
       'num_chars_count', 'vowels_count', 'stopwords_count', 'words_count',
       'short_words_count', 'keyword_original_4', 'keyword_original_7',
       ...
       'help', 'terror', 'drowned', 'looks', 'words', 'episode', 'response',
       'causing', 'emergency', 'takes'],
      dtype='object', length=542)


In [120]:
X_train, X_test, y_train, y_test = train_test_split(X_fi, y, test_size=0.25, random_state=100)

In [151]:
model = lgb.LGBMClassifier(n_estimators=500, num_leaves=7, max_depth=5, colsample_bytree=0.7, learning_rate=0.1, 
                           subsample=1, min_child_samples=10)
model.fit(X_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
               importance_type='split', learning_rate=0.1, max_depth=5,
               min_child_samples=10, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=500, n_jobs=-1, num_leaves=7, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1, subsample_for_bin=200000, subsample_freq=0)

In [152]:
y_test_hat = model.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.809874


In [153]:
X_train = train_final_2.drop(["id_original","keyword_original","location_original","text_original","target_label","hashtags","labels","clean_text"], axis=1)
X_test = test_final_2.drop(["id_original","keyword_original","location_original","text_original","hashtags","labels","clean_text"], axis=1)
y_train = train_final_2["target_label"]

In [154]:
list_fi = df_feat_importances.index[:542].tolist()
X_train_fi = X_train.filter(items=list_fi)
X_test_fi = X_test.filter(items=list_fi)

In [157]:
model = lgb.LGBMClassifier(n_estimators=500, num_leaves=7, max_depth=5, colsample_bytree=0.7, learning_rate=0.1, 
                           subsample=1, min_child_samples=10)
model.fit(X_train_fi, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
               importance_type='split', learning_rate=0.1, max_depth=5,
               min_child_samples=10, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=500, n_jobs=-1, num_leaves=7, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1, subsample_for_bin=200000, subsample_freq=0)

In [158]:
y_pred = model.predict(X_test_fi)
y_pred

array([0, 1, 1, ..., 1, 1, 0], dtype=int64)

In [159]:
test["target"] = y_pred
test[["id_original","target"]]

Unnamed: 0,id_original,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [160]:
test[["id_original","target"]].rename(columns={"id_original":"id"}).to_csv("../data/pred15_LGBM_bowbin_fi", index=False)