# <span style="color:#9c8f8f"> 75.06/95.58 Organización de Datos</span>
# <span style="color:#9c8f8f"> Análisis exploratorio: Real or Not? NLP with Disaster Tweets</span>

# <center>FEATURE ENGINEERING</center>
# <center>Decision trees</center>

In [1]:
# Cargo librerias

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import tree

In [2]:
# Cargo train and test

dtype_train = {"id": np.int32, "keyword": "category", "target" : int}
train = pd.read_csv("original_data/train.csv", dtype = dtype_train, encoding='UTF_8')

In [3]:
# Creo dataframe con target

target_train = train[["id","target"]]

In [4]:
DIR_CSV = "features/"

# I) Understanding decision trees

* In the decision tree chart, each internal node has a decision rule that splits the data.

* Gini referred as Gini ratio, which measures the impurity of the node. You can say a node is pure when all of its records belong to the same class, such nodes known as the leaf node.

* There is no need to normalize columns.

# II) Auxiliary Functions

In [5]:
dtype_train = {"id": np.int32, "keyword": "category", "target" : int}
train = pd.read_csv("original_data/train.csv", dtype = dtype_train, encoding='UTF_8')

target_train = train[["id","target"]]
    
def get_clean_values(df):
    
    if("target" in list(df.columns)):
        del df["target"]
    df = df.merge(target_train, how="inner", on="id")   
    
    X = df.copy().fillna(0)
    y = df["target"].copy()
    
    if("target" in list(X.columns)):
        del X['target']
    if("id" in list(X.columns)):
        del X['id']
    if("Unnamed: 0" in list(X.columns)):
        del X["Unnamed: 0"]
    
    return X,y

In [6]:
def decision_regressor_feature_search(X,y,m):
    
    # Model creation
    
    clf = tree.DecisionTreeRegressor()     # DecisionTreeClassifier
    clf = clf.fit(X, y)
    
    # Feature importance
    feature_importances = clf.feature_importances_

    # get index of m most important features sorted
    m_index = np.argpartition(feature_importances, -m)[-m:]
    m_index = m_index[np.argsort(feature_importances[m_index])][::-1]
    
    # Get the m most important features of locations
    m_feature_importances = feature_importances[m_index]
    
    # Get m feature name
    m_feature_name = X.columns[m_index]
    
    return feature_importances, m_index, m_feature_importances, m_feature_name

In [7]:
# m cantidad de features con más relevancia

def decision_regressor_files(previous_dir, files, m_value):
    
    # contiene una lista por archivo con feature_importances, m_index, m_feature_importances, m_feature_name
    locations_others_info = []
    
    for i in range(len(files)):
        
        # cargo archivo
        df = pd.read_csv(previous_dir + files[i], low_memory=False)

        # Separo mi data
        X, y = get_clean_values(df)
        
        # Verifico m
        m = m_value if (m_value < len(X.columns)) else len(X.columns)

        # Aplico desition tree
        feature_importances, m_index, m_feature_importances, m_feature_name = decision_regressor_feature_search(X,y,m)
        locations_others_info.append([feature_importances, m_index, m_feature_importances, m_feature_name])

        # Imprimo informacion
        print_feature_search(m_feature_name, m_feature_importances, files[i])
    
    return locations_others_info

In [8]:
def plot_decision_tree(X,y,depth):
    
    clf = tree.DecisionTreeClassifier(max_depth=depth)
    clf= clf.fit(X, y)

    plt.figure(1, figsize=(10,10))
    tree.plot_tree(clf, filled=True)
    
    plt.show()

In [9]:
def print_feature_search(feature, importance, file):
    
    print("{}\n".format(file.upper()))
    print("{: >20} {: >20}\n".format("feature","importance"))
    for i in range(len(feature)):
        print("{: >20}\t\t{: >20}".format(feature[i],importance[i]))
    print("\n\n")

# III) Desition trees

In [10]:
m = 40

## 1. Keywords

In [11]:
files_keywords = ["features_keywords_in_text_encoded.csv", "features_keywords_numerical.csv"]
DT_keywords = decision_regressor_files(DIR_CSV + 'train/' ,files_keywords, m)

FEATURES_KEYWORDS_IN_TEXT_ENCODED.CSV

             feature           importance

              K_fire		 0.05661728198438555
              K_bomb		 0.05199125994731499
             K_storm		 0.03666297992258409
           K_typhoon		 0.02589693991662052
            K_debris		0.024404951233193688
            K_derail		0.022208328518238923
            K_police		 0.02138398189071544
             K_flood		 0.02096582029845426
          K_wreckage		0.020289812809413856
          K_outbreak		0.020022068256730207
         K_oil spill		0.018819244115054035
             K_crash		 0.01876575699303485
            K_attack		0.017740656876277332
        K_earthquake		0.016631669122568037
           K_rescuer		  0.0166026677309317
          K_disaster		0.015612747053930818
          K_accident		0.015436585918118919
         K_terrorism		0.015395538287998126
          K_hellfire		0.013377229786418357
       K_mass murder		 0.01332395741314208
   K_bridge collapse		0.011366075265816006
         K_coll

## 2. Text 

In [12]:
files_text = [
    'features_tagged.csv',
    'features_tagged_smooth.csv',
    'features_hashtags.csv',
    'features_hashtags_smooth.csv',
    'features_text_numerical.csv',
    'features_arrobas_numerical.csv',
    'features_hashtags_numerical.csv'
]

DT_text = decision_regressor_files(DIR_CSV + 'train/', files_text, m)

FEATURES_TAGGED.CSV

             feature           importance

             youtube		0.010582048365137325
        arianagrande		0.004724382605659409
             foxnews		0.004310331898460457
                  ap		0.0037081453828599134
           djicemoon		0.003000759840098269
       unsuckdcmetro		0.0029690335847644086
              usagov		0.0029658657708572264
       mikeparractor		0.002569316874459993
       mnpdnashville		0.0022333035430166432
                 smh		0.0022315109247883626
       nasahurricane		0.0022297204640301586
           sharethis		0.0022279321572820907
             nbcnews		0.002226146001089442
             invalid		0.0021329336559973823
            usatoday		0.002064692404274339
             falphil		0.0017234574851221875
       worldnetdaily		0.0017157554688560113
         lonewolffur		0.0017139135413533361
      local_arsonist		0.0017120745783260683
                 gop		0.0017102385734165282
       raynbowaffair		0.0017084055202827094
         barackobam

## 3. Locations

In [13]:
files_location = [
    #'features_location_bow.csv',
    #'features_location_tf_idf.csv',
    'features_location_numerical.csv',
    'features_location_in_text_numerical.csv',
    'features_location_in_text_numerical.csv'
]

DT_location = decision_regressor_files(DIR_CSV + 'train/' , files_location, 10)

FEATURES_LOCATION_NUMERICAL.CSV

             feature           importance

       has_continent		  0.4201219975712325
            has_city		 0.36387304531891634
          has_county		 0.09863419218255298
           has_state		 0.08259389943508827
         has_country		0.034776865492209985



FEATURES_LOCATION_IN_TEXT_NUMERICAL.CSV

             feature           importance

       has_continent		  0.9667359633952932
            has_city		 0.01987512462093934
           has_state		0.010282301066125493
         has_country		 0.00289749217735423
          has_county		0.00020911874028778352



FEATURES_LOCATION_IN_TEXT_NUMERICAL.CSV

             feature           importance

       has_continent		  0.9667359633952932
            has_city		 0.01987512462093934
           has_state		0.010282301066125493
         has_country		 0.00289749217735423
          has_county		0.00020911874028778352





In [14]:
files_location_places = ['features_city_encoded.csv',
    'features_city_in_text_encoded.csv',
    'features_country_encoded.csv',
    'features_country_in_text_encoded.csv',
    'features_county_encoded.csv',
    'features_county_in_text_encoded.csv',
    'features_state_encoded.csv',
    'features_state_in_text_encoded.csv',
    'features_continent_encoded.csv',
    'features_continent_in_text_encoded.csv']

DT_location_places = decision_regressor_files(DIR_CSV + 'train/' , files_location_places, m)

FEATURES_CITY_ENCODED.CSV

             feature           importance

         city_Mumbai		0.024130735826127764
     city_Washington		 0.02394322232816508
           city_York		  0.0215470796853544
     city_Sacramento		0.012510485874967319
        city_Karachi		0.009170525971629378
       city_Coventry		0.007721289501625106
         city_Oregon		0.007579538694094622
        city_Calgary		0.006908095728359464
     city_Birmingham		0.006899548099176786
         city_Dundee		0.0068975435654399266
        city_Concord		0.006891929720974994
        city_Jakarta		0.006888352077922537
      city_Nashville		0.006412119674766605
       city_Oklahoma		0.0063961588328879425
        city_Orlando		0.005722531824631754
        city_Memphis		0.0052113709771334105
      city_Edinburgh		0.005130112091634571
     city_Pittsburgh		 0.00512451000362813
          city_Dubai		 0.00486434183394351
      city_Blackpool		0.004630410725532538
        city_Bandung		0.0046278703822451184
 city_Corpus Christi		0

FEATURES_STATE_IN_TEXT_ENCODED.CSV

             feature           importance

    state_California		  0.4961074525511288
        state_Hawaii		 0.05711911206584368
       state_Alabama		0.056061722932534634
      state_Oklahoma		 0.05112350262978203
            state_AL		 0.03666658891180527
    state_Washington		 0.03411316507677217
      state_Maryland		0.026302154670002218
            state_CA		 0.02111266832404009
       state_Georgia		0.021089926793594813
            state_NC		  0.0210672219874664
      state_Colorado		  0.0182850946435405
       state_Vermont		 0.01589419527449522
      state_Illinois		0.015881313311553104
      state_Michigan		0.011498056412031546
     state_Tennessee		0.011482512196246361
       state_Arizona		0.007996262805361804
        state_Nevada		0.005324617225432336
         state_Maine		0.0053231728697533485
            state_TX		0.005321729101681555
        state_Kansas		0.005320285920910415
            state_NY		0.005318843327112242
  state_South Dak

## 4. Links

In [15]:
files_links = ['features_links_numerical.csv', "features_domain_bow.csv", "features_domain_tf_idf.csv", "features_domain_smooth_tf_idf.csv"]
DT_links = decision_regressor_files(DIR_CSV + 'train/' , files_links, m)

FEATURES_LINKS_NUMERICAL.CSV

             feature           importance

          links_cant		  0.8897528194199258
   cant_failed_links		  0.1102471805800743
                id.1		                 0.0



FEATURES_DOMAIN_BOW.CSV

             feature           importance

         domain_ebay		  0.0512237262884467
          domain_bbc		0.028344309024258493
         domain_news		0.027502150165272007
      domain_youtube		0.018201937085652382
      domain_twitter		0.016257405836360622
          domain_cue		 0.01576063041811373
      domain_latimes		0.015562245113309775
  domain_feedsportal		0.012090302911725921
      domain_abcnews		 0.01200395749289994
          domain_gov		0.011884700269472852
        domain_wired		0.010961997440096568
    domain_billboard		0.009887723677175764
       domain_amazon		0.009863330756510488
    domain_instagram		0.009198231569809102
    domain_careerarc		  0.0088760208828333
       domain_tumblr		0.008671780796415746
           domain_co		 0.00835978290945

## 5. Tags

In [16]:
files_tags = ["features_tags_bow.csv", "features_tags_tf_idf.csv"]
DT_tags = decision_regressor_files(DIR_CSV + 'train/' , files_tags, m)

FEATURES_TAGS_BOW.CSV

             feature           importance

            tag_fire		 0.02531172565654727
       tag_hiroshima		0.022525718673642364
         tag_suicide		0.022401693126487286
              tag_mh		0.018473834147994972
           tag_train		0.018053663449641825
        tag_wildfire		0.017683919457758285
          tag_killed		0.016392142359738293
         tag_typhoon		0.014789121951377274
         tag_migrant		 0.01242132183260013
           tag_storm		0.012337473646370202
           tag_spill		0.011867982649404833
     tag_legionnaire		0.011732570408719679
          tag_police		0.010948088241515828
        tag_accident		0.010638078354814838
          tag_people		0.010537487239784971
           tag_japan		0.009449557657171533
      tag_earthquake		0.009228277484254147
         tag_israeli		0.008688675542163073
         tag_airport		 0.00799500899293645
        tag_collapse		0.007834305833797671
            tag_love		0.007456407353079425
        tag_building		0.0068145

# IV) Conclusion

In [17]:
list_of_files = [
    files_keywords,
    files_text,
    files_location,
    files_location_places,
    files_links,
    files_tags,
]

In [18]:
# Information of desition trees for m or less features

DT = [
    DT_keywords,
    DT_text,
    DT_location,
    DT_location_places,
    DT_links,
    DT_tags,
]

In [19]:
# Function that filters those features with lesser importance than min_importance

def filter_importance(DT, min_importance):
    feature_importances, m_index, m_feature_importances, m_feature_name = DT
    
    res = []
    for i in range(len(m_feature_importances)):
        if(m_feature_importances[i]>= min_importance):
            res.append(m_feature_name[i])
    return res

In [20]:
# Create a dataframe with all the m features for a csv filtered 

min_importance = 0.00001

df_total = []

for i in range(len(list_of_files)):
    
    for j in range(len(list_of_files[i])):
        
        df = pd.read_csv(DIR_CSV + 'train/'+ list_of_files[i][j] ,header=0)
        
        cols = list(filter_importance(DT[i][j], min_importance))
        
        cols.append("id")
                               
        df_total.append(df[cols])

In [21]:
for df in df_total:
    print(list(df.columns),'\n')

['K_fire', 'K_bomb', 'K_storm', 'K_typhoon', 'K_debris', 'K_derail', 'K_police', 'K_flood', 'K_wreckage', 'K_outbreak', 'K_oil spill', 'K_crash', 'K_attack', 'K_earthquake', 'K_rescuer', 'K_disaster', 'K_accident', 'K_terrorism', 'K_hellfire', 'K_mass murder', 'K_bridge collapse', 'K_collision', 'K_emergency', 'K_evacuation', 'K_massacre', 'K_death', 'K_drought', 'K_sinkhole', 'K_casualty', 'K_terrorist', 'K_dead', 'K_refugee', 'K_rescue', 'K_burn', 'K_collapse', 'K_heat wave', 'K_evacuate', 'K_hostage', 'K_flame', 'K_hijack', 'id'] 

['keyword_frequency', 'keywords_mean', 'text_contains_keyword', 'keywords_quantity', 'keyword_is_hashtag', 'has_keyword', 'id'] 

['youtube', 'arianagrande', 'foxnews', 'ap', 'djicemoon', 'unsuckdcmetro', 'usagov', 'mikeparractor', 'mnpdnashville', 'smh', 'nasahurricane', 'sharethis', 'nbcnews', 'invalid', 'usatoday', 'falphil', 'worldnetdaily', 'lonewolffur', 'local_arsonist', 'gop', 'raynbowaffair', 'barackobama', 'potus', 'ameenshaikh3', 'newz_sacramen

In [22]:
df_final = train[["id", "target"]]
df_final

Unnamed: 0,id,target
0,1,1
1,4,1
2,5,1
3,6,1
4,7,1
5,8,1
6,10,1
7,13,1
8,14,1
9,15,1


In [23]:
# Mergeo todos los dataframe

for df in df_total:
    df_final = df_final.merge(df, on="id", how="left")

len(df_final.columns)

738

In [24]:
# Create a dataframe with all the m features for a csv filtered 

min_importance = 0.00001

df_total2 = []

for i in range(len(list_of_files)):
    
    for j in range(len(list_of_files[i])):
        
        df = pd.read_csv(DIR_CSV + 'test/'+ list_of_files[i][j] ,header=0)
        
        cols = list(filter_importance(DT[i][j], min_importance))
        
        cols.append("id")
                               
        df_total2.append(df[cols])

In [25]:
df_final2 = pd.read_csv('original_data/test.csv')
df_final2 = df_final2[['id']]
df_final2.head()

Unnamed: 0,id
0,0
1,2
2,3
3,9
4,11


In [26]:
# Mergeo todos los dataframe

for df in df_total2:
    df_final2 = df_final2.merge(df, on="id", how="left")

len(df_final2.columns)

737

In [27]:
df_final2.to_csv('feature_selection_on_test_DT')

In [28]:
X,y = get_clean_values(df_final)
DT_final = decision_regressor_feature_search(X,y,m)

In [29]:
print_feature_search(DT_final[-2],DT_final[-1],"{} features finales más importantes".format(m))

40 FEATURES FINALES MÁS IMPORTANTES

             feature           importance

 0.08537358541619447		     has_continent_y
 0.08479022467225215		   promedio_len_word
  0.0638635158332894		   keyword_frequency
0.057470878666624856		       keywords_mean
 0.05113812417689147		         #caracteres
0.045948649389285556		            #silabas
0.036801991773757564		         #capitalize
 0.03141921805784661		          links_cant
0.028236316109474947		          #stopwords
0.026191544078244974		    #palabras_unicas
0.018400802105089823		#caracteres_especiales
0.018201084423513005		           #palabras
0.017769144157918925		         #mayusculas
0.015965944747483904		       #tagged_users
0.015230171000291223		         #puntuacion
0.012540556884883406		              K_bomb
0.012416206611171338		           #hashtags
0.011247605063285114		             K_storm
0.008932871814669747		text_contains_keyword
0.008482620168368631		     #silabas_binned
0.0077213106376874535		  #capitalize_binned
0.00742990335

In [30]:
# m = 30 y min_importance = 0.00001
#df_final.to_csv('features/feature_selection/features_decition_tree1.csv', encoding='utf-8')

In [31]:
# m = 40 y min_importance = 0.00001
#df_final.to_csv('features/feature_selection/features_decition_tree2.csv', encoding='utf-8')