# <span style="color:#9c8f8f"> 75.06/95.58 Organización de Datos</span>
# <span style="color:#9c8f8f"> Análisis exploratorio: Real or Not? NLP with Disaster Tweets</span>

# <center>FEATURE ENGINEERING</center>
# <center>Decision trees</center>

In [1]:
# Cargo librerias

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import tree

In [2]:
# Cargo train and test

dtype_train = {"id": np.int32, "keyword": "category", "target" : int}
train = pd.read_csv("original_data/train.csv", dtype = dtype_train, encoding='UTF_8')

In [3]:
# Creo dataframe con target

target_train = train[["id","target"]]

In [4]:
DIR_CSV = "features/"

# I) Understanding decision trees

* In the decision tree chart, each internal node has a decision rule that splits the data.

* Gini referred as Gini ratio, which measures the impurity of the node. You can say a node is pure when all of its records belong to the same class, such nodes known as the leaf node.

* There is no need to normalize columns.

# II) Auxiliary Functions

In [5]:
dtype_train = {"id": np.int32, "keyword": "category", "target" : int}
train = pd.read_csv("original_data/train.csv", dtype = dtype_train, encoding='UTF_8')

target_train = train[["id","target"]]
    
def get_clean_values(df):
    
    if("target" in list(df.columns)):
        del df["target"]
    df = df.merge(target_train, how="inner", on="id")   
    
    X = df.copy().fillna(0)
    y = df["target"].copy()
    
    if("target" in list(X.columns)):
        del X['target']
    if("id" in list(X.columns)):
        del X['id']
    if("Unnamed: 0" in list(X.columns)):
        del X["Unnamed: 0"]
    
    return X,y

In [6]:
def decision_regressor_feature_search(X,y,m):
    
    # Model creation
    
    clf = tree.DecisionTreeRegressor()     # DecisionTreeClassifier
    clf = clf.fit(X, y)
    
    # Feature importance
    feature_importances = clf.feature_importances_

    # get index of m most important features sorted
    m_index = np.argpartition(feature_importances, -m)[-m:]
    m_index = m_index[np.argsort(feature_importances[m_index])][::-1]
    
    # Get the m most important features of locations
    m_feature_importances = feature_importances[m_index]
    
    # Get m feature name
    m_feature_name = X.columns[m_index]
    
    return feature_importances, m_index, m_feature_importances, m_feature_name

In [7]:
# m cantidad de features con más relevancia

def decision_regressor_files(previous_dir, files, m_value):
    
    # contiene una lista por archivo con feature_importances, m_index, m_feature_importances, m_feature_name
    locations_others_info = []
    
    for i in range(len(files)):
        
        # cargo archivo
        df = pd.read_csv(previous_dir + files[i], low_memory=False)

        # Separo mi data
        X, y = get_clean_values(df)
        
        # Verifico m
        m = m_value if (m_value < len(X.columns)) else len(X.columns)

        # Aplico desition tree
        feature_importances, m_index, m_feature_importances, m_feature_name = decision_regressor_feature_search(X,y,m)
        locations_others_info.append([feature_importances, m_index, m_feature_importances, m_feature_name])

        # Imprimo informacion
        print_feature_search(m_feature_name, m_feature_importances, files[i])
    
    return locations_others_info

In [8]:
def plot_decision_tree(X,y,depth):
    
    clf = tree.DecisionTreeClassifier(max_depth=depth)
    clf= clf.fit(X, y)

    plt.figure(1, figsize=(10,10))
    tree.plot_tree(clf, filled=True)
    
    plt.show()

In [9]:
def print_feature_search(feature, importance, file):
    
    print("{}\n".format(file.upper()))
    print("{: >20} {: >20}\n".format("feature","importance"))
    for i in range(len(feature)):
        print("{: >20}\t\t{: >20}".format(feature[i],importance[i]))
    print("\n\n")

# III) Desition trees

In [10]:
m = 40

## 1. Keywords

In [11]:
files_keywords = ["features_keywords_in_text_encoded.csv", "features_keywords_numerical.csv"]
DT_keywords = decision_regressor_files(DIR_CSV + 'train/' ,files_keywords, m)

FEATURES_KEYWORDS_IN_TEXT_ENCODED.CSV

             feature           importance

              K_fire		0.056617281984385544
              K_bomb		 0.05200454841950622
             K_storm		 0.03664746867099285
           K_typhoon		0.025896939916620516
            K_debris		0.024404951233193684
            K_derail		 0.02220832851823892
            K_police		 0.02137739198129037
          K_outbreak		0.020289812809413853
             K_flood		0.020218412437262422
          K_wreckage		0.020022068256730204
             K_crash		 0.01958192765327941
         K_oil spill		 0.01881924411505403
            K_attack		0.017707400221396566
          K_disaster		0.016918888623250566
        K_earthquake		0.016631669122568037
           K_rescuer		 0.01658511859227331
         K_terrorism		0.015376876025642711
          K_accident		 0.01537468581067278
          K_hellfire		0.013377229786418355
       K_mass murder		0.013340107581551984
   K_bridge collapse		0.011506037721373716
         K_coll

## 2. Text 

In [61]:
files_text = [
    'features_tagged.csv',
    'features_tagged_smooth.csv',
    'features_hashtags.csv',
    'features_hashtags_smooth.csv',
    'features_text_numerical.csv',
    'features_arrobas_numerical.csv',
    'features_hashtags_numerical.csv',
    'features_pronombres.csv',
    'feature_pos_tag.csv'
]

DT_text = decision_regressor_files(DIR_CSV + 'train/', files_text, m)

FEATURES_TAGGED.CSV

             feature           importance

             youtube		0.010582048365137323
        arianagrande		0.004724382605659408
             foxnews		0.004310331898460457
                  ap		0.003708145382859913
           djicemoon		0.003000759840098269
              usagov		0.0029690335847644086
       unsuckdcmetro		0.002965865770857226
       mikeparractor		0.0025693168744599926
       nasahurricane		0.002233303543016643
                 smh		0.0022315109247883626
             nbcnews		0.0022297204640301586
           sharethis		0.0022279321572820903
       mnpdnashville		0.0022261460010894415
             invalid		0.002132933655997382
            usatoday		0.002064692404274339
             falphil		0.0017234574851221873
      local_arsonist		0.001715755468856011
       raynbowaffair		0.001713913541353336
         lonewolffur		0.001712074578326068
                 gop		0.001710238573416528
       worldnetdaily		0.0017084055202827091
               potus		0.0

FEATURE_POS_TAG.CSV

             feature           importance

                 prp		  0.1164002435200459
                  in		 0.11550402984836848
                 nnp		 0.09894618005218606
                  nn		 0.09605179438120319
                  jj		 0.07046675543806599
                  dt		 0.05568629552238953
                 nns		 0.05094599362276444
                  cd		 0.04301825805338503
                  rb		 0.04108921210450305
                 vbd		 0.03982894923735241
                 vbg		 0.03368029916159961
                 vbz		 0.03060890347996394
                  cc		0.028612542633475593
                 vbp		0.028123562921427255
                  vb		 0.02569282686663004
                  md		0.018081167345665065
                 vbn		0.017989238632735965
                  to		0.017882569302045615
                 pos		  0.0109528984252249
                 wrb		0.010463277450379114
                  rp		 0.00887595007151934
                nnps		0.007905312

## 3. Locations

In [14]:
files_location = [
    #'features_location_bow.csv',
    #'features_location_tf_idf.csv',
    'features_location_numerical.csv',
    'features_location_in_text_numerical.csv',
    'features_location_in_text_numerical.csv'
]

DT_location = decision_regressor_files(DIR_CSV + 'train/' , files_location, 10)

FEATURES_LOCATION_NUMERICAL.CSV

             feature           importance

       has_continent		  0.4201219975712325
            has_city		 0.36387304531891634
          has_county		 0.09863419218255298
           has_state		 0.08259389943508827
         has_country		0.034776865492209985



FEATURES_LOCATION_IN_TEXT_NUMERICAL.CSV

             feature           importance

       has_continent		  0.9667359633952932
            has_city		 0.01987512462093934
           has_state		0.010282301066125493
         has_country		 0.00289749217735423
          has_county		0.00020911874028778352



FEATURES_LOCATION_IN_TEXT_NUMERICAL.CSV

             feature           importance

       has_continent		  0.9667359633952932
            has_city		 0.01987512462093934
           has_state		0.010282301066125493
         has_country		 0.00289749217735423
          has_county		0.00020911874028778352





In [15]:
files_location_places = ['features_city_encoded.csv',
    'features_city_in_text_encoded.csv',
    'features_country_encoded.csv',
    'features_country_in_text_encoded.csv',
    'features_county_encoded.csv',
    'features_county_in_text_encoded.csv',
    'features_state_encoded.csv',
    'features_state_in_text_encoded.csv',
    'features_continent_encoded.csv',
    'features_continent_in_text_encoded.csv']

DT_location_places = decision_regressor_files(DIR_CSV + 'train/' , files_location_places, m)

FEATURES_CITY_ENCODED.CSV

             feature           importance

         city_Mumbai		0.024130735826127757
     city_Washington		0.023943222328165075
           city_York		0.021547079685354395
     city_Sacramento		0.012510485874967316
        city_Karachi		0.009170525971629376
       city_Coventry		0.007721289501625104
         city_Oregon		0.007579538694094621
        city_Calgary		0.006908095728359463
        city_Jakarta		0.006899548099176785
        city_Concord		0.006897543565439925
         city_Dundee		0.006891929720974993
     city_Birmingham		0.006888352077922535
      city_Nashville		0.0064121196747666045
       city_Oklahoma		0.006396158832887941
        city_Orlando		0.005722531824631753
        city_Memphis		 0.00521137097713341
     city_Pittsburgh		 0.00513011209163457
      city_Edinburgh		0.005124510003628129
          city_Dubai		0.004864341833943509
       city_Evanston		0.004630410725532537
        city_Bandung		0.004627870382245118
        city_Jupiter		0.00

FEATURES_STATE_ENCODED.CSV

             feature           importance

    state_Washington		 0.13186079377294388
          state_Iowa		 0.05055157123335072
      state_Arkansas		 0.05047095743204493
            state_FL		 0.05039053630749122
        state_Hawaii		 0.05029788403243753
        state_Oregon		  0.0480517105770067
          state_Utah		 0.04634797109803821
  state_Pennsylvania		 0.03844243172829215
            state_VA		  0.0375732394514461
         state_Maine		 0.03387221747398132
            state_TN		0.033346715068333865
       state_Georgia		 0.02561296748722152
     state_Tennessee		0.022849681882840697
         state_Texas		 0.02004974959891469
       state_Indiana		0.019810946194803232
      state_Colorado		0.019761301872423646
            state_PA		 0.01866351485702398
  state_Rhode Island		0.018653429196478926
      state_Delaware		 0.01670270683381855
  state_North Dakota		0.016698186480154763
  state_South Dakota		0.016693667961277258
            state_NY		0.01

## 4. Links

In [16]:
files_links = ['features_links_numerical.csv', "features_domain_bow.csv", "features_domain_tf_idf.csv", "features_domain_smooth_tf_idf.csv"]
DT_links = decision_regressor_files(DIR_CSV + 'train/' , files_links, m)

FEATURES_LINKS_NUMERICAL.CSV

             feature           importance

          links_cant		  0.8897528194199258
   cant_failed_links		  0.1102471805800743
                id.1		                 0.0



FEATURES_DOMAIN_BOW.CSV

             feature           importance

         domain_ebay		  0.0512237262884467
          domain_bbc		0.028344309024258493
         domain_news		0.027502150165272007
      domain_youtube		0.018201937085652382
      domain_twitter		 0.01626195431068817
          domain_cue		 0.01576063041811373
      domain_latimes		0.015562245113309775
  domain_feedsportal		0.012090302911725921
      domain_abcnews		 0.01200395749289994
          domain_gov		0.011884700269472852
        domain_wired		 0.01096572017935843
    domain_billboard		0.009887723677175764
       domain_amazon		0.009867785771661816
    domain_instagram		0.009198231569809102
       domain_tumblr		0.009092688588407253
    domain_careerarc		0.008842008399927551
           domain_co		0.008395783052210

## 5. Tags

In [17]:
files_tags = ["features_tags_bow.csv", "features_tags_tf_idf.csv"]
DT_tags = decision_regressor_files(DIR_CSV + 'train/' , files_tags, m)

FEATURES_TAGS_BOW.CSV

             feature           importance

            tag_fire		 0.02531172565654727
       tag_hiroshima		0.022525718673642364
         tag_suicide		0.022401693126487286
              tag_mh		0.018473834147994972
           tag_train		0.018242015161438464
        tag_wildfire		 0.01778815090543414
          tag_killed		0.016416790855356127
         tag_typhoon		0.014789121951377274
           tag_storm		0.012337473646370202
         tag_migrant		0.012051594398332654
           tag_spill		0.011867982649404833
     tag_legionnaire		0.011732570408719679
          tag_police		0.010983060348817382
          tag_people		0.010277501517131154
        tag_accident		   0.009982311790658
           tag_japan		0.009449557657171533
      tag_earthquake		0.009228277484254147
         tag_israeli		0.008688675542163073
         tag_airport		 0.00802738652104165
        tag_collapse		0.007834305833797671
        tag_building		0.0073030023181045265
            tag_love		0.006669

# IV) Conclusion

In [62]:
list_of_files = [
    files_keywords,
    files_text,
    files_location,
    files_location_places,
    files_links,
    files_tags,
]

In [63]:
# Information of desition trees for m or less features

DT = [
    DT_keywords,
    DT_text,
    DT_location,
    DT_location_places,
    DT_links,
    DT_tags,
]

In [64]:
# Function that filters those features with lesser importance than min_importance

def filter_importance(DT, min_importance):
    feature_importances, m_index, m_feature_importances, m_feature_name = DT
    
    res = []
    for i in range(len(m_feature_importances)):
        if(m_feature_importances[i]>= min_importance):
            res.append(m_feature_name[i])
    return res

In [65]:
# Create a dataframe with all the m features for a csv filtered 

min_importance = 0.00001

df_total = []

for i in range(len(list_of_files)):
    
    for j in range(len(list_of_files[i])):
        
        df = pd.read_csv(DIR_CSV + 'train/'+ list_of_files[i][j] ,header=0)
        
        cols = list(filter_importance(DT[i][j], min_importance))
        
        cols.append("id")
                               
        df_total.append(df[cols])

In [66]:
for df in df_total:
    print(list(df.columns),'\n')

['K_fire', 'K_bomb', 'K_storm', 'K_typhoon', 'K_debris', 'K_derail', 'K_police', 'K_outbreak', 'K_flood', 'K_wreckage', 'K_crash', 'K_oil spill', 'K_attack', 'K_disaster', 'K_earthquake', 'K_rescuer', 'K_terrorism', 'K_accident', 'K_hellfire', 'K_mass murder', 'K_bridge collapse', 'K_collision', 'K_dead', 'K_evacuation', 'K_massacre', 'K_casualty', 'K_emergency', 'K_death', 'K_drought', 'K_sinkhole', 'K_terrorist', 'K_refugee', 'K_burn', 'K_rescue', 'K_heat wave', 'K_evacuate', 'K_collapse', 'K_hijack', 'K_hostage', 'K_flame', 'id'] 

['keyword_frequency', 'keywords_mean', 'text_contains_keyword', 'keywords_quantity', 'keyword_is_hashtag', 'id'] 

['youtube', 'arianagrande', 'foxnews', 'ap', 'djicemoon', 'usagov', 'unsuckdcmetro', 'mikeparractor', 'nasahurricane', 'smh', 'nbcnews', 'sharethis', 'mnpdnashville', 'invalid', 'usatoday', 'falphil', 'local_arsonist', 'raynbowaffair', 'lonewolffur', 'gop', 'worldnetdaily', 'potus', 'barackobama', 'abc', 'wocowae', 'fewmoretweets', 'wordpress

In [67]:
df_final = train[["id", "target"]]

In [68]:
# Mergeo todos los dataframe

for df in df_total:
    df_final = df_final.merge(df, on="id", how="left")

len(df_final.columns)

776

In [69]:
# Create a dataframe with all the m features for a csv filtered 

min_importance = 0.00001

df_total2 = []

for i in range(len(list_of_files)):
    
    for j in range(len(list_of_files[i])):
        
        df = pd.read_csv(DIR_CSV + 'test/'+ list_of_files[i][j] ,header=0)
        
        cols = list(filter_importance(DT[i][j], min_importance))
        
        cols.append("id")
                               
        df_total2.append(df[cols])

In [70]:
df_final2 = pd.read_csv('original_data/test.csv')
df_final2 = df_final2[['id']]
df_final2.head()

Unnamed: 0,id
0,0
1,2
2,3
3,9
4,11


In [71]:
# Mergeo todos los dataframe

for df in df_total2:
    df_final2 = df_final2.merge(df, on="id", how="left")

len(df_final2.columns)

775

In [72]:
df_final2.to_csv('feature_selection_on_test_DT')

In [73]:
X,y = get_clean_values(df_final)
DT_final = decision_regressor_feature_search(X,y,m)

In [74]:
print_feature_search(DT_final[-2],DT_final[-1],"{} features finales más importantes".format(m))

40 FEATURES FINALES MÁS IMPORTANTES

             feature           importance

 0.08525312056101526		       has_continent
 0.08268902132308738		                 prp
0.045927134712425405		                  in
 0.04349161331971302		   keyword_frequency
 0.03595844550849421		       keywords_mean
 0.03535971773965446		   promedio_len_word
0.033081047687592416		                  nn
0.027986952926019933		                 nnp
0.027075348637799774		                  jj
0.024147808195105376		                  dt
 0.02183770391077258		                  rb
0.021419134880032945		         #caracteres
0.021398759460920933		            #silabas
  0.0207260451745466		                 nns
0.016417931137588763		                  to
0.015242492606393845		                  cc
0.015057115411609458		         #capitalize
0.014698100511212883		                  cd
0.013107715148975869		                 vbd
0.010669070828095435		           #palabras
0.010663847753230392		   keywords_quantity
0.009635581358815

In [75]:
# m = 30 y min_importance = 0.00001
#df_final.to_csv('features/feature_selection/features_decition_tree1.csv', encoding='utf-8')

In [76]:
#m = 40 y min_importance = 0.00001
df_final.to_csv('features/feature_selection/features_decition_tree2.csv', encoding='utf-8')