In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score



In [2]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

## Define keyword feature generator method

In [163]:
def keyword_feature_generator(keyword, feature_name, data):
    data[feature_name] = 0
    idx = data[data.text.str.contains(keyword, case=False)].index
    data.loc[idx, feature_name] = 1
    return data

## Take the most common 1000 keywords as feature

In [202]:
def featurize_most_common_significant_keywords(train_data, test_data, threshold, number_of_features = 1000):
    train_data = train_data.iloc[:,:5]
    test_data = test_data.iloc[:,:5]

    for element in Counter(" ".join(train_data["text"]).split()).most_common(number_of_features):
        value_counts = train_data[train_data.text.str.contains(element[0], case=False, regex=False)].target.value_counts()
        try:
            if (value_counts[0]/value_counts[1] > (1 + threshold)) or (value_counts[1]/value_counts[0] > (1 + threshold)):
                try:
                    keyword_feature_generator(element[0], element[0],data = train_data)
                    keyword_feature_generator(element[0], element[0], data = test_data)
                except:
                    pass
        except:
            pass

    train_data.fillna(0, inplace = True)
    test_data.fillna(0, inplace = True)
    return train_data, test_data, threshold


## Random Forest Classifier

In [203]:
def generate_train_test(data, test_size = 0.2):
    y = train_data.target.values
    X = train_data.iloc[:,5:].values
    X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size = test_size)
    return X_train, X_validation, y_train, y_validation

In [199]:
def random_forest_classification(X_train, y_train, X_test):
    rfc = RandomForestClassifier()
    rfc.fit(X_train,y_train)
    # predictions
    rfc_predict = rfc.predict(X_test)
    return rfc_predict, rfc

In [200]:
def evaluate(threshold, X, predict, y):
    print('threshold : ', threshold)
    print('features : ', len(X[0]))
    print('Precision : ', precision_score(y, predict))
    print('Recall : ', recall_score(y, predict))
    print('Accuracy : ', accuracy_score(y, predict))
    print('F1_score : ', f1_score(y, predict))

In [201]:
for threshold in [0.4]:
    print('----------------------------------------------')
    train_data, test_data, threshold = featurize_most_common_significant_keywords(train_data, test_data, threshold = threshold, number_of_features = 1000)
    X_train, X_validation, y_train, y_validation = generate_train_test(train_data)
    rfc_predict, rfc = random_forest_classification(X_train, y_train, X_validation)
    evaluate(threshold, X_validation, y_validation, rfc_predict)


----------------------------------------------


AttributeError: 'DataFrame' object has no attribute 'target'

In [185]:
test_data.iloc[:,4:].values

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [191]:
test_data = test_data.iloc[:,:4]

In [195]:
train_data

Unnamed: 0,id,keyword,location,text,target,and,you,my,with,that,...,Can't,Muslims,SO,public,book,dont,reason,ain't,following,Tote
0,1,0,0,Our Deeds are the Reason of this #earthquake M...,1,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
1,4,0,0,Forest fire near La Ronge Sask. Canada,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5,0,0,All residents asked to 'shelter in place' are ...,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6,0,0,"13,000 people receive #wildfires evacuation or...",1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,7,0,0,Just got sent this photo from Ruby #Alaska as ...,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,10869,0,0,Two giant cranes holding a bridge collapse int...,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7609,10870,0,0,@aria_ahrary @TheTawniest The out of control w...,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7610,10871,0,0,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7611,10872,0,0,Police investigating after an e-bike collided ...,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [207]:
for feature in train_data.iloc[:,5:].columns:
    test_data[feature] = 0
    idx = test_data[test_data.text.str.contains(feature, case = False, regex = False)].index
    test_data.loc[idx, feature] = 1


In [225]:
y_predict = rfc.predict(test_data.iloc[:,4:].values)
test_data['target'] = y_predict

ValueError: Number of features of the model must match the input. Model n_features is 728 and input n_features is 729 

In [228]:
test_data['target'] =y_predict

In [229]:
test_data[['id','target']].set_index('id').to_csv('predict.csv')

In [116]:
y_train

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

In [125]:
'''train_data = train_data.iloc[:,:5]
test_data = test_data.iloc[:,:5]

from collections import Counter
number_of_features = 100
for element in Counter(" ".join(train_data["text"]).split()).most_common(number_of_features):
    value_counts = train_data[train_data.text.str.contains(element[0], case=False, regex=False)].target.value_counts()
    threshold = 1
    try:
        if (value_counts[0]/value_counts[1] > (1 + threshold)) or (value_counts[1]/value_counts[0] > (1 + threshold)):
            try:
                keyword_feature_generator(element[0], element[0])
                keyword_feature_generator(element[0], element[0], test_data)
            except:
                pass
    except:
        pass

train_data.fillna(0, inplace = True)
test_data.fillna(0, inplace = True)'''

from sklearn.ensemble import RandomForestClassifier
# random forest model creation
rfc = RandomForestClassifier()

y = train_data.target.values
X = train_data.iloc[:,5:].values

from sklearn.model_selection import train_test_split
# implementing train-test-split
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.33, random_state=66)

rfc.fit(X_train,y_train)

rfc_predict = rfc.predict(X_validation)


from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
print('threshold : ', threshold)
print('Precision : ', precision_score(y_validation, rfc_predict))
print('Recall : ', recall_score(y_validation, rfc_predict))
print('Accuracy : ', accuracy_score(y_validation, rfc_predict))
print('F1_score : ', f1_score(y_validation, rfc_predict))

threshold :  0.5
Precision :  0.0
Recall :  0.0
Accuracy :  0.5670513330680461
F1_score :  0.0


In [81]:
X_train, X_validation, y_train, y_validation = generate_train_test(train_data)
rfc_predict = random_forest_classification(X_train, y_train, X_validation)

evaluate(threshold, X_validation, y_validation, rfc_predict)


threshold :  1
features :  28
Precision :  0.5930047694753577
Recall :  0.5166204986149584
Accuracy :  0.602757715036113
F1_score :  0.5521835677276092


array([0, 0, 0, ..., 1, 1, 0], dtype=int64)