In [1]:
import numpy as np
import pandas as pd

In [2]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [3]:
train_data[train_data['target'] == 1]['text'].sample().values

array(['Me pulling over and fighting the hoes that called Zayn a terrorist  http://t.co/FY30fV0Qbx'],
      dtype=object)

In [4]:
train_data[train_data.text.str.contains('', case=False)].target.value_counts()


0    4342
1    3271
Name: target, dtype: int64

##### True Positive tweets tend to result in multiple tweets when searched in twitter

## Define keyword feature generator method

In [3]:
def keyword_feature_generator(keyword, feature_name, data = train_data):
    data[feature_name] = 0
    idx = data[data.text.str.contains(keyword, case=False)].index
    train_data.loc[idx, feature_name] = 1

## Add I sentence feature

In [7]:
keyword = 'i '
keyword_feature_generator(keyword, 'i_sentence')
keyword_feature_generator(keyword, 'i_sentence', test_data)

## Add http feature

In [8]:
keyword = 'http'
keyword_feature_generator(keyword, 'link_attached')
keyword_feature_generator(keyword, 'link_attached', test_data)

## Add @ feature

In [9]:
train_data[train_data.text.str.contains('@')].target.value_counts() 

0    1363
1     676
Name: target, dtype: int64

In [10]:
keyword = '@'
keyword_feature_generator(keyword, 'at_in_text')
keyword_feature_generator(keyword, 'at_in_text', test_data)

## Add news feature

In [11]:
keyword = "news"
keyword_feature_generator(keyword, 'news_in_text')
keyword_feature_generator(keyword, 'news_in_text', test_data)

## Take the most common 1000 keywords as feature

In [5]:
from collections import Counter
for element in Counter(" ".join(train_data["text"]).split()).most_common(100):
    try:
        keyword_feature_generator(element[0], element[0])
        keyword_feature_generator(element[0], element[0], test_data)

    except:
        pass

In [47]:
train_data = train_data.iloc[:,:5]
test_data = test_data.iloc[:,:5]

In [36]:
from collections import Counter
number_of_features = 1000
for element in Counter(" ".join(train_data["text"]).split()).most_common(number_of_features):
    value_counts = train_data[train_data.text.str.contains(element[0], case=False, regex=False)].target.value_counts()
    threshold = 0.5
    try:
        if (value_counts[1]/value_counts[0] > (1 + threshold)) or (value_counts[1]/value_counts[0] < (1 - threshold)):
            try:
                keyword_feature_generator(element[0], element[0])
                keyword_feature_generator(element[0], element[0], test_data)
            except:
                pass
    except:
        pass


In [46]:
value_counts[0]/value_counts[1]

8.0

In [37]:
train_data.fillna(0, inplace = True)
test_data.fillna(0, inplace = True)

## Random Forest Classifier

In [38]:
from sklearn.ensemble import RandomForestClassifier
# random forest model creation
rfc = RandomForestClassifier()


In [39]:
y = train_data.target.values
X = train_data.iloc[:,5:].values

from sklearn.model_selection import train_test_split
# implementing train-test-split
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.33, random_state=66)

In [40]:
rfc.fit(X_train,y_train)
# predictions
rfc_predict = rfc.predict(X_validation)

In [41]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

In [42]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
print('threshold : ', threshold)
print('Precision : ', precision_score(y_validation, rfc_predict))
print('Recall : ', recall_score(y_validation, rfc_predict))
print('Accuracy : ', accuracy_score(y_validation, rfc_predict))
print('F1_score : ', f1_score(y_validation, rfc_predict))

threshold :  0.2
Precision :  0.7554479418886199
Recall :  0.5735294117647058
Accuracy :  0.7349781138081973
F1_score :  0.6520376175548588


In [44]:
train_data = train_data.iloc[:,:5]
test_data = test_data.iloc[:,:5]

from collections import Counter
number_of_features = 1000
for element in Counter(" ".join(train_data["text"]).split()).most_common(number_of_features):
    value_counts = train_data[train_data.text.str.contains(element[0], case=False, regex=False)].target.value_counts()
    threshold = 0.1
    try:
        if (value_counts[1]/value_counts[0] > (1 + threshold)) or (value_counts[1]/value_counts[0] < (1 - threshold)):
            try:
                keyword_feature_generator(element[0], element[0])
                keyword_feature_generator(element[0], element[0], test_data)
            except:
                pass
    except:
        pass

train_data.fillna(0, inplace = True)
test_data.fillna(0, inplace = True)

from sklearn.ensemble import RandomForestClassifier
# random forest model creation
rfc = RandomForestClassifier()

y = train_data.target.values
X = train_data.iloc[:,5:].values

from sklearn.model_selection import train_test_split
# implementing train-test-split
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.33, random_state=66)

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
print('threshold : ', threshold)
print('Precision : ', precision_score(y_validation, rfc_predict))
print('Recall : ', recall_score(y_validation, rfc_predict))
print('Accuracy : ', accuracy_score(y_validation, rfc_predict))
print('F1_score : ', f1_score(y_validation, rfc_predict))

threshold :  0.1
Precision :  0.7554479418886199
Recall :  0.5735294117647058
Accuracy :  0.7349781138081973
F1_score :  0.6520376175548588
