In [35]:
import numpy as np
import pandas as pd

df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
print(f"Rows in train.csv = {len(df_train)}")
print(f"Rows in test.csv = {len(df_test)}")
pd.set_option('display.max_colwidth', None)
df_train.head()

Rows in train.csv = 7613
Rows in test.csv = 3263


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


In [36]:
df_train_pos = df_train[df_train.target == 1]
df_train_neg = df_train[df_train.target == 0]
print(f"No. of positive training examples = {len(df_train_pos)}")
print(f"No. of negative training examples = {len(df_train_neg)}")
train_keywords_unique = df_train.keyword.unique()
print(f"No. of unique keywords = {len(train_keywords_unique)}")
df_train_notnull_keywords = df_train[~df_train.keyword.isnull()]
print(f"No of train examples with keyword not null = {len(df_train_notnull_keywords)}")
df_train_notnull_keywords.head()

No. of positive training examples = 3271
No. of negative training examples = 4342
No. of unique keywords = 222
No of train examples with keyword not null = 7552


Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/lHYXEOHY6C,1
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT http://t.co/YAo1e0xngw,0
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set ablaze in Aba. http://t.co/2nndBGwyEi,1
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS ABLAZE http://t.co/qqsmshaJ3N,0


In [37]:
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, stay safe everyone."
2,3,,,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all"
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


Preprocess the tweets 

In [38]:
from common_utils import process_tweet

df_train['text_processed'] = df_train['text'].apply(lambda text: " ".join(process_tweet(text)))
df_train.head()

Unnamed: 0,id,keyword,location,text,target,text_processed
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,deed reason earthquake may allah forgive u
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1,resident asked shelter place notified officer evacuation shelter place order expected
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1,"13,000 people receive wildfire evacuation order california"
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1,got sent photo ruby alaska smoke wildfire pours school


In [39]:
df_test['text_processed'] = df_test['text'].apply(lambda text: " ".join(process_tweet(text)))
df_test.head()

Unnamed: 0,id,keyword,location,text,text_processed
0,0,,,Just happened a terrible car crash,happened terrible car crash
1,2,,,"Heard about #earthquake is different cities, stay safe everyone.",heard earthquake different city stay safe everyone
2,3,,,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all",forest fire spot pond goose fleeing across street cannot save
3,9,,,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfire
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kill 28 china taiwan


Build the frequency dictionary for positive and negative words in tweets ( positive means target value is 1 and negative means 0 )

In [40]:
# from common_utils import build_freqs
import numpy as np

def add_or_increment(key, dict):
    if key in dict:
        dict[key] += 1
    else:
        dict[key] = 1       

def build_freqs(processed_tweets, ys):
    """Build frequencies.
    Input:
        tweets: a list of tweets
        ys: an m x 1 array with the sentiment label of each tweet (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its frequency
    """
    # Convert np array to list since zip needs an iterable.
    # The squeeze is necessary or the list ends up with one element.
    # Also note that this is just a NOP if ys is already a list.
    yslist = np.squeeze(ys).tolist()

    # Start with an empty dictionary and populate it by looping over all tweets
    # and over all processed words in each tweet.
    freqs = {}
    pos_freqs = {}
    neg_freqs = {}
    for y, tweet in zip(yslist, processed_tweets):        
        for word in tweet.split():
            pair = (word, y)
            add_or_increment(pair, freqs)
            if y == 1:
                add_or_increment(word, pos_freqs)
            else:
                add_or_increment(word, neg_freqs)
    return freqs, pos_freqs, neg_freqs

X_input_tweets = df_train['text_processed']
X_test_tweets = df_test['text_processed']
y_input = df_train['target']
freqs, pos_word_freqs, neg_word_freqs = build_freqs(X_input_tweets, y_input)


In [41]:
from common_utils import get_topn_dictitems_byvalue

print('10 most popular words in disaster tweets are:')
for item in get_topn_dictitems_byvalue(pos_word_freqs, 10):
    print(f"'{item[1]}' with a count of {item[0]}")

print('10 most popular words in non disaster tweets are:')
for item in get_topn_dictitems_byvalue(neg_word_freqs, 10):
    print(f"'{item[1]}' with a count of {item[0]}")


10 most popular words in disaster tweets are:
'' with a count of 345
'fire' with a count of 261
'û_' with a count of 147
'disaster' with a count of 119
'news' with a count of 118
'california' with a count of 110
'suicide' with a count of 109
'police' with a count of 107
'people' with a count of 106
'family' with a count of 102
10 most popular words in non disaster tweets are:
'' with a count of 374
'like' with a count of 255
'i'm' with a count of 202
'get' with a count of 183
'new' with a count of 161
'u' with a count of 145
'û_' with a count of 139
'one' with a count of 133
'body' with a count of 113
'time' with a count of 103


In [42]:
def build_keyword_freq(keywords, ys):
    pos_keyword_freqs = {}
    neg_keyword_freqs = {}
    for y, keyword in zip(ys, keywords):
        if y == 1:
            add_or_increment(keyword, pos_keyword_freqs)
        else:
            add_or_increment(keyword, neg_keyword_freqs)
    return pos_keyword_freqs, neg_keyword_freqs

df_train_notnull_location = df_train[~df_train['location'].isnull()]
pos_keyword_freqs, neg_keyword_freqs = build_keyword_freq(df_train_notnull_keywords['keyword'], df_train_notnull_keywords['target'])
pos_location_freqs, neg_location_freqs = build_keyword_freq(df_train_notnull_location['location'], df_train_notnull_location['target'])

In [43]:
print('10 most popular location in disaster tweets are:')
for item in get_topn_dictitems_byvalue(pos_keyword_freqs, 10):
    print(f"'{item[1]}' with a count of {item[0]}")

print('10 most popular location in non disaster tweets are:')
for item in get_topn_dictitems_byvalue(neg_keyword_freqs, 10):
    print(f"'{item[1]}' with a count of {item[0]}")


10 most popular location in disaster tweets are:
'derailment' with a count of 39
'outbreak' with a count of 39
'wreckage' with a count of 39
'debris' with a count of 37
'oil%20spill' with a count of 37
'typhoon' with a count of 37
'evacuated' with a count of 32
'rescuers' with a count of 32
'suicide%20bomb' with a count of 32
'suicide%20bombing' with a count of 32
10 most popular location in non disaster tweets are:
'body%20bags' with a count of 40
'armageddon' with a count of 37
'harm' with a count of 37
'deluge' with a count of 36
'ruin' with a count of 36
'wrecked' with a count of 36
'explode' with a count of 35
'fear' with a count of 35
'siren' with a count of 35
'twister' with a count of 35


In [44]:
print('10 most popular keywords in disaster tweets are:')
for item in get_topn_dictitems_byvalue(pos_location_freqs, 10):
    print(f"'{item[1]}' with a count of {item[0]}")

print('10 most popular keywords in non disaster tweets are:')
for item in get_topn_dictitems_byvalue(neg_location_freqs, 10):
    print(f"'{item[1]}' with a count of {item[0]}")

10 most popular keywords in disaster tweets are:
'USA' with a count of 67
'United States' with a count of 27
'Nigeria' with a count of 22
'India' with a count of 20
'Mumbai' with a count of 19
'UK' with a count of 16
'London' with a count of 16
'New York' with a count of 16
'Washington, DC' with a count of 15
'Canada' with a count of 13
10 most popular keywords in non disaster tweets are:
'New York' with a count of 55
'USA' with a count of 37
'London' with a count of 29
'United States' with a count of 23
'Los Angeles, CA' with a count of 18
'Canada' with a count of 16
'Kenya' with a count of 15
'Everywhere' with a count of 12
'UK' with a count of 11
'Florida' with a count of 11


In [45]:
def extract_features(tweet, keyword, loc, freqs, pos_keyword_freqs, neg_keyword_freqs, 
                    pos_loc_freqs, neg_loc_freqs):
    tweet_words = process_tweet(tweet)
    # first column is bias, second positive word count, third negative word count
    feature_vector = np.zeros((1, 7))
    #feature_vector = np.zeros((1, 3))
    feature_vector[0, 0] = 1
    #print(f"keyword: {keyword}")
    for word in tweet_words:
        feature_vector[0, 1] += freqs.get((word, 1.0), 0)        
        feature_vector[0, 2] += freqs.get((word, 0.0), 0)
    if not pd.isnull(keyword):   
        #print(keyword)       
        if keyword in pos_keyword_freqs.keys():  
            feature_vector[0, 3] += pos_keyword_freqs[keyword]
        if keyword in neg_keyword_freqs.keys():            
            feature_vector[0, 4] += neg_keyword_freqs[keyword]
    if not pd.isnull(loc):   
        #print(keyword)       
        if loc in pos_loc_freqs.keys():  
            feature_vector[0, 5] += pos_loc_freqs[loc]
        if loc in neg_loc_freqs.keys():            
            feature_vector[0, 6] += neg_loc_freqs[loc]            
    return feature_vector

In [46]:
X_input_keywords = df_train['keyword']

In [49]:
test_features = extract_features(X_input_tweets[0], 'aftershock', 'usa', freqs, 
                pos_keyword_freqs, neg_keyword_freqs, pos_location_freqs, neg_location_freqs)
test_features

array([[  1., 205., 215.,   0.,  34.,   0.,   0.]])

In [54]:
#X_input = np.zeros((len(X_input_tweets), 5))
X_input = np.zeros((len(X_input_tweets), 7))
for i, row in df_train.iterrows():
    X_input[i, :] = extract_features(row['text_processed'], row['keyword'], row['location'], 
                            freqs, pos_keyword_freqs, neg_keyword_freqs, pos_location_freqs, neg_location_freqs)

print(X_input.shape)

#X_test = np.zeros((len(X_test_tweets), 5))
X_test = np.zeros((len(X_test_tweets), 7))
for i, row in df_test.iterrows():
    X_test[i, :] = extract_features(row['text_processed'], row['keyword'], row['location'], 
                            freqs, pos_keyword_freqs, neg_keyword_freqs, pos_location_freqs, neg_location_freqs)
    
print(X_test.shape)    

(7613, 7)
(3263, 7)


In [55]:
input_df = pd.DataFrame(X_input)
input_df.to_csv('input.csv')

Split training data into 80% training set and 20% validation set

In [None]:
#from sklearn.model_selection import train_test_split
#X_train, X_val, y_train, y_val = train_test_split(X_input, y_input, test_size=0.2, random_state=42)
#X_train.shape, X_val.shape, y_train.shape, y_val.shape

In [56]:
from sklearn.naive_bayes import ComplementNB
mnb = ComplementNB(alpha=1)
mnb.fit(X_input, y_input.ravel())

ComplementNB(alpha=1)

In [57]:
from sklearn.model_selection import cross_validate
cv_results = cross_validate(mnb, X_input, y_input, cv=5, scoring="f1")
cv_results['test_score'].mean()

0.7765263607720169

In [None]:
#mnb.score(X_val, y_val)

In [None]:
#y_val_pred = mnb.predict(X_val)

In [None]:
#from sklearn.metrics import classification_report
#print(classification_report(y_val, y_val_pred))

In [58]:
y_test_pred = mnb.predict(X_test)
df_test['target'] = y_test_pred
df_test.head(20)

Unnamed: 0,id,keyword,location,text,text_processed,target
0,0,,,Just happened a terrible car crash,happened terrible car crash,1
1,2,,,"Heard about #earthquake is different cities, stay safe everyone.",heard earthquake different city stay safe everyone,1
2,3,,,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all",forest fire spot pond goose fleeing across street cannot save,1
3,9,,,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfire,1
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kill 28 china taiwan,1
5,12,,,We're shaking...It's an earthquake,we're shakingit's earthquake,1
6,21,,,"They'd probably still show more life than Arsenal did yesterday, eh? EH?",they'd probably still show life arsenal yesterday eh eh,0
7,22,,,Hey! How are you?,hey,0
8,27,,,What a nice hat?,nice hat,0
9,29,,,Fuck off!,fuck,0


In [59]:
df_submit = df_test[['id', 'target']]
df_submit.to_csv('results.csv', index=False)

In [60]:
df_train.to_csv('train_processed1.csv')