## Load Data

In [10]:
import numpy as np
import pandas as pd
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

In [11]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

print(df_train.describe())
print(df_train.head())

                 id      target
count   7613.000000  7613.00000
mean    5441.934848     0.42966
std     3137.116090     0.49506
min        1.000000     0.00000
25%     2734.000000     0.00000
50%     5408.000000     0.00000
75%     8146.000000     1.00000
max    10873.000000     1.00000
   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  


In [12]:
df_train[df_train['target'] == 0]['text'].values[0]

"What's up man?"

In [13]:
df_train[df_train['target'] == 1]['text'].values[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [14]:
df_train['text'].values[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

## Create Features

In [15]:
# let's get counts for the first 5 tweets in the data
count_vectorizer = feature_extraction.text.CountVectorizer()
example_train_vectors = count_vectorizer.fit_transform(df_train["text"][0:5])

## we use .todense() here because these vectors are "sparse" (only non-zero elements are kept to save space)
print(example_train_vectors[0].todense().shape)
print(example_train_vectors[0].todense())

(1, 54)
[[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]


In [16]:
# let's get counts for all tweets in the data
count_vectorizer = feature_extraction.text.CountVectorizer()
train_vectors = count_vectorizer.fit_transform(df_train['text'])
test_vectors = count_vectorizer.transform(df_test['text'])

print(train_vectors[0].todense().shape)
print(test_vectors[0].todense().shape)

(1, 21637)
(1, 21637)


## Train Models

In [28]:
import xgboost
print(xgboost.__version__)
from xgboost import XGBClassifier

1.3.3


In [35]:
## Our vectors are really big, so we want to push our model's weights
## toward 0 without completely discounting different words - ridge regression 
## is a good way to do this.
#clf = linear_model.RidgeClassifier()
clf = XGBClassifier(objective="reg:squarederror", random_state=42)

In [36]:
scores = model_selection.cross_val_score(XGBClassifier(objective='reg:squarederror'), train_vectors, df_train['target'], cv=3, scoring='f1')
print(scores)



[0.59610028 0.54219949 0.63914522]


In [38]:
clf.fit(train_vectors, df_train["target"])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.300000012, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=100, n_jobs=8, num_parallel_tree=1,
       objective='reg:squarederror', random_state=42, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
       use_label_encoder=True, validate_parameters=1, verbosity=None)

In [39]:
sample_submission = pd.read_csv('sample_submission.csv')

In [40]:
sample_submission['target'] = clf.predict(test_vectors)
sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1


In [41]:
sample_submission.to_csv('submission.csv', index=False)