NLP tutorial

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

In [3]:
# Load the training and test sets from CSV files in the current working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

A quick look at the data

In [4]:
# a non-disaster example: the text of the second row (position 1) whose target is 0
train_df.loc[train_df["target"] == 0, "text"].iloc[1]

'I love fruits'

In [5]:
# a disaster example: the text of the second row (position 1) whose target is 1
train_df.loc[train_df["target"] == 1, "text"].iloc[1]

'Forest fire near La Ronge Sask. Canada'

Building vectors

In [6]:
# Vectorizer that turns each text into a vector of token counts.
count_vectorizer = feature_extraction.text.CountVectorizer()

## let's get counts for the first 5 tweets in the data
## (.iloc[:5] selects them explicitly by position)
example_train_vectors = count_vectorizer.fit_transform(train_df["text"].iloc[:5])

In [7]:
## .toarray() expands the "sparse" vector (only non-zero elements are kept to save space)
## into a plain NumPy array; it is used instead of .todense(), which returns the
## discouraged np.matrix type. The printed shape and values are the same.
first_tweet_counts = example_train_vectors[0].toarray()
print(first_tweet_counts.shape)
print(first_tweet_counts)

(1, 54)
[[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]


In [8]:
# without densifying, the sparse vector prints only its non-zero entries,
# one "(row, column) count" line per entry
print(example_train_vectors[0].shape, example_train_vectors[0], sep="\n")

(1, 54)
  (0, 34)	1
  (0, 12)	1
  (0, 5)	1
  (0, 49)	1
  (0, 39)	1
  (0, 29)	1
  (0, 50)	1
  (0, 13)	1
  (0, 25)	1
  (0, 4)	1
  (0, 18)	1
  (0, 52)	1
  (0, 3)	1


In [9]:
# create vectors for all of our tweets
# (fit_transform re-fits the vectorizer on the full training text, so the vocabulary
# learned from the 5-tweet example is replaced)
train_vectors = count_vectorizer.fit_transform(train_df["text"])

## note that we're NOT using .fit_transform() here. Using just .transform() makes sure
## that the tokens in the train vectors are the only ones mapped to the test vectors -
## i.e. that the train and test vectors use the same set of tokens.
test_vectors = count_vectorizer.transform(test_df["text"])

Our model - a linear classifier

In [10]:
## Our vectors are really big, so we want to push our model's weights
## toward 0 without completely discounting different words - ridge regression
## is a good way to do this.
## The classifier is created with its default settings here.
clf = linear_model.RidgeClassifier()

In [11]:
# Cross-validation: 3 folds over the training vectors.
# The metric for this competition is F1, so that is what we score with.
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vectors,
    y=train_df["target"],
    scoring="f1",
    cv=3,
)
scores

array([0.59453669, 0.56498283, 0.64082434])

This model scores roughly 0.65 on the leaderboard.

This could be improved further with TF-IDF, LSA, or LSTMs / RNNs.

In [13]:
# In the meantime, let's fit the classifier on the full training set so it can be
# used to build a submission for the competition.
clf.fit(X=train_vectors, y=train_df["target"])

RidgeClassifier()

Prediction

In [15]:
# load the sample submission file; it is used below as the template for our predictions
sample_submission = pd.read_csv("sample_submission.csv")

In [16]:
# fill the template's target column with the model's predictions for the test vectors
# NOTE(review): assumes the rows of sample_submission.csv are in the same order as test.csv - confirm
sample_submission["target"] = clf.predict(test_vectors)

In [17]:
# preview the first rows of the submission (id and predicted target)
sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1


In [18]:
# write the submission to submission.csv in the current working directory;
# index=False leaves the DataFrame's row index out of the file
sample_submission.to_csv("submission.csv", index=False)