# Disaster Tweets: Simple Linear Model

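Greetings! Thanks for checking out my code. This notebook strips URLs, HTML tags, emoji, and punctuation from the tweet text, turns the cleaned text into bag-of-words count vectors, and fits a ridge classifier to predict whether each tweet describes a real disaster.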

In [27]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords', quiet=True)
stopwords = stopwords.words('english')
import re
import string
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

# Seed handed to estimators as ``random_state`` so runs are repeatable.
SEED = 42

# Plot defaults: white seaborn style with enlarged fonts, 10x8 figures.
sns.set(font_scale=1.2, style="white")
plt.rcParams.update({"figure.figsize": [10, 8]})
# Display options have to go through the ``pd.set_option`` *call*; assigning
# attributes onto the function object (the previous form) never touched the
# real options. ``None`` means "no limit", so keep using ``.head()`` when
# displaying large frames.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [3]:
# Read the training and test CSVs from the local ``assets`` directory.
train, test = map(pd.read_csv, ("assets/train.csv", "assets/test.csv"))

**Data Cleaning Functions**

Many thanks to https://www.kaggle.com/shahules/basic-eda-cleaning-and-glove#Data-Cleaning.

In [4]:
def remove_url(text):
    """Return ``text`` with every ``http(s)://...`` or ``www....`` link deleted."""
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

def remove_html_tags(text):
    """Return ``text`` without ``<...>`` spans (shortest match, single line)."""
    return re.sub(r'<.*?>', r'', text)

# Adapted from https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    The character class previously ended with one range running from U+24C2
    to U+1F251.  That span covers most of the Basic Multilingual Plane, so it
    also deleted ordinary text such as Chinese, Japanese and Korean
    characters.  It is replaced here by the symbol blocks inside that span
    that actually carry emoji; every character removed now was also removed
    before, but letters from other scripts are left intact.
    """
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"  # dingbats (kept from the original list)
        "\U000024C2"             # circled M
        "\U000025A0-\U000027BF"  # geometric shapes, misc symbols, dingbats
        "\U00002934-\U00002935"  # curved arrows
        "\U00002B00-\U00002BFF"  # misc symbols and arrows (stars, circles)
        "\U00003030\U0000303D\U00003297\U00003299"  # CJK symbols used as emoji
        "\U0000FE0F"             # variation selector-16 (emoji presentation)
        "\U0001F000-\U0001F251"  # mahjong/cards, enclosed alphanumerics & ideographs
        "]+"
    )
    return emoji_pattern.sub(r'', text)

def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # Mapping a code point to None tells ``str.translate`` to drop it.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_table)

In [5]:
# Run the four cleaners, in this order, over the ``text`` column of both frames
# and write the result back into the same column.
cleaning_steps = (remove_url, remove_html_tags, remove_emoji, remove_punctuation)
for dataset in (train, test):
    cleaned_text = dataset["text"]
    for step in cleaning_steps:
        cleaned_text = cleaned_text.apply(step)
    dataset["text"] = cleaned_text

**Building the Vectors**

In [6]:
# Small demonstration: learn a vocabulary from the first five training tweets
# only and encode those five tweets as token-count vectors.
count_vectorizer = CountVectorizer()

ex_tweets = count_vectorizer.fit_transform(train["text"].head(5))

In [10]:
# Dense view of the first example tweet: print its shape, then the counts.
first_tweet_counts = ex_tweets[0].todense()
print(first_tweet_counts.shape)
print(first_tweet_counts)

(1, 53)
[[0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0
  0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]


In [12]:
# Refit the vectorizer on the full training text (this replaces the vocabulary
# learned from the five-tweet example), then encode the test tweets with that
# same vocabulary. The tuple is evaluated left to right, so the fit runs first.
train_vectors, test_vectors = (
    count_vectorizer.fit_transform(train["text"]),
    count_vectorizer.transform(test["text"]),
)

In [32]:
# Ridge classifier; SEED is passed as ``random_state``.
clf = linear_model.RidgeClassifier(random_state=SEED)

In [35]:
# 3-fold cross-validated F1 of the (still unfitted) classifier on the training set.
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vectors,
    y=train["target"],
    scoring="f1",
    cv=3,
)
scores

array([0.62001037, 0.55512195, 0.61985472])

In [34]:
# Fit the classifier on the full training matrix; it is used below to predict
# on the test vectors.
clf.fit(train_vectors, train["target"])

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, random_state=42, solver='auto',
        tol=0.001)

**Generate a Submission**

In [20]:
# Preview the first rows of the test frame (its ``text`` column was cleaned above).
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,Heard about earthquake is different cities sta...
2,3,,,there is a forest fire at spot pond geese are ...
3,9,,,Apocalypse lighting Spokane wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [36]:
def generate_submission(model, test_vectors, save=False, ids=None,
                        path="submissions/baseline-linear-model.csv"):
    """Predict on ``test_vectors`` and return an ``id``/``target`` submission frame.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    test_vectors : array-like or sparse matrix
        Features handed to ``model.predict``.
    save : bool, default False
        When true, also write the frame to ``path`` as CSV without the index.
    ids : sequence or pandas.Series, optional
        Identifiers, one per prediction.  When omitted, the notebook-level
        ``test["id"]`` column is used, as before.
    path : str or path-like, default "submissions/baseline-linear-model.csv"
        Destination used when ``save`` is true.  Missing parent directories
        are created so the write does not fail on a fresh checkout.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``id`` and ``target``.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    if ids is None:
        ids = test["id"]  # fall back to the notebook-level test frame

    preds = model.predict(test_vectors)
    output = pd.DataFrame({"id": ids, "target": preds})

    if save:
        destination = Path(path)
        destination.parent.mkdir(parents=True, exist_ok=True)
        output.to_csv(destination, index=False)
    return output

In [26]:
# Predict on the test vectors, write the submission CSV (save=True), and
# preview the first rows of the result.
output = generate_submission(model=clf, test_vectors=test_vectors, save=True)
output.head()

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,1
4,11,1


### Conclusion

Thanks very much for reading; I hope you enjoyed learning how to build a linear model using vectorized NLP data. If you did, be sure to upvote so you can find this notebook again easily in your Favorites tab.

Until next time, happy coding :)